From 2c7f8581256c7fd98b29558195ab2594c009a91d Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Sat, 10 Sep 2011 11:36:38 +0000 Subject: [PATCH] handle duplicates using openlibrary ids for edition, work and author --- core/{books.py => bookloader.py} | 59 +++--- core/management/commands/load_books.py | 4 +- ...l_workidentifier__add_field_work_openli.py | 170 ++++++++++++++++++ core/models.py | 46 +++-- core/tests.py | 23 ++- settings/common.py | 19 +- 6 files changed, 259 insertions(+), 62 deletions(-) rename core/{books.py => bookloader.py} (55%) create mode 100644 core/migrations/0010_auto__del_editionidentifier__del_workidentifier__add_field_work_openli.py diff --git a/core/books.py b/core/bookloader.py similarity index 55% rename from core/books.py rename to core/bookloader.py index e1055af9..b699cd2a 100755 --- a/core/books.py +++ b/core/bookloader.py @@ -12,36 +12,44 @@ from django.conf import settings from regluit.core import models from regluit.core.isbn import convert_10_to_13 +logger = logging.getLogger(__name__) def add_book(isbn): url = "http://openlibrary.org/api/books" bibkeys = "ISBN:%s" % isbn params = {"bibkeys": bibkeys, "jscmd": "details", "format": "json"} - results = get_json(url, params) + results = _get_json(url, params) + + edition = None if results.has_key(bibkeys): - return save_edition(results[bibkeys]['details']) + logger.info("saving book info for %s", isbn) + edition = _save_edition(results[bibkeys]['details']) elif len(isbn) == 10: - return add_book(convert_10_to_13(isbn)) + isbn_13 = convert_10_to_13(isbn) + logger.info("lookup failed for %s trying isbn13 %s", isbn, isbn_13) + edition = add_book(isbn_13) else: - return None + logger.info("lookup failed for %s", isbn) + + return edition -def save_edition(edition_data): - edition = models.Edition() +def _save_edition(edition_data): + edition_key = edition_data['key'] + edition, created = models.Edition.objects.get_or_create(openlibrary_id=edition_key) edition.title = edition_data.get('title') edition.description = edition_data.get('description') - edition.publisher = first(edition_data, 'publishers') + edition.publisher = _first(edition_data, 'publishers') edition.publication_date = edition_data.get('publish_date') + + # assumption: OL has only one isbn_10 or isbn_13 for an edition + edition.isbn_10 = _first(edition_data, 'isbn_10') + edition.isbn_13 = _first(edition_data, 'isbn_13') + edition.save() for work_data in edition_data.get('works', []): - save_work(work_data['key'], edition) - - for isbn_10 in edition_data.get('isbn_10', []): - models.EditionIdentifier.objects.get_or_create(name='isbn_10', value=isbn_10, edition=edition) - - for isbn_13 in edition_data.get('isbn_13', []): - models.EditionIdentifier.objects.get_or_create(name='isbn_13', value=isbn_13, edition=edition) + _save_work(work_data['key'], edition) for cover_id in edition_data.get('covers', []): models.EditionCover.objects.get_or_create(openlibrary_id=cover_id, edition=edition) @@ -49,17 +57,16 @@ def save_edition(edition_data): return edition -def save_work(work_key, edition): +def _save_work(work_key, edition): url = "http://openlibrary.org" + work_key - work_data = get_json(url) + work_data = _get_json(url) - work = models.Work() + work, created = models.Work.objects.get_or_create(openlibrary_id=work_key) work.title = work_data.get('title') - work.openlibrary_id = work_key work.save() for author_data in work_data.get('authors', []): - save_author(author_data['author']['key'], work) + _save_author(author_data['author']['key'], work) for subject_name in work_data.get('subjects', []): subject, created = models.Subject.objects.get_or_create(name=subject_name) @@ -70,11 +77,11 @@ def save_work(work_key, edition): return work -def save_author(author_key, work): +def _save_author(author_key, work): url = "http://openlibrary.org" + author_key - author_data = get_json(url) + author_data = _get_json(url) - author = models.Author() + author, created = models.Author.objects.get_or_create(openlibrary_id=author_key) author.name = author_data['name'] author.save() @@ -83,19 +90,19 @@ def save_author(author_key, work): return author -def first(dictionary, key): +def _first(dictionary, key): l = dictionary.get(key, []) if len(l) == 0: return None return l[0] -def get_json(url, params={}): - headers = {'User-Agent': 'unglue.it bot', 'Accept': 'application/json'} +def _get_json(url, params={}): + headers = {'User-Agent': settings.USER_AGENT, 'Accept': 'application/json'} response = requests.get(url, params=params, headers=headers) if response.status_code == 200: return json.loads(response.content) else: - logging.error("unexpected HTTP response: %s" % response) + logger.error("unexpected HTTP response: %s" % response) raise LookupFailure("GET failed: url=%s and params=%s" % (url, params)) diff --git a/core/management/commands/load_books.py b/core/management/commands/load_books.py index e6555181..9fa5d5e5 100644 --- a/core/management/commands/load_books.py +++ b/core/management/commands/load_books.py @@ -1,6 +1,6 @@ from django.core.management.base import BaseCommand -from regluit.core import books +from regluit.core import bookloader class Command(BaseCommand): help = "load books based on a text file of ISBNs" @@ -9,7 +9,7 @@ class Command(BaseCommand): def handle(self, filename, **options): for isbn in open(filename): isbn = isbn.strip() - edition = books.add_book(isbn) + edition = bookloader.add_book(isbn) if edition: print edition else: diff --git a/core/migrations/0010_auto__del_editionidentifier__del_workidentifier__add_field_work_openli.py b/core/migrations/0010_auto__del_editionidentifier__del_workidentifier__add_field_work_openli.py new file mode 100644 index 00000000..7e86a20f --- /dev/null +++ b/core/migrations/0010_auto__del_editionidentifier__del_workidentifier__add_field_work_openli.py @@ -0,0 +1,170 @@ +# encoding: utf-8 +import datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + +class Migration(SchemaMigration): + + def forwards(self, orm): + + # Deleting model 'EditionIdentifier' + db.delete_table('core_editionidentifier') + + # Deleting model 'WorkIdentifier' + db.delete_table('core_workidentifier') + + # Adding field 'Work.openlibrary_id' + db.add_column('core_work', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False) + + # Adding field 'Edition.isbn_10' + db.add_column('core_edition', 'isbn_10', self.gf('django.db.models.fields.CharField')(max_length=10, null=True), keep_default=False) + + # Adding field 'Edition.isbn_13' + db.add_column('core_edition', 'isbn_13', self.gf('django.db.models.fields.CharField')(max_length=13, null=True), keep_default=False) + + # Adding field 'Edition.openlibrary_id' + db.add_column('core_edition', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False) + + # Adding field 'Author.openlibrary_id' + db.add_column('core_author', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False) + + + def backwards(self, orm): + + # Adding model 'EditionIdentifier' + db.create_table('core_editionidentifier', ( + ('name', self.gf('django.db.models.fields.CharField')(max_length=10)), + ('created', self.gf('django.db.models.fields.DateTimeField')(auto_now_add=True, blank=True)), + ('value', self.gf('django.db.models.fields.CharField')(max_length=500)), + ('edition', self.gf('django.db.models.fields.related.ForeignKey')(related_name='identifiers', to=orm['core.Edition'])), + ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + )) + db.send_create_signal('core', ['EditionIdentifier']) + + # Adding model 'WorkIdentifier' + db.create_table('core_workidentifier', ( + ('name', self.gf('django.db.models.fields.CharField')(max_length=10)), + ('created', self.gf('django.db.models.fields.DateTimeField')(auto_now_add=True, blank=True)), + ('work', self.gf('django.db.models.fields.related.ForeignKey')(related_name='identifiers', to=orm['core.Work'])), + ('value', self.gf('django.db.models.fields.CharField')(max_length=500)), + ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + )) + db.send_create_signal('core', ['WorkIdentifier']) + + # Deleting field 'Work.openlibrary_id' + db.delete_column('core_work', 'openlibrary_id') + + # Deleting field 'Edition.isbn_10' + db.delete_column('core_edition', 'isbn_10') + + # Deleting field 'Edition.isbn_13' + db.delete_column('core_edition', 'isbn_13') + + # Deleting field 'Edition.openlibrary_id' + db.delete_column('core_edition', 'openlibrary_id') + + # Deleting field 'Author.openlibrary_id' + db.delete_column('core_author', 'openlibrary_id') + + + models = { + 'auth.group': { + 'Meta': {'object_name': 'Group'}, + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}), + 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}) + }, + 'auth.permission': { + 'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'}, + 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + 'auth.user': { + 'Meta': {'object_name': 'User'}, + 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}), + 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}), + 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'}) + }, + 'contenttypes.contenttype': { + 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"}, + 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}) + }, + 'core.author': { + 'Meta': {'object_name': 'Author'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}), + 'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}), + 'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'authors'", 'symmetrical': 'False', 'to': "orm['core.Work']"}) + }, + 'core.campaign': { + 'Meta': {'object_name': 'Campaign'}, + 'amazon_receiver': ('django.db.models.fields.CharField', [], {'max_length': '100', 'null': 'True'}), + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'deadline': ('django.db.models.fields.DateTimeField', [], {}), + 'description': ('django.db.models.fields.CharField', [], {'max_length': '10000'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}), + 'paypal_receiver': ('django.db.models.fields.CharField', [], {'max_length': '100', 'null': 'True'}), + 'target': ('django.db.models.fields.FloatField', [], {}), + 'work': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'campaign'", 'to': "orm['core.Work']"}) + }, + 'core.edition': { + 'Meta': {'object_name': 'Edition'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'default': "''"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'isbn_10': ('django.db.models.fields.CharField', [], {'max_length': '10', 'null': 'True'}), + 'isbn_13': ('django.db.models.fields.CharField', [], {'max_length': '13', 'null': 'True'}), + 'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}), + 'publication_date': ('django.db.models.fields.CharField', [], {'max_length': '50'}), + 'publisher': ('django.db.models.fields.CharField', [], {'max_length': '255'}), + 'title': ('django.db.models.fields.CharField', [], {'max_length': '1000'}), + 'work': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'editions'", 'to': "orm['core.Work']"}) + }, + 'core.editioncover': { + 'Meta': {'object_name': 'EditionCover'}, + 'edition': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'covers'", 'to': "orm['core.Edition']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'openlibrary_id': ('django.db.models.fields.IntegerField', [], {}) + }, + 'core.subject': { + 'Meta': {'object_name': 'Subject'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}), + 'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'subjects'", 'symmetrical': 'False', 'to': "orm['core.Work']"}) + }, + 'core.wishlist': { + 'Meta': {'object_name': 'Wishlist'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'user': ('django.db.models.fields.related.OneToOneField', [], {'related_name': "'wishlist'", 'unique': 'True', 'to': "orm['auth.User']"}), + 'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'wishlists'", 'symmetrical': 'False', 'to': "orm['core.Work']"}) + }, + 'core.work': { + 'Meta': {'object_name': 'Work'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}), + 'title': ('django.db.models.fields.CharField', [], {'max_length': '1000'}) + } + } + + complete_apps = ['core'] diff --git a/core/models.py b/core/models.py index e16e01f4..e1494d7b 100755 --- a/core/models.py +++ b/core/models.py @@ -12,66 +12,60 @@ class Campaign(models.Model): paypal_receiver = models.CharField(max_length=100, null=True) amazon_receiver = models.CharField(max_length=100, null=True) work = models.ForeignKey("Work", related_name="campaign") - + + def __unicode__(self): + return u"Campaign for %s" % self.work.title + + + class Work(models.Model): created = models.DateTimeField(auto_now_add=True) title = models.CharField(max_length=1000) + openlibrary_id = models.CharField(max_length=50, null=True) + + def __unicode__(self): + return self.title -class WorkIdentifier(models.Model): - created = models.DateTimeField(auto_now_add=True) - name = models.CharField(max_length=10) - value = models.CharField(max_length=500) - work = models.ForeignKey("Work", related_name="identifiers") class Author(models.Model): created = models.DateTimeField(auto_now_add=True) name = models.CharField(max_length=500) + openlibrary_id = models.CharField(max_length=50, null=True) works = models.ManyToManyField("Work", related_name="authors") def __unicode__(self): return self.name + class Subject(models.Model): created = models.DateTimeField(auto_now_add=True) name = models.CharField(max_length=500) works = models.ManyToManyField("Work", related_name="subjects") + def __unicode__(self): + return self.name + + class Edition(models.Model): created = models.DateTimeField(auto_now_add=True) title = models.CharField(max_length=1000) description = models.TextField(default='') publisher = models.CharField(max_length=255) publication_date = models.CharField(max_length=50) + isbn_10 = models.CharField(max_length=10, null=True) + isbn_13 = models.CharField(max_length=13, null=True) + openlibrary_id = models.CharField(max_length=50, null=True) work = models.ForeignKey("Work", related_name="editions") - @property - def isbn_10(self): - return self._id('isbn_10') - - @property - def isbn_13(self): - return self._id('isbn_13') - def __unicode__(self): return self.title - def _id(self, name): - for i in self.identifiers.all(): - if i.name == name: - return i.value - return None - - -class EditionIdentifier(models.Model): - created = models.DateTimeField(auto_now_add=True) - name = models.CharField(max_length=10) - value = models.CharField(max_length=500) - edition = models.ForeignKey("Edition", related_name="identifiers") class EditionCover(models.Model): openlibrary_id = models.IntegerField() edition = models.ForeignKey("Edition", related_name="covers") + class Wishlist(models.Model): created = models.DateTimeField(auto_now_add=True) user = models.OneToOneField(User, related_name='wishlist') diff --git a/core/tests.py b/core/tests.py index 0a3083a2..28509ee3 100755 --- a/core/tests.py +++ b/core/tests.py @@ -1,28 +1,43 @@ from django.test import TestCase -from regluit.core import books +from regluit.core import bookloader, models class TestBooks(TestCase): def test_add_book(self): - edition = books.add_book(isbn='0441012035') + # edition + edition = bookloader.add_book(isbn='0441012035') self.assertEqual(edition.title, 'Neuromancer') self.assertEqual(edition.publication_date, '2004') self.assertEqual(edition.publisher, 'Ace Books') - self.assertEqual(edition.isbn_10, '0441012035') self.assertEqual(edition.isbn_13, None) + self.assertEqual(edition.openlibrary_id, "/books/OL3305354M") + # edition covers covers = edition.covers.all() self.assertEqual(len(covers), 1) self.assertEqual(covers[0].openlibrary_id, 284192) + # work work = edition.work self.assertTrue(work) self.assertEqual(work.authors.all()[0].name, 'William F. Gibson') + # subjects subject_names = [subject.name for subject in work.subjects.all()] - self.assertTrue(len(subject_names) > 15) + self.assertEqual(len(subject_names), 18) self.assertTrue('Fiction' in subject_names) + # authors + author_names = [author.name for author in work.authors.all()] + self.assertEqual(len(author_names), 1) + self.assertEqual(author_names[0], "William F. Gibson") + def test_double_add(self): + bookloader.add_book(isbn='0441012035') + bookloader.add_book(isbn='0441012035') + self.assertEqual(models.Author.objects.all().count(), 1) + self.assertEqual(models.Work.objects.all().count(), 1) + self.assertEqual(models.Subject.objects.all().count(), 18) + diff --git a/settings/common.py b/settings/common.py index 5cf317a8..6a7abbea 100644 --- a/settings/common.py +++ b/settings/common.py @@ -114,7 +114,12 @@ INSTALLED_APPS = ( # more details on how to customize your logging configuration. LOGGING = { 'version': 1, - 'disable_existing_loggers': False, + 'disable_existing_loggers': True, + 'formatters': { + 'brief': { + 'format': '%(asctime)s %(levelname)s %(name)s[%(funcName)s]: %(message)s', + }, + }, 'handlers': { 'mail_admins': { 'level': 'ERROR', @@ -123,7 +128,10 @@ LOGGING = { 'file': { 'level': 'INFO', 'class': 'logging.handlers.RotatingFileHandler', - 'filename': join(PROJECT_DIR, 'logs', 'django.log') + 'filename': join(PROJECT_DIR, 'logs', 'unglue.it.log'), + 'maxBytes': 1024*1024*5, # 5 MB + 'backupCount': 5, + 'formatter': 'brief', }, }, 'loggers': { @@ -132,6 +140,10 @@ LOGGING = { 'level': 'ERROR', 'propagate': True, }, + '': { + 'handlers': ['file'], + 'level': 'INFO', + } } } @@ -142,7 +154,6 @@ ACCOUNT_ACTIVATION_DAYS = 7 # django-social-auth AUTHENTICATION_BACKENDS = ( - 'social_auth.backends.google.GoogleOAuth2Backend', 'social_auth.backends.facebook.FacebookBackend', 'social_auth.backends.twitter.TwitterBackend', @@ -169,4 +180,4 @@ LOGIN_URL = "/accounts/login/" LOGIN_REDIRECT_URL = "/" LOGOUT_URL = "/accounts/logout/" -USER_AGENT = "unglue.it bot " +USER_AGENT = "unglue.it.bot v0.0.1 "