handle duplicates using openlibrary ids for edition, work and author

pull/1/head
Ed Summers 2011-09-10 11:36:38 +00:00
parent 5de83896e1
commit 2c7f858125
6 changed files with 259 additions and 62 deletions

View File

@ -12,36 +12,44 @@ from django.conf import settings
from regluit.core import models
from regluit.core.isbn import convert_10_to_13
logger = logging.getLogger(__name__)
def add_book(isbn):
    """Look up an ISBN on Open Library and store the matching edition.

    Queries the Open Library books API for ``isbn``. When the response
    contains an entry for that ISBN, its ``details`` record is passed to
    ``_save_edition``. When it does not and ``isbn`` is 10 characters long,
    the lookup is retried with the value returned by ``convert_10_to_13``.

    Returns the result of ``_save_edition`` for the match, or ``None`` when
    no lookup finds the book.
    """
    url = "http://openlibrary.org/api/books"
    bibkeys = "ISBN:%s" % isbn
    params = {"bibkeys": bibkeys, "jscmd": "details", "format": "json"}
    results = _get_json(url, params)

    edition = None
    # `in` rather than dict.has_key(), which is deprecated.
    if bibkeys in results:
        logger.info("saving book info for %s", isbn)
        edition = _save_edition(results[bibkeys]['details'])
    elif len(isbn) == 10:
        # Open Library had no record under the 10-digit form; try ISBN-13.
        isbn_13 = convert_10_to_13(isbn)
        logger.info("lookup failed for %s trying isbn13 %s", isbn, isbn_13)
        edition = add_book(isbn_13)
    else:
        logger.info("lookup failed for %s", isbn)
    return edition
def save_edition(edition_data):
edition = models.Edition()
def _save_edition(edition_data):
edition_key = edition_data['key']
edition, created = models.Edition.objects.get_or_create(openlibrary_id=edition_key)
edition.title = edition_data.get('title')
edition.description = edition_data.get('description')
edition.publisher = first(edition_data, 'publishers')
edition.publisher = _first(edition_data, 'publishers')
edition.publication_date = edition_data.get('publish_date')
# assumption: OL has only one isbn_10 or isbn_13 for an edition
edition.isbn_10 = _first(edition_data, 'isbn_10')
edition.isbn_13 = _first(edition_data, 'isbn_13')
edition.save()
for work_data in edition_data.get('works', []):
save_work(work_data['key'], edition)
for isbn_10 in edition_data.get('isbn_10', []):
models.EditionIdentifier.objects.get_or_create(name='isbn_10', value=isbn_10, edition=edition)
for isbn_13 in edition_data.get('isbn_13', []):
models.EditionIdentifier.objects.get_or_create(name='isbn_13', value=isbn_13, edition=edition)
_save_work(work_data['key'], edition)
for cover_id in edition_data.get('covers', []):
models.EditionCover.objects.get_or_create(openlibrary_id=cover_id, edition=edition)
@ -49,17 +57,16 @@ def save_edition(edition_data):
return edition
def save_work(work_key, edition):
def _save_work(work_key, edition):
url = "http://openlibrary.org" + work_key
work_data = get_json(url)
work_data = _get_json(url)
work = models.Work()
work, created = models.Work.objects.get_or_create(openlibrary_id=work_key)
work.title = work_data.get('title')
work.openlibrary_id = work_key
work.save()
for author_data in work_data.get('authors', []):
save_author(author_data['author']['key'], work)
_save_author(author_data['author']['key'], work)
for subject_name in work_data.get('subjects', []):
subject, created = models.Subject.objects.get_or_create(name=subject_name)
@ -70,11 +77,11 @@ def save_work(work_key, edition):
return work
def save_author(author_key, work):
def _save_author(author_key, work):
url = "http://openlibrary.org" + author_key
author_data = get_json(url)
author_data = _get_json(url)
author = models.Author()
author, created = models.Author.objects.get_or_create(openlibrary_id=author_key)
author.name = author_data['name']
author.save()
@ -83,19 +90,19 @@ def save_author(author_key, work):
return author
def first(dictionary, key):
def _first(dictionary, key):
l = dictionary.get(key, [])
if len(l) == 0: return None
return l[0]
def get_json(url, params={}):
headers = {'User-Agent': 'unglue.it bot', 'Accept': 'application/json'}
def _get_json(url, params=None):
    """GET ``url`` and return the decoded JSON response body.

    ``params`` is an optional dict of query-string parameters. The request
    is sent with the ``settings.USER_AGENT`` user agent and asks for JSON.

    Raises ``LookupFailure`` (after logging the response) when the HTTP
    status is anything other than 200.
    """
    # None instead of {} as the default: a mutable default object is shared
    # between calls.
    if params is None:
        params = {}
    headers = {'User-Agent': settings.USER_AGENT, 'Accept': 'application/json'}
    response = requests.get(url, params=params, headers=headers)
    if response.status_code != 200:
        logger.error("unexpected HTTP response: %s", response)
        raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
    return json.loads(response.content)

View File

@ -1,6 +1,6 @@
from django.core.management.base import BaseCommand
from regluit.core import books
from regluit.core import bookloader
class Command(BaseCommand):
help = "load books based on a text file of ISBNs"
@ -9,7 +9,7 @@ class Command(BaseCommand):
def handle(self, filename, **options):
for isbn in open(filename):
isbn = isbn.strip()
edition = books.add_book(isbn)
edition = bookloader.add_book(isbn)
if edition:
print edition
else:

View File

@ -0,0 +1,170 @@
# encoding: utf-8
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
def forwards(self, orm):
# Apply the migration: drop the two identifier tables, then add nullable
# openlibrary_id columns to Work, Edition and Author plus nullable
# isbn_10 / isbn_13 columns to Edition.
# NOTE(review): nothing here copies existing identifier rows into the new
# columns before the tables are dropped -- confirm that losing them is fine.
# Deleting model 'EditionIdentifier'
db.delete_table('core_editionidentifier')
# Deleting model 'WorkIdentifier'
db.delete_table('core_workidentifier')
# Adding field 'Work.openlibrary_id'
db.add_column('core_work', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False)
# Adding field 'Edition.isbn_10'
db.add_column('core_edition', 'isbn_10', self.gf('django.db.models.fields.CharField')(max_length=10, null=True), keep_default=False)
# Adding field 'Edition.isbn_13'
db.add_column('core_edition', 'isbn_13', self.gf('django.db.models.fields.CharField')(max_length=13, null=True), keep_default=False)
# Adding field 'Edition.openlibrary_id'
db.add_column('core_edition', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False)
# Adding field 'Author.openlibrary_id'
db.add_column('core_author', 'openlibrary_id', self.gf('django.db.models.fields.CharField')(max_length=50, null=True), keep_default=False)
def backwards(self, orm):
# Reverse the migration: recreate the EditionIdentifier and WorkIdentifier
# tables, then drop the openlibrary_id and isbn columns from Work, Edition
# and Author.
# The recreated tables are not repopulated by this method.
# Adding model 'EditionIdentifier'
db.create_table('core_editionidentifier', (
('name', self.gf('django.db.models.fields.CharField')(max_length=10)),
('created', self.gf('django.db.models.fields.DateTimeField')(auto_now_add=True, blank=True)),
('value', self.gf('django.db.models.fields.CharField')(max_length=500)),
('edition', self.gf('django.db.models.fields.related.ForeignKey')(related_name='identifiers', to=orm['core.Edition'])),
('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
))
db.send_create_signal('core', ['EditionIdentifier'])
# Adding model 'WorkIdentifier'
db.create_table('core_workidentifier', (
('name', self.gf('django.db.models.fields.CharField')(max_length=10)),
('created', self.gf('django.db.models.fields.DateTimeField')(auto_now_add=True, blank=True)),
('work', self.gf('django.db.models.fields.related.ForeignKey')(related_name='identifiers', to=orm['core.Work'])),
('value', self.gf('django.db.models.fields.CharField')(max_length=500)),
('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
))
db.send_create_signal('core', ['WorkIdentifier'])
# Deleting field 'Work.openlibrary_id'
db.delete_column('core_work', 'openlibrary_id')
# Deleting field 'Edition.isbn_10'
db.delete_column('core_edition', 'isbn_10')
# Deleting field 'Edition.isbn_13'
db.delete_column('core_edition', 'isbn_13')
# Deleting field 'Edition.openlibrary_id'
db.delete_column('core_edition', 'openlibrary_id')
# Deleting field 'Author.openlibrary_id'
db.delete_column('core_author', 'openlibrary_id')
models = {
'auth.group': {
'Meta': {'object_name': 'Group'},
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
},
'auth.permission': {
'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'},
'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
},
'auth.user': {
'Meta': {'object_name': 'User'},
'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}),
'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'})
},
'contenttypes.contenttype': {
'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
},
'core.author': {
'Meta': {'object_name': 'Author'},
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}),
'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}),
'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'authors'", 'symmetrical': 'False', 'to': "orm['core.Work']"})
},
'core.campaign': {
'Meta': {'object_name': 'Campaign'},
'amazon_receiver': ('django.db.models.fields.CharField', [], {'max_length': '100', 'null': 'True'}),
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'deadline': ('django.db.models.fields.DateTimeField', [], {}),
'description': ('django.db.models.fields.CharField', [], {'max_length': '10000'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}),
'paypal_receiver': ('django.db.models.fields.CharField', [], {'max_length': '100', 'null': 'True'}),
'target': ('django.db.models.fields.FloatField', [], {}),
'work': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'campaign'", 'to': "orm['core.Work']"})
},
'core.edition': {
'Meta': {'object_name': 'Edition'},
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'description': ('django.db.models.fields.TextField', [], {'default': "''"}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'isbn_10': ('django.db.models.fields.CharField', [], {'max_length': '10', 'null': 'True'}),
'isbn_13': ('django.db.models.fields.CharField', [], {'max_length': '13', 'null': 'True'}),
'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}),
'publication_date': ('django.db.models.fields.CharField', [], {'max_length': '50'}),
'publisher': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
'title': ('django.db.models.fields.CharField', [], {'max_length': '1000'}),
'work': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'editions'", 'to': "orm['core.Work']"})
},
'core.editioncover': {
'Meta': {'object_name': 'EditionCover'},
'edition': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'covers'", 'to': "orm['core.Edition']"}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'openlibrary_id': ('django.db.models.fields.IntegerField', [], {})
},
'core.subject': {
'Meta': {'object_name': 'Subject'},
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'max_length': '500'}),
'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'subjects'", 'symmetrical': 'False', 'to': "orm['core.Work']"})
},
'core.wishlist': {
'Meta': {'object_name': 'Wishlist'},
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'user': ('django.db.models.fields.related.OneToOneField', [], {'related_name': "'wishlist'", 'unique': 'True', 'to': "orm['auth.User']"}),
'works': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'wishlists'", 'symmetrical': 'False', 'to': "orm['core.Work']"})
},
'core.work': {
'Meta': {'object_name': 'Work'},
'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'openlibrary_id': ('django.db.models.fields.CharField', [], {'max_length': '50', 'null': 'True'}),
'title': ('django.db.models.fields.CharField', [], {'max_length': '1000'})
}
}
complete_apps = ['core']

View File

@ -13,65 +13,59 @@ class Campaign(models.Model):
amazon_receiver = models.CharField(max_length=100, null=True)
work = models.ForeignKey("Work", related_name="campaign")
def __unicode__(self):
return u"Campaign for %s" % self.work.title
class Work(models.Model):
# A work. Edition, Author, Subject and Campaign each hold a relation
# pointing at this model.
# Set automatically when the row is first created.
created = models.DateTimeField(auto_now_add=True)
title = models.CharField(max_length=1000)
# Open Library key for the work; nullable.
openlibrary_id = models.CharField(max_length=50, null=True)
def __unicode__(self):
return self.title
class WorkIdentifier(models.Model):
created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=10)
value = models.CharField(max_length=500)
work = models.ForeignKey("Work", related_name="identifiers")
class Author(models.Model):
# An author, linked many-to-many to the works they wrote.
# Set automatically when the row is first created.
created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=500)
# Open Library key for the author; nullable.
openlibrary_id = models.CharField(max_length=50, null=True)
# Reverse accessor on Work is `authors`.
works = models.ManyToManyField("Work", related_name="authors")
def __unicode__(self):
return self.name
class Subject(models.Model):
# A subject heading, linked many-to-many to works.
# Set automatically when the row is first created.
created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=500)
# Reverse accessor on Work is `subjects`.
works = models.ManyToManyField("Work", related_name="subjects")
def __unicode__(self):
return self.name
class Edition(models.Model):
    """A single edition of a Work.

    ISBNs are stored directly in the ``isbn_10`` and ``isbn_13`` columns.
    The former ``isbn_10``/``isbn_13`` properties and the ``_id`` helper are
    gone: the properties shadowed the fields of the same name, and ``_id``
    read the ``identifiers`` relation.
    """
    # Set automatically when the row is first created.
    created = models.DateTimeField(auto_now_add=True)
    title = models.CharField(max_length=1000)
    description = models.TextField(default='')
    publisher = models.CharField(max_length=255)
    # Free-form text, not a date type.
    publication_date = models.CharField(max_length=50)
    isbn_10 = models.CharField(max_length=10, null=True)
    isbn_13 = models.CharField(max_length=13, null=True)
    # Open Library key for the edition; nullable.
    openlibrary_id = models.CharField(max_length=50, null=True)
    # Reverse accessor on Work is `editions`.
    work = models.ForeignKey("Work", related_name="editions")

    def __unicode__(self):
        return self.title
class EditionIdentifier(models.Model):
created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=10)
value = models.CharField(max_length=500)
edition = models.ForeignKey("Edition", related_name="identifiers")
class EditionCover(models.Model):
# A cover image reference belonging to one edition.
# Integer cover id as supplied by Open Library.
openlibrary_id = models.IntegerField()
# Reverse accessor on Edition is `covers`.
edition = models.ForeignKey("Edition", related_name="covers")
class Wishlist(models.Model):
created = models.DateTimeField(auto_now_add=True)
user = models.OneToOneField(User, related_name='wishlist')

View File

@ -1,28 +1,43 @@
from django.test import TestCase
from regluit.core import books
from regluit.core import bookloader, models
class TestBooks(TestCase):
    """Tests for ``bookloader.add_book``.

    These call the real loader, which issues HTTP requests to
    openlibrary.org, so the expected values mirror the live record for
    ISBN 0441012035.
    """

    def test_add_book(self):
        """Loading one ISBN stores the edition, its covers, work, subjects and authors."""
        # edition
        edition = bookloader.add_book(isbn='0441012035')
        self.assertEqual(edition.title, 'Neuromancer')
        self.assertEqual(edition.publication_date, '2004')
        self.assertEqual(edition.publisher, 'Ace Books')
        self.assertEqual(edition.isbn_10, '0441012035')
        self.assertEqual(edition.isbn_13, None)
        self.assertEqual(edition.openlibrary_id, "/books/OL3305354M")

        # edition covers
        covers = edition.covers.all()
        self.assertEqual(len(covers), 1)
        self.assertEqual(covers[0].openlibrary_id, 284192)

        # work
        work = edition.work
        self.assertTrue(work)

        # subjects
        subject_names = [subject.name for subject in work.subjects.all()]
        self.assertEqual(len(subject_names), 18)
        self.assertTrue('Fiction' in subject_names)

        # authors
        author_names = [author.name for author in work.authors.all()]
        self.assertEqual(len(author_names), 1)
        self.assertEqual(author_names[0], "William F. Gibson")

    def test_double_add(self):
        """Loading the same ISBN twice must not duplicate authors, works or subjects."""
        bookloader.add_book(isbn='0441012035')
        bookloader.add_book(isbn='0441012035')
        self.assertEqual(models.Author.objects.all().count(), 1)
        self.assertEqual(models.Work.objects.all().count(), 1)
        self.assertEqual(models.Subject.objects.all().count(), 18)

View File

@ -114,7 +114,12 @@ INSTALLED_APPS = (
# more details on how to customize your logging configuration.
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'disable_existing_loggers': True,
'formatters': {
'brief': {
'format': '%(asctime)s %(levelname)s %(name)s[%(funcName)s]: %(message)s',
},
},
'handlers': {
'mail_admins': {
'level': 'ERROR',
@ -123,7 +128,10 @@ LOGGING = {
'file': {
'level': 'INFO',
'class': 'logging.handlers.RotatingFileHandler',
'filename': join(PROJECT_DIR, 'logs', 'django.log')
'filename': join(PROJECT_DIR, 'logs', 'unglue.it.log'),
'maxBytes': 1024*1024*5, # 5 MB
'backupCount': 5,
'formatter': 'brief',
},
},
'loggers': {
@ -132,6 +140,10 @@ LOGGING = {
'level': 'ERROR',
'propagate': True,
},
'': {
'handlers': ['file'],
'level': 'INFO',
}
}
}
@ -142,7 +154,6 @@ ACCOUNT_ACTIVATION_DAYS = 7
# django-social-auth
AUTHENTICATION_BACKENDS = (
'social_auth.backends.google.GoogleOAuth2Backend',
'social_auth.backends.facebook.FacebookBackend',
'social_auth.backends.twitter.TwitterBackend',
@ -169,4 +180,4 @@ LOGIN_URL = "/accounts/login/"
LOGIN_REDIRECT_URL = "/"
LOGOUT_URL = "/accounts/logout/"
USER_AGENT = "unglue.it bot <http://unglue.it>"
USER_AGENT = "unglue.it.bot v0.0.1 <http://unglue.it>"