import json
import logging
import datetime
from xml.etree import ElementTree

import requests

from django.db.models import Q
from django.conf import settings
from django.db import IntegrityError

from regluit.core import models
import regluit.core.isbn

logger = logging.getLogger(__name__)


def add_by_oclc(oclc):
    logger.info("adding book by oclc %s", oclc)
    # if we already have an edition with this OCLC number, just return it
    for edition in models.Edition.objects.filter(oclc=oclc):
        return edition

    url = "https://www.googleapis.com/books/v1/volumes"
    results = _get_json(url, {"q": '"OCLC%s"' % oclc})

    if 'items' not in results or len(results['items']) == 0:
        logger.warning("no google hits for %s", oclc)
        return None

    try:
        e = add_by_googlebooks_id(results['items'][0]['id'])
        e.oclc = oclc
        e.save()
        return e
    except LookupFailure:
        logger.exception("failed to add edition for %s", oclc)
    except IntegrityError:
        logger.exception("google books data for %s didn't fit our db", oclc)
    return None


def add_by_isbn(isbn, work=None):
    """add a book to the UnglueIt database based on ISBN. The work parameter
    is optional, and if not supplied the edition will be associated with
    a stub work.
    """
    if not isbn:
        return None
    # normalize to ISBN-13, which is what the database stores
    if len(isbn) == 10:
        isbn = regluit.core.isbn.convert_10_to_13(isbn)

    logger.info("adding book by isbn %s", isbn)

    # save a lookup to google if we already have this isbn
    has_isbn = Q(isbn_13=isbn)
    for edition in models.Edition.objects.filter(has_isbn):
        edition.new = False
        return edition

    url = "https://www.googleapis.com/books/v1/volumes"
    results = _get_json(url, {"q": "isbn:%s" % isbn})

    if 'items' not in results or len(results['items']) == 0:
        logger.warning("no google hits for %s", isbn)
        return None

    try:
        return add_by_googlebooks_id(results['items'][0]['id'], work)
    except LookupFailure:
        logger.exception("failed to add edition for %s", isbn)
    except IntegrityError:
        logger.exception("google books data for %s didn't fit our db", isbn)
    return None


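# Usage sketch (illustrative, not from the original module): assumes a
# configured Django environment, database access, and network access to the
# google books API; the ISBN is just an example value.
#
#     edition = add_by_isbn("9780441569595")
#     if edition is not None and getattr(edition, 'new', False):
#         logger.debug("created a fresh edition: %s", edition.title)

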
def add_by_googlebooks_id(googlebooks_id, work=None):
    """add a book to the UnglueIt database based on the GoogleBooks ID. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work.
    """
    # don't ping google again if we already know about the edition
    try:
        e = models.Edition.objects.get(googlebooks_id=googlebooks_id)
        return e
    except models.Edition.DoesNotExist:
        pass

    logger.info("loading metadata from google for %s", googlebooks_id)
    url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
    item = _get_json(url)
    d = item['volumeInfo']

    # don't add the edition to a work with a different language
    # https://www.pivotaltracker.com/story/show/17234433
    language = d.get('language')
    if work and work.language != language:
        logger.warning("ignoring %s since it is %s instead of %s",
                       googlebooks_id, language, work.language)
        return None

    e = models.Edition(googlebooks_id=googlebooks_id)
    e.title = d.get('title')
    e.description = d.get('description')
    e.publisher = d.get('publisher')
    e.publication_date = d.get('publishedDate', '')

    # store everything as ISBN-13; convert an ISBN-10 if that's all google has
    for i in d.get('industryIdentifiers', []):
        if i['type'] == 'ISBN_13':
            e.isbn_13 = i['identifier']
        elif i['type'] == 'ISBN_10' and not e.isbn_13:
            e.isbn_13 = regluit.core.isbn.convert_10_to_13(i['identifier'])

    e.save()
    # flag the edition as freshly created so callers can distinguish it from
    # one that was already in the database (see add_by_isbn)
    e.new = True

    for name in d.get('authors', []):
        a, created = models.Author.objects.get_or_create(name=name)
        a.editions.add(e)

    access_info = item.get('accessInfo')
    if access_info:
        # the google books API reports public domain status inside accessInfo
        e.public_domain = access_info.get('publicDomain', None)

        epub = access_info.get('epub')
        if epub and epub.get('downloadLink'):
            ebook = models.Ebook(edition=e, format='epub',
                                 url=epub.get('downloadLink'),
                                 provider='google')
            ebook.save()

        pdf = access_info.get('pdf')
        if pdf and pdf.get('downloadLink'):
            ebook = models.Ebook(edition=e, format='pdf',
                                 url=pdf.get('downloadLink'),
                                 provider='google')
            ebook.save()

    # if we know what work the edition should be attached to, attach it
    if work:
        work.editions.add(e)
    # otherwise we need to create a stub work
    else:
        w = models.Work.objects.create(title=e.title, language=language)
        w.editions.add(e)

    return e


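# Usage sketch (illustrative; the volume id is a placeholder): when `work`
# is supplied and its language differs from the google volume's language,
# the function logs a warning and returns None instead of an edition.
#
#     edition = add_by_googlebooks_id("zyTCAlFPjgYC", work=my_work)
#     if edition is None:
#         pass  # nothing was added, e.g. because of a language mismatch

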
def add_related(isbn):
    """add all books related to a particular ISBN to the UnglueIt database.
    The initial seed ISBN will be added if it's not already there.
    """
    # make sure the seed edition is there
    logger.info("adding related editions for %s", isbn)
    edition = add_by_isbn(isbn)
    if edition is None:
        logger.warning("could not add seed edition for %s", isbn)
        return []

    # this is the work everything will hang off
    work = edition.work

    new_editions = []
    for other_isbn in thingisbn(isbn):
        # 979 ISBNs come back from LibraryThing as 13 digits; convert the rest
        if len(other_isbn) == 10:
            other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
        related_edition = add_by_isbn(other_isbn, work)
        if related_edition:
            if related_edition.work != edition.work:
                merge_works(edition.work, related_edition.work)
            new_editions.append(related_edition)

    return new_editions


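# Usage sketch (illustrative): seeding from a single ISBN pulls in the
# sibling editions LibraryThing knows about and clusters them onto the
# seed edition's work, merging stub works as needed.
#
#     editions = add_related("9780441569595")
#     # every edition in the returned list now hangs off the seed's work

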
def thingisbn(isbn):
    """given an ISBN, return a list of related edition ISBNs according to
    LibraryThing. (takes isbn_10 or isbn_13; returns isbn_10, except for
    979 ISBNs, which come back as isbn_13)
    """
    logger.info("looking up %s at ThingISBN", isbn)
    url = "http://www.librarything.com/api/thingISBN/%s" % isbn
    xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
    doc = ElementTree.fromstring(xml)
    return [e.text for e in doc.findall('isbn')]


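# For reference, the ThingISBN response parsed above has roughly this shape
# (a sketch inferred from the parsing code, not captured output; the values
# are placeholders), which is why findall('isbn') on the root element works:
#
#     <idlist>
#       <isbn>0441569595</isbn>
#       <isbn>0586066632</isbn>
#     </idlist>

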
def merge_works(w1, w2):
    """will merge the second work (w2) into the first (w1)
    """
    logger.info("merging work %s into %s", w2, w1)
    for edition in w2.editions.all():
        edition.work = w1
        edition.save()
    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()
    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        # preserve where the wishlist entry originally came from
        w2source = wishlist.work_source(w2)
        wishlist.remove_work(w2)
        wishlist.add_work(w1, w2source)
    # TODO: should we decommission w2 instead of deleting it, so that we can
    # redirect from the old work URL to the new one?
    w2.delete()


def add_openlibrary(work):
    work.openlibrary_lookup = datetime.datetime.now()
    work.save()

    # find the first ISBN match in OpenLibrary
    logger.info("looking up openlibrary data for work %s", work.id)
    found = False
    e = None  # openlibrary edition json
    w = None  # openlibrary work json

    # get the 1st openlibrary match by isbn that has an associated work
    url = "http://openlibrary.org/api/books"
    params = {"format": "json", "jscmd": "details"}
    for edition in work.editions.all():
        isbn_key = "ISBN:%s" % edition.isbn_13
        params['bibkeys'] = isbn_key
        e = _get_json(url, params)
        if isbn_key in e and 'works' in e[isbn_key]['details']:
            work_key = e[isbn_key]['details']['works'].pop(0)['key']
            logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
            w = _get_json("http://openlibrary.org" + work_key)
            if 'subjects' in w:
                found = True
                break

    if not found:
        logger.warning("unable to find work %s at openlibrary", work.id)
        return

    # add the subjects to the Work
    for s in w.get('subjects', []):
        logger.info("adding subject %s to work %s", s, work.id)
        subject, created = models.Subject.objects.get_or_create(name=s)
        work.subjects.add(subject)

    work.openlibrary_id = w['key']
    work.save()
    # TODO: add authors here once they are moved from Edition to Work
    # TODO: add LCCN, LibraryThing, GoodReads to appropriate models


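# For reference, the openlibrary books API call in add_openlibrary amounts
# to a GET like the following (a sketch; the ISBN and work key are
# placeholders, and the response is trimmed to the keys the code reads):
#
#     GET http://openlibrary.org/api/books?bibkeys=ISBN:9780441569595&format=json&jscmd=details
#
#     {"ISBN:9780441569595": {"details": {"works": [{"key": "/works/OL27258W"}]}}}

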
def _get_json(url, params=None):
    # a mutable default argument would be shared across calls, so build the
    # dict inside the function instead
    if params is None:
        params = {}
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}
    params['key'] = settings.GOOGLE_BOOKS_API_KEY
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)
    else:
        logger.error("unexpected HTTP response: %s", response)
        raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))


class LookupFailure(Exception):
    pass