regluit/core/bookloader.py

195 lines
6.4 KiB
Python
Executable File

import json
import logging
import requests
from xml.etree import ElementTree
from django.db.models import Q
from django.conf import settings
from django.db import IntegrityError
from regluit.core import models
logger = logging.getLogger(__name__)
def add_by_oclc(oclc):
    """add a book to the UnglueIt database based on its OCLC number.

    An edition already stored with this OCLC number is returned as-is.
    Otherwise Google Books is searched for the number, the first hit is
    loaded with add_by_googlebooks_id, and the resulting edition is tagged
    with the OCLC number and saved.

    Returns the edition, or None when Google has no hits or when loading
    the first hit fails with LookupFailure or IntegrityError. A
    LookupFailure raised by the search request itself is not caught here
    and propagates to the caller.
    """
    logger.info("adding book by oclc %s", oclc)

    # save a lookup to google if we already have this oclc number;
    # the first matching edition (if any) wins
    for edition in models.Edition.objects.filter(oclc=oclc):
        return edition

    url = "https://www.googleapis.com/books/v1/volumes"
    results = _get_json(url, {"q": '"OCLC%s"' % oclc})

    # a missing or empty 'items' list both mean "no hits"
    if not results.get('items'):
        logger.warning("no google hits for %s", oclc)
        return None

    try:
        edition = add_by_googlebooks_id(results['items'][0]['id'])
        edition.oclc = oclc
        edition.save()
        return edition
    except LookupFailure:
        logger.exception("failed to add edition for %s", oclc)
    except IntegrityError:
        logger.exception("google books data for %s didn't fit our db", oclc)
    return None
def add_by_isbn(isbn, work=None):
    """add a book to the UnglueIt database based on ISBN. The work parameter
    is optional, and if not supplied the edition will be associated with
    a stub work.

    An edition whose isbn_10 or isbn_13 already matches is returned as-is.
    Otherwise Google Books is searched for the ISBN and the first hit is
    loaded with add_by_googlebooks_id.

    Returns the edition, or None when Google has no hits or when loading
    the first hit fails with LookupFailure or IntegrityError. A
    LookupFailure raised by the search request itself is not caught here
    and propagates to the caller.
    """
    logger.info("adding book by isbn %s", isbn)

    # save a lookup to google if we already have this isbn;
    # the first matching edition (if any) wins
    has_isbn = Q(isbn_10=isbn) | Q(isbn_13=isbn)
    for edition in models.Edition.objects.filter(has_isbn):
        return edition

    url = "https://www.googleapis.com/books/v1/volumes"
    results = _get_json(url, {"q": "isbn:%s" % isbn})

    # a missing or empty 'items' list both mean "no hits"
    if not results.get('items'):
        logger.warning("no google hits for %s", isbn)
        return None

    try:
        return add_by_googlebooks_id(results['items'][0]['id'], work)
    except LookupFailure:
        logger.exception("failed to add edition for %s", isbn)
    except IntegrityError:
        logger.exception("google books data for %s didn't fit our db", isbn)
    return None
def add_by_googlebooks_id(googlebooks_id, work=None):
    """add a book to the UnglueIt database based on the GoogleBooks ID. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work.

    If an edition with this GoogleBooks ID already exists it is returned
    without contacting Google. Otherwise the volume metadata is fetched and
    copied onto a new edition, along with its authors, subjects and any
    epub/pdf download links (stored as Ebook records).

    Returns the edition. Raises LookupFailure if the metadata request
    fails; in that case the placeholder edition created for the lookup is
    removed again.
    """
    # don't ping google again if we already know about the edition
    e, created = models.Edition.objects.get_or_create(googlebooks_id=googlebooks_id)
    if not created:
        return e

    logger.info("loading metadata from google for %s", googlebooks_id)
    url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
    try:
        item = _get_json(url)
    except LookupFailure:
        # don't leave an empty placeholder behind: the early return above
        # would hand it out as a fully loaded edition on the next call
        e.delete()
        raise

    d = item['volumeInfo']
    e.title = d.get('title')
    e.description = d.get('description')
    e.publisher = d.get('publisher')
    e.publication_date = d.get('publishedDate', '')
    e.language = d.get('language')

    for identifier in d.get('industryIdentifiers', []):
        if identifier['type'] == 'ISBN_10':
            e.isbn_10 = identifier['identifier']
        elif identifier['type'] == 'ISBN_13':
            e.isbn_13 = identifier['identifier']

    for author_name in d.get('authors', []):
        author, _ = models.Author.objects.get_or_create(name=author_name)
        author.editions.add(e)

    for subject_name in d.get('categories', []):
        subject, _ = models.Subject.objects.get_or_create(name=subject_name)
        subject.editions.add(e)

    access_info = item.get('accessInfo')
    if access_info:
        # the public-domain flag lives inside accessInfo as 'publicDomain';
        # the volume has no top-level 'public_domain' key
        e.public_domain = access_info.get('publicDomain', None)

        epub = access_info.get('epub')
        if epub and epub.get('downloadLink'):
            ebook = models.Ebook(edition=e, format='epub',
                                 url=epub.get('downloadLink'),
                                 provider='google')
            ebook.save()

        pdf = access_info.get('pdf')
        if pdf and pdf.get('downloadLink'):
            ebook = models.Ebook(edition=e, format='pdf',
                                 url=pdf.get('downloadLink', None),
                                 provider='google')
            ebook.save()

    # if we know what work to add the edition to do it
    if work:
        work.editions.add(e)
    # otherwise we need to create a stub work
    else:
        w = models.Work.objects.create(title=e.title)
        w.editions.add(e)

    return e
def add_related(isbn):
    """add all books related to a particular ISBN to the UnglueIt database.
    The initial seed ISBN will be added if it's not already there.

    Related ISBNs come from thingisbn. Each one is added against the seed
    edition's work; if a related edition turns out to belong to a different
    work, that work is merged into the seed edition's work.

    Returns the list of related editions that could be added. The list is
    empty when the seed edition itself could not be added.
    """
    logger.info("adding related editions for %s", isbn)

    # make sure the seed edition is there
    edition = add_by_isbn(isbn)
    if edition is None:
        # add_by_isbn returns None when there are no hits or loading failed;
        # without a seed edition there is no work to hang anything off
        return []

    # this is the work everything will hang off
    work = edition.work

    new_editions = []
    for other_isbn in thingisbn(isbn):
        related_edition = add_by_isbn(other_isbn, work)
        if not related_edition:
            continue
        # an edition we already knew about may sit under another work
        if related_edition.work != edition.work:
            merge_works(edition.work, related_edition.work)
        new_editions.append(related_edition)
    return new_editions
def thingisbn(isbn):
    """Look up an ISBN at Library Thing's ThingISBN service and return the
    text of every 'isbn' element in the XML response, as a list.
    """
    logger.info("looking up %s at ThingISBN" % isbn)
    endpoint = "http://www.librarything.com/api/thingISBN/%s" % isbn
    request_headers = {"User-Agent": settings.USER_AGENT}
    response = requests.get(endpoint, headers=request_headers)
    root = ElementTree.fromstring(response.content)
    related_isbns = []
    for node in root.findall('isbn'):
        related_isbns.append(node.text)
    return related_isbns
def merge_works(w1, w2):
    """will merge the second work (w2) into the first (w1)

    Every edition and campaign of w2 is re-pointed at w1, every wishlist
    containing w2 gets w1 in its place, and finally w2 is deleted.
    """
    # w2 is the work that goes away, so it comes first in the message
    logger.info("merging work %s into %s", w2, w1)

    for edition in w2.editions.all():
        edition.work = w1
        edition.save()

    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()

    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        wishlist.works.remove(w2)
        wishlist.works.add(w1)

    w2.delete()
def _get_json(url, params=None):
    """GET a Google Books API url and return the decoded JSON response.

    url is the endpoint to request and params is an optional dict of query
    parameters. The configured Google Books API key is added to the query
    that is sent; the dict passed in by the caller is left untouched.

    Returns the parsed JSON body on an HTTP 200 response. Raises
    LookupFailure for any other status code.
    """
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}

    # work on copies: a mutable default (or the caller's own dict) must not
    # pick up the API key as a side effect of this call
    caller_params = dict(params) if params else {}
    query = dict(caller_params)
    query['key'] = settings.GOOGLE_BOOKS_API_KEY

    response = requests.get(url, params=query, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)

    logger.error("unexpected HTTP response: %s", response)
    # report only the caller's parameters so the API key stays out of
    # exception messages and the logs they end up in
    raise LookupFailure("GET failed: url=%s and params=%s" % (url, caller_params))
class LookupFailure(Exception):
    """Raised when a request to the Google Books API does not return
    an HTTP 200 response.
    """