regluit/core/bookloader.py

import json
import logging

from xml.etree import ElementTree
import requests

from django.conf import settings
from django.db.models import Q
from django.db import IntegrityError

from regluit.core import models

logger = logging.getLogger(__name__)


def add_by_isbn(isbn, work=None, add_related=True):
    """Add a book to the UnglueIt database based on ISBN.

    An edition whose isbn_10 or isbn_13 already equals ``isbn`` is returned
    without contacting Google Books. Otherwise Google Books is searched for
    the ISBN and the first hit is loaded with add_by_googlebooks_id().

    Parameters:
        isbn: the ISBN to look up.
        work: optional work the newly loaded edition is attached to; if not
            supplied the edition will be associated with a stub work.
        add_related: kept so existing callers keep working; this function
            does not currently use it.

    Returns the edition, or None when Google Books has no hit for the ISBN
    or when loading the hit fails with LookupFailure or IntegrityError (both
    are logged). An error raised by the initial ISBN search itself is not
    trapped here and propagates to the caller.
    """
    # save a lookup to google if we already have this isbn
    has_isbn = Q(isbn_10=isbn) | Q(isbn_13=isbn)
    for edition in models.Edition.objects.filter(has_isbn):
        return edition

    url = "https://www.googleapis.com/books/v1/volumes"
    results = _get_json(url, {"q": "isbn:%s" % isbn})

    # .get() covers both "no 'items' key" and "empty list", and unlike
    # dict.has_key() it also works on Python 3
    if not results.get('items'):
        logger.warning("no google hits for %s", isbn)
        return None

    try:
        return add_by_googlebooks_id(results['items'][0]['id'], work)
    except LookupFailure:
        logger.exception("failed to add edition for %s", isbn)
    except IntegrityError:
        logger.exception("edition data for %s does not match db schema", isbn)
    return None


def add_by_googlebooks_id(googlebooks_id, work=None):
    """Add a book to the UnglueIt database based on the GoogleBooks ID.

    If an edition with this GoogleBooks ID already exists it is returned
    as-is and Google is not contacted. Otherwise the volume is fetched and
    its title, description, publisher, publication date, language, ISBNs,
    authors and categories (stored as subjects) are recorded.

    Parameters:
        googlebooks_id: the Google Books volume id.
        work: optional work to attach the edition to; if not supplied the
            edition will be associated with a new stub work carrying the
            edition's title.

    Returns the edition.

    Raises whatever the metadata fetch raises (LookupFailure for a non-200
    response, KeyError when the response has no 'volumeInfo'). In that case
    the edition row created by this call is removed again before the error
    propagates.
    """
    # don't ping google again if we already know about the edition
    e, created = models.Edition.objects.get_or_create(googlebooks_id=googlebooks_id)
    if not created:
        return e

    url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
    try:
        d = _get_json(url)['volumeInfo']
    except Exception:
        # The row created above is still an empty stub. Leaving it behind
        # would make every later call return it (created is False) without
        # ever loading the metadata, so remove it and let the error through.
        e.delete()
        raise

    e.title = d.get('title')
    e.description = d.get('description')
    e.publisher = d.get('publisher')
    e.publication_date = d.get('publishedDate')
    e.language = d.get('language')

    for identifier in d.get('industryIdentifiers', []):
        if identifier['type'] == 'ISBN_10':
            e.isbn_10 = identifier['identifier']
        elif identifier['type'] == 'ISBN_13':
            e.isbn_13 = identifier['identifier']

    # Persist the scalar fields explicitly instead of depending on a later
    # related-manager add() to write them as a side effect.
    e.save()

    for author_name in d.get('authors', []):
        author = models.Author.objects.get_or_create(name=author_name)[0]
        author.editions.add(e)

    for subject_name in d.get('categories', []):
        subject = models.Subject.objects.get_or_create(name=subject_name)[0]
        subject.editions.add(e)

    # if we know what work to add the edition to do it
    if work:
        work.editions.add(e)

    # otherwise we need to create a stub work
    else:
        w = models.Work.objects.create(title=e.title)
        w.editions.add(e)

    return e


def add_related(isbn):
    """Add all books related to a particular ISBN to the UnglueIt database.

    The initial seed ISBN will be added if it's not already there. Every
    ISBN that thingisbn() reports for the seed is then added against the
    seed edition's work; a related edition that turns out to belong to a
    different work has that work merged into the seed's work.

    Parameters:
        isbn: the seed ISBN.

    Returns None. If the seed edition cannot be added (add_by_isbn()
    returned None) a warning is logged and nothing else is done.
    """
    # make sure the seed edition is there
    edition = add_by_isbn(isbn)
    if edition is None:
        # add_by_isbn() returns None when the lookup finds or loads nothing;
        # without a seed edition there is no work to hang anything off
        logger.warning("no edition for %s, not adding related editions", isbn)
        return

    # this is the work everything will hang off
    work = edition.work

    for other_isbn in thingisbn(isbn):
        related_edition = add_by_isbn(other_isbn, work)
        if related_edition and related_edition.work != edition.work:
            merge_works(edition.work, related_edition.work)


def thingisbn(isbn):
    """Look up the editions Library Thing considers related to an ISBN.

    Fetches the thingISBN document for ``isbn`` (sending the configured
    User-Agent), parses it as XML and returns a list with the text of each
    'isbn' element found directly under the document root.
    """
    endpoint = "http://www.librarything.com/api/thingISBN/%s" % isbn
    request_headers = {"User-Agent": settings.USER_AGENT}
    response = requests.get(endpoint, headers=request_headers)
    root = ElementTree.fromstring(response.content)

    related_isbns = []
    for node in root.findall('isbn'):
        related_isbns.append(node.text)
    return related_isbns


def merge_works(w1, w2):
    """Merge the second work (w2) into the first (w1).

    Every edition and campaign of w2 is re-pointed at w1 and saved, every
    wishlist containing w2 gets w1 in its place, and finally w2 is deleted.

    Parameters:
        w1: the work that survives the merge.
        w2: the work that is folded into w1 and then deleted.

    Returns None. When w1 and w2 compare equal nothing is done: merging a
    work into itself would otherwise end by deleting that very work.
    """
    if w1 == w2:
        return

    for edition in w2.editions.all():
        edition.work = w1
        edition.save()
    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()
    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        wishlist.works.remove(w2)
        wishlist.works.add(w1)
    w2.delete()


def _get_json(url, params=None):
    """GET ``url`` from the Google Books API and return the decoded JSON.

    Parameters:
        url: the URL to request.
        params: optional dict of query parameters. It is not modified; the
            API key from settings is added to a private copy.

    Returns the parsed JSON body when the response status is 200.

    Raises LookupFailure for any other status, after logging the response.
    The exception message lists the caller's params only, so the API key
    does not end up in logs or tracebacks.
    """
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}

    # Work on a copy: writing the key into the argument would leak it into
    # the caller's dict, and into a shared default dict across calls.
    query = dict(params or {})
    query['key'] = settings.GOOGLE_BOOKS_API_KEY

    response = requests.get(url, params=query, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)

    logger.error("unexpected HTTP response: %s", response)
    raise LookupFailure("GET failed: url=%s and params=%s" % (url, params or {}))


class LookupFailure(Exception):
    """Raised when a remote lookup does not come back with HTTP 200."""
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00			`import json`
			`import logging`

added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00			`from xml.etree import ElementTree`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00			`import requests`
added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00			`from django.conf import settings`
added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00			`from django.db.models import Q`
need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00			`from django.db import IntegrityError`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
			`from regluit.core import models`

handle duplicates using openlibrary ids for edition, work and author 2011-09-10 11:36:38 +00:00			`logger = logging.getLogger(__name__)`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00
added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00			`def add_by_isbn(isbn, work=None, add_related=True):`
added some comments 2011-10-14 04:12:20 +00:00			`"""add a book to the UnglueIt database based on ISBN. The work parameter`
			`is optional, and if not supplied the edition will be associated with`
			`a stub work.`
			`"""`
need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00			`# save a lookup to google if we already have this isbn`
added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00			`has_isbn = Q(isbn_10=isbn) \| Q(isbn_13=isbn)`
			`for edition in models.Edition.objects.filter(has_isbn):`
			`return edition`

moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`url = "https://www.googleapis.com/books/v1/volumes"`
			`results = _get_json(url, {"q": "isbn:%s" % isbn})`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
make sure there are some hits when adding a book by isbn 2011-10-10 21:26:38 +00:00			`if not results.has_key('items') or len(results['items']) == 0:`
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`logger.warn("no google hits for %s" % isbn)`
			`return None`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00
need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00			`try:`
			`return add_by_googlebooks_id(results['items'][0]['id'], work)`
			`except LookupFailure, e:`
			`logger.exception("failed to add edition for %s", isbn)`
			`except IntegrityError, e:`
			`logger.exception("edition data for %s does not match db schema", isbn)`
			`return None`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00

got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`def add_by_googlebooks_id(googlebooks_id, work=None):`
added some comments 2011-10-14 04:12:20 +00:00			`"""add a book to the UnglueIt database based on the GoogleBooks ID. The`
			`work parameter is optional, and if not supplied the edition will be`
			`associated with a stub work.`
			`"""`
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`# don't ping google again if we already know about the edition`
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`e, created = models.Edition.objects.get_or_create(googlebooks_id=googlebooks_id)`
			`if not created:`
			`return e`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id`
			`d = _get_json(url)['volumeInfo']`

moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`e.title = d.get('title')`
			`e.description = d.get('description')`
			`e.publisher = d.get('publisher')`
			`e.publication_date = d.get('publishedDate')`
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`e.language = d.get('language')`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`for i in d.get('industryIdentifiers', []):`
			`if i['type'] == 'ISBN_10':`
			`e.isbn_10 = i['identifier']`
			`elif i['type'] == 'ISBN_13':`
			`e.isbn_13 = i['identifier']`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`for a in d.get('authors', []):`
			`a, created = models.Author.objects.get_or_create(name=a)`
			`a.editions.add(e)`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`for s in d.get('categories', []):`
			`s, created = models.Subject.objects.get_or_create(name=s)`
			`s.editions.add(e)`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`# if we know what work to add the edition to do it`
			`if work:`
			`work.editions.add(e)`

			`# otherwise we need to create a stub work`
			`else:`
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`w = models.Work.objects.create(title=e.title)`
			`w.editions.add(e)`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`return e`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00

added some comments 2011-10-14 04:12:20 +00:00			`def add_related(isbn):`
			`"""add all books related to a particular ISBN to the UnglueIt database.`
			`The initial seed ISBN will be added if it's not already there.`
			`"""`
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00			`# make sure the seed edition is there`
			`edition = add_by_isbn(isbn)`

			`# this is the work everything will hang off`
			`work = edition.work`

			`for other_isbn in thingisbn(isbn):`
need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00			`related_edition = add_by_isbn(other_isbn, work)`
			`if related_edition and related_edition.work != edition.work:`
			`merge_works(edition.work, related_edition.work)`
got bookloader.add_related working w/ some tests. also added Edition.language because it looks like it will be useful to know. 2011-10-14 04:02:19 +00:00
added some comments 2011-10-14 04:12:20 +00:00
added bookloader.thingisbn and a test 2011-10-13 01:59:46 +00:00			`def thingisbn(isbn):`
added some comments 2011-10-14 04:12:20 +00:00			`"""given an ISBN return a list of related edition ISBNs, according to`
			`Library Thing.`
			`"""`
added bookloader.thingisbn and a test 2011-10-13 01:59:46 +00:00			`url = "http://www.librarything.com/api/thingISBN/%s" % isbn`
			`xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content`
			`doc = ElementTree.fromstring(xml)`
			`return [e.text for e in doc.findall('isbn')]`

need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00
added merge_works function for merging the one work into another when adding related editions based on ThingISBN. also added a test that makes sure related campaigns and wishlists are appropriately updated. 2011-10-19 03:00:07 +00:00			`def merge_works(w1, w2):`
			`"""will merge the second work (w2) into the first (w1)`
			`"""`
			`for edition in w2.editions.all():`
			`edition.work = w1`
			`edition.save()`
			`for campaign in w2.campaigns.all():`
			`campaign.work = w1`
			`campaign.save()`
			`for wishlist in models.Wishlist.objects.filter(works__in=[w2]):`
			`wishlist.works.remove(w2)`
			`wishlist.works.add(w1)`
			`w2.delete()`
added some comments 2011-10-14 04:12:20 +00:00
need to trap errors when looking up a LibraryThing ISBN in GoogleBooks and getting no hit 2011-10-19 03:45:02 +00:00
handle duplicates using openlibrary ids for edition, work and author 2011-09-10 11:36:38 +00:00			`def _get_json(url, params={}):`
added X-Forwarded-For for gluejar.com for the moment 2011-10-10 19:57:12 +00:00			`# TODO: should X-Forwarded-For change based on the request from client?`
			`headers = {'User-Agent': settings.USER_AGENT,`
			`'Accept': 'application/json',`
			`'X-Forwarded-For': '69.174.114.214'}`
moved to using googlebooks api for data. required moving authors and subjects to be associated to editions, since we google books does not have a notion of a work 2011-10-10 16:57:10 +00:00			`params['key'] = settings.GOOGLE_BOOKS_API_KEY`
lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00			`response = requests.get(url, params=params, headers=headers)`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00			`if response.status_code == 200:`
			`return json.loads(response.content)`
			`else:`
handle duplicates using openlibrary ids for edition, work and author 2011-09-10 11:36:38 +00:00			`logger.error("unexpected HTTP response: %s" % response)`
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00			`raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))`

lots of changes to support dynamically loading from openlibrary based on an isbn...which still needs some work to prevent duplication 2011-09-09 05:38:28 +00:00
added core edition googlebooks lookup and test, plus the start of a load_books management command 2011-09-07 09:34:03 +00:00			`class LookupFailure(Exception):`
			`pass`