regluit/core/bookloader.py

import json
import logging

import requests
from xml.etree import ElementTree
from itertools import izip, islice

from datetime import timedelta

from django.db.models import Q
from django.conf import settings
from django.db import IntegrityError
from django.contrib.comments.models import Comment


import regluit
from regluit.core import models
from regluit.utils.localdatetime import now
import regluit.core.isbn


logger = logging.getLogger(__name__)


def add_by_oclc(isbn, work=None):
    # this is indirection in case we have a data source other than google
    return add_by_oclc_from_google(isbn)


def add_by_oclc_from_google(oclc):
    if oclc:
        logger.info("adding book by oclc %s" , oclc)
    else:
        return None
    try:
        return models.Identifier.objects.get(type='oclc', value=oclc).edition
    except:
        url = "https://www.googleapis.com/books/v1/volumes"
        try:
            results = _get_json(url, {"q": '"OCLC%s"' % oclc})
        except LookupFailure, e:
            logger.exception("lookup failure for %s", oclc)
            return None
        if not results.has_key('items') or len(results['items']) == 0:
            logger.warn("no google hits for %s" , oclc)
            return None

        try:
            e = add_by_googlebooks_id(results['items'][0]['id'], results=results['items'][0])
            models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
            return e
        except LookupFailure, e:
            logger.exception("failed to add edition for %s", oclc)
        except IntegrityError, e:
            logger.exception("google books data for %s didn't fit our db", oclc)
        return None

def add_by_isbn(isbn, work=None):
    if not isbn:
        return None
    try:
        e = add_by_isbn_from_google(isbn, work=work)
    except LookupFailure:
        logger.exception("failed google lookup for %s", isbn)
        # try again some other time
        return None
    if e:
        return e

    logger.info("null came back from add_by_isbn_from_google: %s", isbn)

    if not work or not work.title:
        return None

    # if there's a work with a title, we want to create stub editions and
    # works, even if google doesn't know about it # but if it's not valid,
    # forget it!

    try:
        isbn = regluit.core.isbn.ISBN(isbn)
    except:
        logger.exception("invalid isbn: %s", isbn)
        return None
    if not isbn.valid:
        return None
    isbn = isbn.to_string()

    # we don't know the language  ->'xx'
    w = models.Work(title=work.title, language='xx')
    w.save()
    e = models.Edition(title=work.title,work=w)
    e.save()
    e.new = True
    models.Identifier(type='isbn', value=isbn, work=w, edition=e).save()
    return e

def get_google_isbn_results(isbn):
    url = "https://www.googleapis.com/books/v1/volumes"
    try:
        results = _get_json(url, {"q": "isbn:%s" % isbn})
    except LookupFailure:
        logger.exception("lookup failure for %s", isbn)
        return None
    if not results.has_key('items') or len(results['items']) == 0:
        logger.warn("no google hits for %s" , isbn)
        return None
    else:
        return results

def add_ebooks(item, edition):
    access_info = item.get('accessInfo')
    if access_info:
        edition.public_domain = access_info.get('publicDomain', None)
        epub = access_info.get('epub')
        if epub and epub.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='epub',
                                 url=epub.get('downloadLink'),
                                 provider='Google Books')
            try:
                ebook.save()
            except IntegrityError:
                pass


        pdf = access_info.get('pdf')
        if pdf and pdf.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='pdf',
                                 url=pdf.get('downloadLink', None),
                                 provider='Google Books')
            try:
                ebook.save()
            except IntegrityError:
                pass


def update_edition(edition):
    """
    attempt to update data associated with input edition and return that updated edition
    """

    # if there is no ISBN associated with edition, just return the input edition
    try:
        isbn=edition.identifiers.filter(type='isbn')[0].value
    except models.Identifier.DoesNotExist:
        return edition

    # do a Google Books lookup on the isbn associated with the edition (there should be either 0 or 1 isbns associated
    # with an edition because of integrity constraint in Identifier)

    # if we get some data about this isbn back from Google, update the edition data accordingly
    results=get_google_isbn_results(isbn)
    if not results:
        return edition
    item=results['items'][0]
    googlebooks_id=item['id']
    d = item['volumeInfo']
    if d.has_key('title'):
        title = d['title']
    else:
        title=''
    if len(title)==0:
        # need a title to make an edition record; some crap records in GB. use title from parent if available
        title=edition.work.title

    # check for language change
    language = d['language']
    # don't track variants in main language (e.g., traditional vs simplified Chinese)
    if len(language)>2:
        language= language[0:2]

    # if the language of the edition no longer matches that of the parent work, attach edition to the
    if edition.work.language != language:
        logger.info("reconnecting %s since it is %s instead of %s" %(googlebooks_id, language, edition.work.language))
        old_work=edition.work

        new_work = models.Work(title=title, language=language)
        new_work.save()
        edition.work = new_work
        edition.save()
        for identifier in edition.identifiers.all():
            logger.info("moving identifier %s" % identifier.value)
            identifier.work=new_work
            identifier.save()
        if old_work.editions.count()==0:
            #a dangling work; make sure nothing else is attached!
            merge_works(new_work,old_work)

    # update the edition
    edition.title = title
    edition.publisher = d.get('publisher')
    edition.publication_date = d.get('publishedDate', '')
    edition.save()

    # create identifier if needed
    models.Identifier.get_or_add(type='goog',value=googlebooks_id,edition=edition,work=edition.work)

    for a in d.get('authors', []):
        a, created = models.Author.objects.get_or_create(name=a)
        a.editions.add(edition)

    add_ebooks(item, edition)

    return edition


def add_by_isbn_from_google(isbn, work=None):
    """add a book to the UnglueIt database from google based on ISBN. The work parameter
    is optional, and if not supplied the edition will be associated with
    a stub work.
    """
    if not isbn:
        return None
    if len(isbn)==10:
        isbn = regluit.core.isbn.convert_10_to_13(isbn)


    # check if we already have this isbn
    edition = get_edition_by_id(type='isbn',value=isbn)
    if edition:
        edition.new = False
        return edition

    logger.info("adding new book by isbn %s", isbn)
    results=get_google_isbn_results(isbn)
    if results:
        try:
            return add_by_googlebooks_id(results['items'][0]['id'], work=work, results=results['items'][0], isbn=isbn)
        except LookupFailure, e:
            logger.exception("failed to add edition for %s", isbn)
        except IntegrityError, e:
            logger.exception("google books data for %s didn't fit our db", isbn)
        return None
    else:
        return None

def get_work_by_id(type,value):
    if value:
        try:
            return models.Identifier.objects.get(type=type,value=value).work
        except models.Identifier.DoesNotExist:
            return None

def get_edition_by_id(type,value):
    if value:
        try:
            return models.Identifier.objects.get(type=type,value=value).edition
        except models.Identifier.DoesNotExist:
            return None


def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
    """add a book to the UnglueIt database based on the GoogleBooks ID. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work. isbn can be passed because sometimes passed data won't include it

    """
    # don't ping google again if we already know about the edition
    try:
        edition = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        edition.new = False
        return edition
    except models.Identifier.DoesNotExist:
        pass

    # if google has been queried by caller, don't call again
    if results:
        item =results
    else:
        logger.info("loading metadata from google for %s", googlebooks_id)
        url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
        item  = _get_json(url)
    d = item['volumeInfo']

    if d.has_key('title'):
        title = d['title']
    else:
        title=''
    if len(title)==0:
        # need a title to make an edition record; some crap records in GB. use title from parent if available
        if work:
            title=work.title
        else:
            return None

    # don't add the edition to a work with a different language
    # https://www.pivotaltracker.com/story/show/17234433
    language = d['language']
    if len(language)>2:
        language= language[0:2]
    if work and work.language != language:
        logger.info("not connecting %s since it is %s instead of %s" %
                (googlebooks_id, language, work.language))
        work = None
    # isbn = None
    if not isbn:
        for i in d.get('industryIdentifiers', []):
            if i['type'] == 'ISBN_10' and not isbn:
                isbn = regluit.core.isbn.convert_10_to_13(i['identifier'])
            elif i['type'] == 'ISBN_13':
                isbn = i['identifier']

    # now check to see if there's an existing Work
    if work:
        work.new = False
    if isbn and not work:
        work = get_work_by_id(type='isbn',value=isbn)
        if work:
            work.new = False
    if not work:
        work = models.Work.objects.create(title=title, language=language)
        work.new = True
        work.save()


    # going off to google can take some time, so we want to make sure this edition has not
    # been created in another thread while we were waiting
    try:
        e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        e.new = False
        # whoa nellie, somebody else created an edition while we were working.
        if work.new:
            work.delete()
        return e
    except models.Identifier.DoesNotExist:
        pass


    # because this is a new google id, we have to create a new edition
    e = models.Edition(work=work)
    e.title = title
    e.publisher = d.get('publisher')
    e.publication_date = d.get('publishedDate', '')
    e.save()
    e.new = True

    # create identifier where needed
    models.Identifier(type='goog',value=googlebooks_id,edition=e,work=work).save()
    if isbn:
        models.Identifier.get_or_add(type='isbn',value=isbn,edition=e,work=work)

    for a in d.get('authors', []):
        a, created = models.Author.objects.get_or_create(name=a)
        a.editions.add(e)

    add_ebooks(item, e)

    return e


def add_related(isbn):
    """add all books related to a particular ISBN to the UnglueIt database.
    The initial seed ISBN will be added if it's not already there.
    """
    # make sure the seed edition is there
    logger.info("adding related editions for %s", isbn)

    new_editions = []

    edition = add_by_isbn(isbn)
    if edition is None:
        return new_editions

    # this is the work everything will hang off
    work = edition.work
    other_editions = {}
    for other_isbn in thingisbn(isbn):
        # 979's come back as 13
        logger.debug("other_isbn: %s", other_isbn)
        if len(other_isbn)==10:
            other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
        related_edition = add_by_isbn(other_isbn, work=work)

        if related_edition:
            related_language = related_edition.work.language
            if edition.work.language == related_language:
                new_editions.append(related_edition)
                if related_edition.work != edition.work:
                    logger.debug("merge_works path 1 %s %s", edition.work.id, related_edition.work.id )
                    merge_works(edition.work, related_edition.work)
            else:
                if other_editions.has_key(related_language):
                    other_editions[related_language].append(related_edition)
                else:
                    other_editions[related_language]=[related_edition]

    # group the other language editions together
    for lang_group in other_editions.itervalues():
        logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work.id) for ed in lang_group])
        if len(lang_group)>1:
            lang_edition = lang_group[0]
            logger.debug("lang_edition.id: %s", lang_edition.id)
            # compute the distinct set of works to merge into lang_edition.work
            works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
            for w in works_to_merge:
                logger.debug("merge_works path 2 %s %s", lang_edition.work.id, w.id )
                merge_works(lang_edition.work, w)

    return new_editions


def thingisbn(isbn):
    """given an ISBN return a list of related edition ISBNs, according to
    Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, which come back as isbn_13')
    """
    logger.info("looking up %s at ThingISBN" , isbn)
    url = "http://www.librarything.com/api/thingISBN/%s" % isbn
    xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
    doc = ElementTree.fromstring(xml)
    return [e.text for e in doc.findall('isbn')]


def merge_works(w1, w2, user=None):
    """will merge the second work (w2) into the first (w1)
    """
    logger.info("merging work %s into %s", w2, w1)
    # don't merge if the works are the same or at least one of the works has no id (for example, when w2 has already been deleted)
    if w1 == w2 or w1.id is None or w2.id is None:
        return

    for identifier in w2.identifiers.all():
        identifier.work = w1
        identifier.save()
    for comment in Comment.objects.for_model(w2):
        comment.object_pk = w1.pk
        comment.save()
    for edition in w2.editions.all():
        edition.work = w1
        edition.save()
    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()
    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        w2source = wishlist.work_source(w2)
        wishlist.remove_work(w2)
        wishlist.add_work(w1, w2source)

    models.WasWork(was=w2.pk, work=w1, user=user).save()
    for ww in models.WasWork.objects.filter(work = w2):
        ww.work = w1
        ww.save()

    w2.delete()


def add_openlibrary(work):
    if work.openlibrary_lookup is not None:
        # don't hit OL if we've visited in the past month or so
        if now()- work.openlibrary_lookup < timedelta(days=30):
             return
    work.openlibrary_lookup = now()
    work.save()

    # find the first ISBN match in OpenLibrary
    logger.info("looking up openlibrary data for work %s", work.id)
    found = False
    e = None # openlibrary edition json
    w = None # openlibrary work json

    # get the 1st openlibrary match by isbn that has an associated work
    url = "http://openlibrary.org/api/books"
    params = {"format": "json", "jscmd": "details"}
    subjects = []
    for edition in work.editions.all():
        isbn_key = "ISBN:%s" % edition.isbn_13
        params['bibkeys'] = isbn_key
        try:
            e = _get_json(url, params, type='ol')
        except LookupFailure:
            logger.exception("OL lookup failed for  %s", isbn_key)
            e = {}
        if e.has_key(isbn_key):
            if e[isbn_key].has_key('details'):
                if e[isbn_key]['details'].has_key('oclc_numbers'):
                    for oclcnum in e[isbn_key]['details']['oclc_numbers']:
                        models.Identifier.get_or_add(type='oclc',value=oclcnum,work=work, edition=edition)
                if e[isbn_key]['details'].has_key('identifiers'):
                    ids = e[isbn_key]['details']['identifiers']
                    if ids.has_key('goodreads'):
                        models.Identifier.get_or_add(type='gdrd',value=ids['goodreads'][0],work=work,edition=edition)
                    if ids.has_key('librarything'):
                        models.Identifier.get_or_add(type='ltwk',value=ids['librarything'][0],work=work)
                    if ids.has_key('google'):
                        models.Identifier.get_or_add(type='goog',value=ids['google'][0],work=work)
                    if ids.has_key('project_gutenberg'):
                        models.Identifier.get_or_add(type='gute',value=ids['project_gutenberg'][0],work=work)
                if e[isbn_key]['details'].has_key('works'):
                    work_key = e[isbn_key]['details']['works'].pop(0)['key']
                    logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
                    models.Identifier.get_or_add(type='olwk',value=work_key,work=work)
                    try:
                        w = _get_json("http://openlibrary.org" + work_key,type='ol')
                        if w.has_key('description'):
                            if not work.description or  len(w['description']) > len(work.description):
                                work.description = w['description']
                                work.save()
                        if w.has_key('subjects') and len(w['subjects']) > len(subjects):
                            subjects = w['subjects']
                    except LookupFailure:
                        logger.exception("OL lookup failed for  %s", work_key)
    if not subjects:
        logger.warn("unable to find work %s at openlibrary", work.id)
        return

    # add the subjects to the Work
    for s in subjects:
        logger.info("adding subject %s to work %s", s, work.id)
        subject, created = models.Subject.objects.get_or_create(name=s)
        work.subjects.add(subject)

    work.save()

    # TODO: add authors here once they are moved from Edition to Work


def _get_json(url, params={}, type='gb'):
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}
    if type == 'gb':
        params['key'] = settings.GOOGLE_BOOKS_API_KEY
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)
    else:
        logger.error("unexpected HTTP response: %s" % response)
        if response.content:
            logger.error("response content: %s" % response.content)
        raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))


def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url, format, license, lang, publication_date):

    # let's start with instantiating the relevant Work and Edition if they don't already exist

    try:
        work = models.Identifier.objects.get(type='olwk',value=ol_work_id).work
    except models.Identifier.DoesNotExist: # try to find an Edition with the seed_isbn and use that work to hang off of
        sister_edition = add_by_isbn(seed_isbn)
        if sister_edition.new:
            # add related editions asynchronously
            regluit.core.tasks.populate_edition.delay(sister_edition.isbn_13)
        work = sister_edition.work
        # attach the olwk identifier to this work if it's not none.
        if ol_work_id is not None:
            work_id = models.Identifier.get_or_add(type='olwk',value=ol_work_id, work=work)

    # Now pull out any existing Gutenberg editions tied to the work with the proper Gutenberg ID
    try:
        edition = models.Identifier.objects.get( type='gtbg', value=gutenberg_etext_id ).edition
    except models.Identifier.DoesNotExist:
        edition = models.Edition()
        edition.title = title
        edition.work = work

        edition.save()
        edition_id = models.Identifier.get_or_add(type='gtbg',value=gutenberg_etext_id, edition=edition, work=work)

    # check to see whether the Edition hasn't already been loaded first
    # search by url
    ebooks = models.Ebook.objects.filter(url=url)

    # format: what's the controlled vocab?  -- from Google -- alternative would be mimetype

    if len(ebooks):
        ebook = ebooks[0]
    elif len(ebooks) == 0: # need to create new ebook
        ebook = models.Ebook()

    if len(ebooks) > 1:
        warnings.warn("There is more than one Ebook matching url {0}".format(url))


    ebook.format = format
    ebook.provider = 'Project Gutenberg'
    ebook.url =  url
    ebook.rights = license

    # is an Ebook instantiable without a corresponding Edition? (No, I think)

    ebook.edition = edition
    ebook.save()

    return ebook

def add_missing_isbn_to_editions(max_num=None, confirm=False):
    """For each of the editions with Google Books ids, do a lookup and attach ISBNs.  Set confirm to True to check db changes made correctly"""
    logger.info("Number of editions with Google Books IDs but not ISBNs (before): %d",
        models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count())

    from regluit.experimental import bookdata

    gb = bookdata.GoogleBooks(key=settings.GOOGLE_BOOKS_API_KEY)

    new_isbns = []
    google_id_not_found = []
    no_isbn_found = []
    editions_to_merge = []
    exceptions = []


    for (i, ed) in enumerate(islice(models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn'), max_num)):
        try:
            g_id = ed.identifiers.get(type='goog').value
        except Exception, e:
            # we might get an exception if there is, for example, more than one Google id attached to this Edition
            logger.exception("add_missing_isbn_to_editions for edition.id %s: %s", ed.id, e)
            exceptions.append((ed.id, e))
            continue

        # try to get ISBN from Google Books
        try:
            vol_id = gb.volumeid(g_id)
            if vol_id is None:
                google_id_not_found.append((ed.id, g_id))
                logger.debug("g_id not found: %s", g_id)
            else:
                isbn = vol_id.get('isbn')
                logger.info("g_id, isbn: %s %s", g_id, isbn)
                if isbn is not None:
                    # check to see whether the isbn is actually already in the db but attached to another Edition
                    existing_isbn_ids = models.Identifier.objects.filter(type='isbn', value=isbn)
                    if len(existing_isbn_ids):
                        # don't try to merge editions right now, just note the need to merge
                        ed2 = existing_isbn_ids[0].edition
                        editions_to_merge.append((ed.id, g_id, isbn, ed2.id))
                    else:
                        new_id = models.Identifier(type='isbn', value=isbn, edition=ed, work=ed.work)
                        new_id.save()
                        new_isbns.append((ed.id, g_id, isbn))
                else:
                    no_isbn_found.append((ed.id, g_id, None))
        except Exception, e:
            logger.exception("add_missing_isbn_to_editions for edition.id %s: %s", ed.id, e)
            exceptions.append((ed.id, g_id, None, e))

    logger.info("Number of editions with Google Books IDs but not ISBNs (after): %d",
        models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count())

    ok = None

    if confirm:
        ok = True
        for (ed_id, g_id, isbn) in new_isbns:
            if models.Edition.objects.get(id=ed_id).identifiers.get(type='isbn').value != isbn:
                ok = False
                break

    return {
        'new_isbns': new_isbns,
        'no_isbn_found': no_isbn_found,
        'editions_to_merge': editions_to_merge,
        'exceptions': exceptions,
        'google_id_not_found': google_id_not_found,
        'confirm': ok
    }


class LookupFailure(Exception):
    pass