regluit/core/doab.py

import logging
import json
from itertools import islice

import requests

from django.db.models import (Q, F)

from django.core.files.storage import default_storage
from django.core.files.base import ContentFile

import regluit
from regluit.core import models
from regluit.core import bookloader
from regluit.core.bookloader import add_by_isbn

logger = logging.getLogger(__name__)

def store_doab_cover(doab_id, redo=False):

    """
    returns tuple: 1) cover URL, 2) whether newly created (boolean)
    """

    cover_file_name= '/doab/%s/cover' % (doab_id)

    # if we don't want to redo and the cover exists, return the URL of the cover

    if not redo and default_storage.exists(cover_file_name):
        return (default_storage.url(cover_file_name), False)

    # download cover image to cover_file
    url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
    try:
        r = requests.get(url)
        cover_file = ContentFile(r.content)
        cover_file.content_type = r.headers.get('content-type', '')

        path = default_storage.save(cover_file_name, cover_file)
        return (default_storage.url(cover_file_name), True)
    except Exception, e:
        # if there is a problem, return None for cover URL
        return (None, False)

def update_cover_doab(doab_id, store_cover=True):
    """
    update the cover url for work with doab_id
    if store_cover is True, use the cover from our own storage
    """
    work = models.Identifier.objects.get(type='doab', value=doab_id).work
    edition = work.preferred_edition

    if store_cover:
        (cover_url, new_cover) = store_doab_cover(doab_id)
    else:
        cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)

    if cover_url is not None:
        edition.cover_image = cover_url
        edition.save()
        return cover_url
    else:
        return None

def attach_more_doab_metadata(ebook, description, subjects,
                              publication_date, publisher_name=None):

    """
    for given ebook, attach description, subjects, publication date to
    corresponding Edition and Work
    """
    # if edition doesn't have a publication date, update it
    edition = ebook.edition
    edition_to_save = False

    if not edition.publication_date:
        edition.publication_date = publication_date
        edition_to_save = True

    # if edition.publisher_name is empty, set it
    if not edition.publisher_name:
        edition.set_publisher(publisher_name)

    if edition_to_save:
        edition.save()

    # attach description to work if it's not empty
    work = edition.work
    if not work.description:
        work.description = description
        work.save()

    # update subjects
    work.subjects.add(*[models.Subject.objects.get_or_create(name=s)[0] for s in subjects])

    return ebook

def load_doab_edition(title, doab_id, seed_isbn, url, format, rights,
                      language, isbns,
                      provider, **kwargs):

    """
    load a record from doabooks.org represented by input parameters and return an ebook
    """
    from regluit.core import tasks

    # check to see whether the Edition hasn't already been loaded first
    # search by url
    ebooks = models.Ebook.objects.filter(url=url)

    # 1 match
    # > 1 matches
    # 0 match

    # simplest case -- if match (1 or more), we could check whether any
    # ebook.edition.work has a doab id matching given doab_id

    # put a migration to force Ebook.url to be unique id

    # if yes, then return one of the Edition(s) whose work is doab_id
    # if no, then

    if len(ebooks) > 1:
        raise Exception("There is more than one Ebook matching url {0}".format(url))
    elif len(ebooks) == 1:
        ebook = ebooks[0]
        doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
                                               work=ebook.edition.work)
        # update the cover id
        cover_url = update_cover_doab(doab_id)

        # attach more metadata
        attach_more_doab_metadata(ebook,
                                  description=kwargs.get('description'),
                                  subjects=kwargs.get('subject'),
                                  publication_date=kwargs.get('date'),
                                  publisher_name=kwargs.get('publisher'))

        return ebook

    # remaining case --> need to create a new Ebook
    assert len(ebooks) == 0

    # make sure we have isbns to work with before creating ebook
    if len(isbns) == 0:
        return None

    ebook = models.Ebook()
    ebook.format = format
    ebook.provider = provider
    ebook.url =  url
    ebook.rights = rights

    # we still need to find the right Edition/Work to tie Ebook to...

    # look for the Edition with which to associate ebook.
    # loop through the isbns to see whether we get one that is not None

    for isbn in isbns:
        edition = bookloader.add_by_isbn(isbn)
        if edition is not None: break

    if edition is not None:
        # if this is a new edition, then add related editions asynchronously
        if getattr(edition,'new', False):
            tasks.populate_edition.delay(edition.isbn_13)

        # QUESTION:  Is this good enough?
        # what's going to happen to edition.work if there's merging
        doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
                                work=edition.work)

    # we need to create Edition(s) de novo
    else:
        # if there is a Work with doab_id already, attach any new Edition(s)
        try:
            work = models.Identifier.objects.get(type='doab',value=doab_id).work
        except models.Identifier.DoesNotExist:
            work = models.Work(language=language,title=title)
            work.save()
            doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
                                               work=work)


        # create Edition(s) for each of the isbn from the input info
        editions = []
        for isbn in isbns:
            edition = models.Edition(title=title, work=work)
            edition.save()

            isbn_id = models.Identifier.get_or_add(type='isbn',value=isbn,work=work)

            editions.append(edition)

        # if work has any ebooks already, attach the ebook to the corresponding edition
        # otherwise pick the first one
        # pick the first edition as the one to tie ebook to
        editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
                                                      Q(ebooks__isnull=False)).distinct()
        if editions_with_ebooks:
            edition = editions_with_ebooks[0]
        else:
            edition = editions[0]

    # make the edition the selected_edition of the work
    work = edition.work
    work.selected_edition = edition
    work.save()

    # tie the edition to ebook
    ebook.edition = edition
    ebook.save()

    # update the cover id (could be done separately)
    cover_url = update_cover_doab(doab_id)

    # attach more metadata
    attach_more_doab_metadata(ebook,
                              description=kwargs.get('description'),
                              subjects=kwargs.get('subject'),
                              publication_date=kwargs.get('date'),
                              publisher_name=kwargs.get('publisher'))
    return ebook


def load_doab_records(fname, limit=None, async=True):

    from regluit.core import (doab, tasks)
    success_count = 0

    records = json.load(open(fname))

    for (i, book) in enumerate(islice(records,limit)):
        d = dict(book)
        if d['format'] == 'pdf':
            try:
                if async:
                    task_id = tasks.load_doab_edition.delay(**dict(book))

                    ct = models.CeleryTask()
                    ct.task_id = task_id
                    ct.function_name = "load_doab_edition"
                    ct.user = None
                    ct.description = "Loading DOAB %s " % (dict(book)['doab_id'])
                    ct.save()

                else:
                    edition = load_doab_edition(**dict(book))
                success_count += 1
            except Exception, e:
                logger.warning(e)

    logger.info("Number of books successfully uploaded: " + str(success_count))