regluit/core/doab.py

import logging
import json
from itertools import islice

import requests

from django.db.models import (Q, F)

from django.core.files.storage import default_storage
from django.core.files.base import ContentFile

import regluit
from regluit.core import models
from regluit.core import bookloader
from regluit.core.bookloader import add_by_isbn

logger = logging.getLogger(__name__)

def store_doab_cover(doab_id, redo=False):
    """
    Download the DOAB cover image for doab_id and keep it in default_storage
    under '/doab/<doab_id>/cover'.

    doab_id -- DOAB record id, used both in the download URL and the storage name
    redo    -- if true, download again even when a stored cover already exists

    returns tuple: 1) cover URL (None if the cover could not be downloaded or
    stored), 2) whether newly created (boolean)
    """

    cover_file_name = '/doab/%s/cover' % (doab_id)

    # if we don't want to redo and the cover exists, return the URL of the cover
    if not redo and default_storage.exists(cover_file_name):
        return (default_storage.url(cover_file_name), False)

    # download cover image to cover_file
    url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
    try:
        # a timeout keeps a stalled connection from blocking the caller forever
        r = requests.get(url, timeout=30)
        # an HTTP error page must not be stored as if it were the cover image
        r.raise_for_status()
        cover_file = ContentFile(r.content)
        cover_file.content_type = r.headers.get('content-type', '')

        # NOTE(review): when redo is true and a cover already exists, some
        # storage backends may save under a different name instead of
        # overwriting -- confirm how the configured backend behaves
        default_storage.save(cover_file_name, cover_file)
        return (default_storage.url(cover_file_name), True)
    except Exception as e:
        # best effort: report the problem and return None for cover URL
        logger.warning("could not store cover for DOAB id %s: %s", doab_id, e)
        return (None, False)

def update_cover_doab(doab_id, store_cover=True):
    """
    set the cover image of the preferred edition of the work identified by
    doab_id

    if store_cover is True, the cover URL comes from store_doab_cover (our own
    storage); otherwise the doabooks.org cover URL is used directly

    returns the cover URL that was set, or None when no cover URL was available
    (the edition is left untouched in that case)
    """
    # look up the work first so an unknown doab_id fails before any download
    doab_identifier = models.Identifier.objects.get(type='doab', value=doab_id)
    target_edition = doab_identifier.work.preferred_edition

    if not store_cover:
        new_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
    else:
        new_url, _newly_stored = store_doab_cover(doab_id)

    if new_url is None:
        return None

    target_edition.cover_image = new_url
    target_edition.save()
    return new_url
    
def attach_more_doab_metadata(ebook, description, subjects,
                              publication_date, publisher_name=None):

    """
    for given ebook, attach description, subjects, publication date to
    corresponding Edition and Work

    Values already present on the Edition/Work are kept; only empty fields
    are filled in.

    ebook            -- ebook whose edition (and that edition's work) is updated
    description      -- description for the work; ignored when empty
    subjects         -- iterable of subject names; may be None or empty
    publication_date -- publication date for the edition
    publisher_name   -- passed to edition.set_publisher when the edition has
                        no publisher name yet

    returns the ebook that was passed in
    """
    # if edition doesn't have a publication date, update it
    edition = ebook.edition
    edition_to_save = False

    if not edition.publication_date:
        edition.publication_date = publication_date
        edition_to_save = True

    # if edition.publisher_name is empty, set it
    # NOTE(review): presumably set_publisher persists the change itself, since
    # edition_to_save is not raised here -- confirm against Edition.set_publisher
    if not edition.publisher_name:
        edition.set_publisher(publisher_name)

    if edition_to_save:
        edition.save()

    # attach description to work if the work has none yet; an empty incoming
    # description is skipped so it neither overwrites the field with None nor
    # triggers a pointless save
    work = edition.work
    if description and not work.description:
        work.description = description
        work.save()

    # update subjects; records without subjects arrive here as None
    for s in subjects or ():
        if bookloader.valid_subject(s):
            work.subjects.add(models.Subject.objects.get_or_create(name=s)[0])

    return ebook

def load_doab_edition(title, doab_id, seed_isbn, url, format, rights,
                      language, isbns,
                      provider, **kwargs):

    """
    load a record from doabooks.org represented by input parameters and return an ebook

    title, language -- used when a new Work/Edition has to be created
    doab_id         -- DOAB id; stored as a 'doab' Identifier on the work
    seed_isbn       -- accepted for interface compatibility; not used here
    url             -- url of the ebook file; used to detect an already loaded Ebook
    format, rights, provider -- copied onto a newly created Ebook
    isbns           -- ISBNs of the book; tried in order to find an Edition
    kwargs          -- optional 'description', 'subject', 'date' and 'publisher'
                       values handed on to attach_more_doab_metadata

    returns the Ebook, or None when no Ebook exists for url and no isbns were given
    raises Exception when more than one Ebook already matches url
    """
    # imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import with tasks -- confirm
    from regluit.core import tasks

    # optional metadata shared by both exit paths; a record without subjects
    # is passed on as an empty list so the subject loop has something to iterate
    metadata = dict(description=kwargs.get('description'),
                    subjects=kwargs.get('subject') or [],
                    publication_date=kwargs.get('date'),
                    publisher_name=kwargs.get('publisher'))

    # check to see whether the Edition hasn't already been loaded first
    # search by url
    ebooks = models.Ebook.objects.filter(url=url)

    # TODO: put a migration to force Ebook.url to be unique

    if len(ebooks) > 1:
        raise Exception("There is more than one Ebook matching url {0}".format(url))
    elif len(ebooks) == 1:
        # already loaded: make sure the work carries the doab id, then refresh
        # cover and metadata
        ebook = ebooks[0]
        models.Identifier.get_or_add(type='doab', value=doab_id,
                                     work=ebook.edition.work)
        update_cover_doab(doab_id)
        attach_more_doab_metadata(ebook, **metadata)
        return ebook

    # remaining case --> need to create a new Ebook

    # make sure we have isbns to work with before creating ebook
    if not isbns:
        return None

    ebook = models.Ebook()
    ebook.format = format
    ebook.provider = provider
    ebook.url = url
    ebook.rights = rights

    # we still need to find the right Edition/Work to tie Ebook to...

    # look for the Edition with which to associate ebook.
    # loop through the isbns to see whether we get one that is not None
    for isbn in isbns:
        edition = bookloader.add_by_isbn(isbn)
        if edition is not None:
            break

    if edition is not None:
        # if this is a new edition, then add related editions asynchronously
        if getattr(edition, 'new', False):
            tasks.populate_edition.delay(edition.isbn_13)

        # QUESTION:  Is this good enough?
        # what's going to happen to edition.work if there's merging
        models.Identifier.get_or_add(type='doab', value=doab_id,
                                     work=edition.work)

    # we need to create Edition(s) de novo
    else:
        # if there is a Work with doab_id already, attach any new Edition(s)
        try:
            work = models.Identifier.objects.get(type='doab', value=doab_id).work
        except models.Identifier.DoesNotExist:
            # the new Work has to be saved before identifiers are attached to it
            work = models.Work(language=language, title=title)
            work.save()
            models.Identifier.get_or_add(type='doab', value=doab_id, work=work)

        # create Edition(s) for each of the isbn from the input info
        editions = []
        for isbn in isbns:
            edition = models.Edition(title=title, work=work)
            edition.save()

            models.Identifier.get_or_add(type='isbn', value=isbn, work=work)

            editions.append(edition)

        # if work has any ebooks already, attach the ebook to the corresponding edition
        # otherwise pick the first of the editions just created
        editions_with_ebooks = models.Edition.objects.filter(
            Q(work__id=work.id) & Q(ebooks__isnull=False)).distinct()
        if editions_with_ebooks:
            edition = editions_with_ebooks[0]
        else:
            edition = editions[0]

    # make the edition the selected_edition of the work
    work = edition.work
    work.selected_edition = edition
    work.save()

    # tie the edition to ebook
    ebook.edition = edition
    ebook.save()

    # update the cover id (could be done separately)
    update_cover_doab(doab_id)

    # attach more metadata
    attach_more_doab_metadata(ebook, **metadata)
    return ebook


def load_doab_records(fname, limit=None, async=True):
    
    from regluit.core import (doab, tasks)
    success_count = 0
    
    records = json.load(open(fname))

    for (i, book) in enumerate(islice(records,limit)):
        d = dict(book)
        if d['format'] == 'pdf':
            try:
                if async:
                    task_id = tasks.load_doab_edition.delay(**dict(book))
                    
                    ct = models.CeleryTask()
                    ct.task_id = task_id
                    ct.function_name = "load_doab_edition"
                    ct.user = None
                    ct.description = "Loading DOAB %s " % (dict(book)['doab_id'])
                    ct.save()
                    
                else:
                    edition = load_doab_edition(**dict(book))
                success_count += 1 
            except Exception, e:
                logger.warning(e)
            
    logger.info("Number of books successfully uploaded: " + str(success_count))
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`import logging`
			`import json`
			`from itertools import islice`

The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`import requests`

code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`from django.db.models import (Q, F)`

The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`from django.core.files.storage import default_storage`
			`from django.core.files.base import ContentFile`

some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`import regluit`
[#70942940] Making the DOAB record loading asynchronous (ie., use Celery) 2014-06-05 23:31:14 +00:00			`from regluit.core import models`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`from regluit.core import bookloader`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`from regluit.core.bookloader import add_by_isbn`

			`logger = logging.getLogger(__name__)`

The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`def store_doab_cover(doab_id, redo=False):`

			`"""`
			`returns tuple: 1) cover URL, 2) whether newly created (boolean)`
			`"""`

			`cover_file_name= '/doab/%s/cover' % (doab_id)`

			`# if we don't want to redo and the cover exists, return the URL of the cover`

			`if not redo and default_storage.exists(cover_file_name):`
			`return (default_storage.url(cover_file_name), False)`

			`# download cover image to cover_file`
			`url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)`
			`try:`
			`r = requests.get(url)`
			`cover_file = ContentFile(r.content)`
			`cover_file.content_type = r.headers.get('content-type', '')`

			`path = default_storage.save(cover_file_name, cover_file)`
			`return (default_storage.url(cover_file_name), True)`
			`except Exception, e:`
			`# if there is a problem, return None for cover URL`
			`return (None, False)`

			`def update_cover_doab(doab_id, store_cover=True):`
			`"""`
			`update the cover url for work with doab_id`
			`if store_cover is True, use the cover from our own storage`
			`"""`
			`work = models.Identifier.objects.get(type='doab', value=doab_id).work`
			`edition = work.preferred_edition`

			`if store_cover:`
			`(cover_url, new_cover) = store_doab_cover(doab_id)`
			`else:`
			`cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)`

			`if cover_url is not None:`
			`edition.cover_image = cover_url`
			`edition.save()`
			`return cover_url`
			`else:`
			`return None`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
			`def attach_more_doab_metadata(ebook, description, subjects,`
			`publication_date, publisher_name=None):`

			`"""`
			`for given ebook, attach description, subjects, publication date to`
			`corresponding Edition and Work`
			`"""`
			`# if edition doesn't have a publication date, update it`
			`edition = ebook.edition`
			`edition_to_save = False`

			`if not edition.publication_date:`
			`edition.publication_date = publication_date`
			`edition_to_save = True`

			`# if edition.publisher_name is empty, set it`
Set publisher name for edition 2014-07-25 22:16:06 +00:00			`if not edition.publisher_name:`
			`edition.set_publisher(publisher_name)`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
			`if edition_to_save:`
			`edition.save()`

			`# attach description to work if it's not empty`
			`work = edition.work`
			`if not work.description:`
			`work.description = description`
			`work.save()`

			`# update subjects`
also for doab records 2015-01-30 16:39:48 +00:00			`for s in subjects:`
			`if bookloader.valid_subject(s):`
			`work.subjects.add(models.Subject.objects.get_or_create(name=s)[0])`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
			`return ebook`
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`def load_doab_edition(title, doab_id, seed_isbn, url, format, rights,`
			`language, isbns,`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`provider, **kwargs):`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`"""`
			`load a record from doabooks.org represented by input parameters and return an ebook`
			`"""`
[#70942940] Making the DOAB record loading asynchronous (ie., use Celery) 2014-06-05 23:31:14 +00:00			`from regluit.core import tasks`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`# check to see whether the Edition hasn't already been loaded first`
			`# search by url`
			`ebooks = models.Ebook.objects.filter(url=url)`
[#70942940] Making load_doab_edition more rigorous: * assuming that 1 DOAB ID associated at most with 1 Work or 1 Edition * explicitly throw exception if Google Books doesn't recognize the ISBN in question 2014-06-06 23:40:13 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`# 1 match`
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`# > 1 matches`
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`# 0 match`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`# simplest case -- if match (1 or more), we could check whether any`
			`# ebook.edition.work has a doab id matching given doab_id`

			`# put a migration to force Ebook.url to be unique id`

			`# if yes, then return one of the Edition(s) whose work is doab_id`
			`# if no, then`

some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`if len(ebooks) > 1:`
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`raise Exception("There is more than one Ebook matching url {0}".format(url))`
			`elif len(ebooks) == 1:`
			`ebook = ebooks[0]`
			`doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,`
			`work=ebook.edition.work)`
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`# update the cover id`
			`cover_url = update_cover_doab(doab_id)`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
			`# attach more metadata`
			`attach_more_doab_metadata(ebook,`
			`description=kwargs.get('description'),`
			`subjects=kwargs.get('subject'),`
			`publication_date=kwargs.get('date'),`
			`publisher_name=kwargs.get('publisher'))`
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`return ebook`

			`# remaining case --> need to create a new Ebook`
			`assert len(ebooks) == 0`

			`# make sure we have isbns to work with before creating ebook`
			`if len(isbns) == 0:`
			`return None`

			`ebook = models.Ebook()`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`ebook.format = format`
			`ebook.provider = provider`
			`ebook.url = url`
			`ebook.rights = rights`
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00
			`# we still need to find the right Edition/Work to tie Ebook to...`

			`# look for the Edition with which to associate ebook.`
			`# loop through the isbns to see whether we get one that is not None`

			`for isbn in isbns:`
			`edition = bookloader.add_by_isbn(isbn)`
			`if edition is not None: break`

			`if edition is not None:`
			`# if this is a new edition, then add related editions asynchronously`
			`if getattr(edition,'new', False):`
			`tasks.populate_edition.delay(edition.isbn_13)`

			`# QUESTION: Is this good enough?`
			`# what's going to happen to edition.work if there's merging`
			`doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,`
			`work=edition.work)`

			`# we need to create Edition(s) de novo`
			`else:`
			`# if there is a Work with doab_id already, attach any new Edition(s)`
			`try:`
			`work = models.Identifier.objects.get(type='doab',value=doab_id).work`
			`except models.Identifier.DoesNotExist:`
			`work = models.Work(language=language,title=title)`
Fixed bug in load_doab_edition: new Work needs to be saved before attaching IDs to it. code in doab_load.ipynb to load books and test the integrity of the load. Big surprise (to me): to find invalid ISBNs in the DOAB data 2014-07-08 16:29:31 +00:00			`work.save()`
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,`
			`work=work)`
Fixed bug in load_doab_edition: new Work needs to be saved before attaching IDs to it. code in doab_load.ipynb to load books and test the integrity of the load. Big surprise (to me): to find invalid ISBNs in the DOAB data 2014-07-08 16:29:31 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00
			`# create Edition(s) for each of the isbn from the input info`
			`editions = []`
			`for isbn in isbns:`
			`edition = models.Edition(title=title, work=work)`
			`edition.save()`

			`isbn_id = models.Identifier.get_or_add(type='isbn',value=isbn,work=work)`

			`editions.append(edition)`

			`# if work has any ebooks already, attach the ebook to the corresponding edition`
			`# otherwise pick the first one`
			`# pick the first edition as the one to tie ebook to`
			`editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \`
			`Q(ebooks__isnull=False)).distinct()`
			`if editions_with_ebooks:`
			`edition = editions_with_ebooks[0]`
			`else:`
			`edition = editions[0]`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`# make the edition the selected_edition of the work`
code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`work = edition.work`
			`work.selected_edition = edition`
			`work.save()`
The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00
I've moved the new version of load_doab_edition into core/doab.py from notebook. Code in doab_loading.ipynb for testing the loading 2014-07-07 18:00:52 +00:00			`# tie the edition to ebook`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`ebook.edition = edition`
			`ebook.save()`

The doab_load_books django command is working again -- I had to fix the signatures of some methods 2014-07-16 22:26:19 +00:00			`# update the cover id (could be done separately)`
			`cover_url = update_cover_doab(doab_id)`

code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00			`# attach more metadata`
			`attach_more_doab_metadata(ebook,`
			`description=kwargs.get('description'),`
			`subjects=kwargs.get('subject'),`
			`publication_date=kwargs.get('date'),`
			`publisher_name=kwargs.get('publisher'))`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`return ebook`

code can now load description, subjects and covers for the pdf files 2014-07-24 23:29:28 +00:00
[#70942940] Making the DOAB record loading asynchronous (ie., use Celery) 2014-06-05 23:31:14 +00:00			`def load_doab_records(fname, limit=None, async=True):`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00
[#70942940] Making the DOAB record loading asynchronous (ie., use Celery) 2014-06-05 23:31:14 +00:00			`from regluit.core import (doab, tasks)`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`success_count = 0`

			`records = json.load(open(fname))`

			`for (i, book) in enumerate(islice(records,limit)):`
			`d = dict(book)`
			`if d['format'] == 'pdf':`
			`try:`
[#70942940] Making the DOAB record loading asynchronous (ie., use Celery) 2014-06-05 23:31:14 +00:00			`if async:`
			`task_id = tasks.load_doab_edition.delay(**dict(book))`

			`ct = models.CeleryTask()`
			`ct.task_id = task_id`
			`ct.function_name = "load_doab_edition"`
			`ct.user = None`
			`ct.description = "Loading DOAB %s " % (dict(book)['doab_id'])`
			`ct.save()`

			`else:`
			`edition = load_doab_edition(**dict(book))`
some code to load DOAB records...no code here yet for how I processed the DOAB records into json format yet. 2014-06-04 22:23:47 +00:00			`success_count += 1`
			`except Exception, e:`
			`logger.warning(e)`

			`logger.info("Number of books successfully uploaded: " + str(success_count))`