# regluit/core/loaders/utils.py

import csv
import re
import requests
import logging
import sys

from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading

logger = logging.getLogger(__name__)

def UnicodeDictReader(utf8_data, **kwargs):
    ''' Iterate over the rows of a CSV source as dicts with decoded text values.

        utf8_data is an iterable of CSV lines, handed to csv.DictReader;
        kwargs are forwarded to csv.DictReader unchanged.

        Yields one dict per data row. Byte-string values are decoded as UTF-8
        (UnicodeDecodeError is raised for invalid data). Any other value is
        passed through untouched: None for columns missing from a short row,
        the list stored under restkey for extra columns, or text that is
        already decoded. Keys are left exactly as csv.DictReader produced them.
        '''
    def decode(value):
        # Only byte strings need decoding; calling unicode(None, 'utf-8') on
        # the filler value of a short row used to raise TypeError.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    for row in csv.DictReader(utf8_data, **kwargs):
        yield {key: decode(value) for key, value in row.items()}

def get_authors(book):
    ''' Return the contributors of book as a list of (name, role) tuples.

        The first three contributors are read from the Author{N}First,
        Author{N}Last and Author{N}Role columns (N = 1..3); reading stops at
        the first slot whose first and last names are both empty. A blank
        role defaults to 'A01'.

        Contributors beyond the third only appear in AuthorsList, a string of
        names separated by ', ' or ' and '; every name after the first three
        entries of that list is added with role 'A01'.
        '''
    authors = []
    # range(1, 4): the spreadsheet has three numbered author slots, and the
    # AuthorsList handling below skips exactly three names.
    for i in range(1, 4):
        first = book.get(u'Author{}First'.format(i)) or u''
        last = book.get(u'Author{}Last'.format(i)) or u''
        # Format the text values directly: encoding one side to bytes made
        # the unicode format fail on non-ASCII names.
        authname = u'{} {}'.format(first, last)
        if authname == u' ':
            break
        role = (book.get(u'Author{}Role'.format(i)) or u'').strip() or 'A01'
        authors.append((authname, role))

    authlist = (book.get("AuthorsList") or u'').replace(' and ', ', ').split(', ')
    authors.extend((authname, 'A01') for authname in authlist[3:])

    return authors

def get_subjects(book):
    ''' Return the BisacHeading objects named by the BISAC columns of book.

        Reads BISACCode1, BISACCode2 and BISACCode3. Blank (empty,
        whitespace-only or missing) codes are skipped. A code with no
        matching BisacHeading.notation is logged as a warning and skipped,
        so the result may be shorter than the number of codes supplied.
        '''
    subjects = []
    # range(1, 4): three BISAC columns are supplied per book.
    for i in range(1, 4):
        code = (book.get(u'BISACCode{}'.format(i)) or u'').strip()
        if not code:
            continue
        try:
            subjects.append(BisacHeading.objects.get(notation=code))
        except BisacHeading.DoesNotExist:
            logger.warning("Please add BISAC %s", code)
    return subjects

def add_subject(subject_name, work, authority=''):
    ''' Attach the Subject called subject_name to work.

        The Subject is looked up by name; when none exists it is created
        with the given authority. The authority argument is not applied to
        a Subject that already exists.
        '''
    manager = Subject.objects
    try:
        found = manager.get(name=subject_name)
    except Subject.DoesNotExist:
        found = manager.create(name=subject_name, authority=authority)
    found.works.add(work)

def get_cover(book):
    ''' Return the URL of a cover image for book, or None.

        Only URLs containing "10.3998" are handled (code for umich books;
        can generalize, of course!). The part of the URL matching
        <collection>.<n>.<n>.<n> selects the image location on
        quod.lib.umich.edu: 'ohp' and 'ump' have their own directories, any
        other collection falls back to 'dculture'.

        The candidate image is checked with an HTTP HEAD request. None is
        returned, and a warning logged, when the request fails or answers
        with a status of 400 or above. None is also returned when book has
        no URL or the URL is not recognised.
        '''
    # URL is optional for callers, so a missing or empty value means no cover.
    url = book.get('URL') or ''
    if "10.3998" not in url:
        return None

    idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
    if not idmatch:
        return None

    collection_paths = {'ohp': 'o/ohp', 'ump': 'u/ump'}
    path = collection_paths.get(idmatch.group(1), 'd/dculture')
    cover_url = "http://quod.lib.umich.edu/{}/images/{}.jpg".format(path, idmatch.group(2))

    try:
        # Without a timeout a stalled server would hang the whole load.
        cover = requests.head(cover_url, timeout=10)
    except requests.RequestException as e:
        # A cover is optional; a network failure must not abort the load.
        logger.warning("bad cover: %s for: %s (%s)", cover_url, url, e)
        return None

    if cover.status_code < 400:
        return cover_url
    logger.warning("bad cover: %s for: %s", cover_url, url)
    return None
def get_isbns(book):
    ''' Collect the ISBNs of book and look for an Edition already using one.

        The eISBN, PaperISBN and ClothISBN columns are read in that order;
        each may hold several comma-separated values, and '' or 'N/A' means
        no value. Every value is passed through ISBN(...).to_string() and
        kept only when the result is truthy.

        Returns (isbns, edition): the list of ISBN strings, and the result
        of Edition.get_by_isbn for the first ISBN that yields a truthy
        value (None when the list is empty).
        '''
    isbns = []
    for column in ('eISBN', 'PaperISBN', 'ClothISBN'):
        raw = book[column]
        if raw in ('', 'N/A'):
            continue
        for candidate in raw.split(','):
            normalized = ISBN(candidate).to_string()
            if normalized:
                isbns.append(normalized)

    edition = None
    for isbn in isbns:
        edition = Edition.get_by_isbn(isbn)
        if edition:
            break
    return (isbns, edition)


def _out(*args, **kwargs):
    ''' Write to standard output and flush straight away.

        All arguments are forwarded to sys.stdout.write.
        '''
    stream = sys.stdout
    stream.write(*args, **kwargs)
    stream.flush()

def load_from_books(books):
    ''' Get or create a Work and Editions for each book in books.

        books is an iterator of book dicts.
        each book must have attributes
        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
        SubjectListMARC, Book-level DOI, URL, License

        A book with no usable ISBN can neither be matched nor created; it is
        skipped with a warning and does not appear in the result.

        One progress line per loaded book is written to standard output.

        Returns a list of (book, work, edition) tuples, one per loaded book,
        where edition is the one handled for the last ISBN of that book.
        '''

    # Goal: get or create an Edition and Work for each given book

    results = []

    for (i, book) in enumerate(books):

        # try first to get an Edition already in DB by one of the ISBNs in book
        (isbns, edition) = get_isbns(book)
        title = book['FullTitle']
        authors = get_authors(book)

        # if matching by ISBN doesn't work, then create a Work and Edition
        # with a title and the first ISBN
        if not edition:
            if not isbns:
                # nothing to match on and nothing to register: without this
                # guard the code below would dereference a None edition
                logger.warning(u'skipping book %s (%r): no usable ISBN', i, title)
                continue
            work = Work(title=title)
            work.save()
            edition = Edition(title=title, work=work)
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)

        work = edition.work

        # at this point, work and edition exist

        if book.get('URL'):
            Identifier.set(type='http', value=book['URL'], edition=edition, work=work)

        # the cover depends only on the book, so look it up once rather than
        # once per ISBN (each lookup may be an HTTP request)
        cover_image = get_cover(book)

        # make sure each isbn is represented by an Edition
        # also associate authors, publication date, cover, publisher
        # NOTE(review): an edition returned by add_by_isbn_from_google may
        # belong to a different work than `work` -- confirm whether the two
        # works should be merged here.
        for isbn in isbns:
            edition = add_by_isbn_from_google(isbn)
            if not edition:
                edition = Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)

            edition.authors.clear()
            for (author, role) in authors:
                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
            edition.publication_date = book['CopyrightYear']
            edition.cover_image = cover_image
            edition.set_publisher(book['Publisher'])
            edition.save()

        # possibly replace work.description (only by a longer one)
        description = book['DescriptionBrief']
        if len(description) > len(work.description or u''):
            work.description = description
            # persist the change; assigning the attribute alone is not enough
            work.save()

        # add a bisac subject (and ancestors) to work
        for bisacsh in get_subjects(book):
            while bisacsh:
                add_subject(bisacsh.full_label, work, authority="bisacsh")
                bisacsh = bisacsh.parent

        logger.info(u'loaded work %s', work.title)
        loading_ok = loaded_book_ok(book, work, edition)

        results.append((book, work, edition))

        try:
            _out(u"{} {} {}\n".format(i, title, loading_ok))
        except Exception as e:
            # writing the title itself failed (typically an encoding error),
            # so fall back to its ASCII-safe repr and include the error
            _out("{} {!r} {}\n".format(i, title, e))

    return results

    
def loaded_book_ok(book, work, edition):
    ''' Report whether book appears to have been loaded correctly.

        Returns False as soon as one of these checks fails, True otherwise:
        - work and edition are both given;
        - exactly one 'http' Identifier exists for book['URL'];
        - each ISBN of the book has exactly one 'isbn' Identifier, and the
          edition behind it has exactly the author names of the book;
        - work.description equals the book's DescriptionBrief or is longer;
        - every BISAC heading of the book, and each of its ancestors, has its
          full_label among the names of the work's subjects.
        '''
    if (work is None) or (edition is None):
        return False

    isbns = get_isbns(book)[0]
    author_names = set(author[0] for author in get_authors(book))
    subjects = get_subjects(book)

    # get() raises when there is no match or more than one, so reaching the
    # next statement means the URL identifier exists
    try:
        Identifier.objects.get(type='http', value=book['URL'])
    except Exception as e:
        _out(str(e))
        return False

    # isbns
    # (looking only at the identifiers of `work` would be too narrow)
    for isbn in isbns:
        if Identifier.objects.filter(type='isbn', value=isbn).count() != 1:
            return False

        try:
            edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
        except Exception as e:
            print (e)
            return False

        # authors
        if author_names != set(a.name for a in edition_for_isbn.authors.all()):
            return False

        # NOTE(review): these statements assign the expected values instead of
        # comparing them, so they only verify that assigning does not fail --
        # confirm whether a comparison was intended.
        try:
            edition_for_isbn.publication_date = book['CopyrightYear']
            edition_for_isbn.cover_image = get_cover(book)
            edition_for_isbn.set_publisher(book['Publisher'])
        except Exception:
            return False

    # work description
    description = book['DescriptionBrief']
    current = work.description or u''
    if not ((current == description) or (len(description) < len(current))):
        return False

    # bisac
    # Subjects are attached to a work by name (the heading's full_label), so
    # compare labels with names; a BisacHeading object itself is never among
    # the work's subjects.
    if subjects:
        subject_names = set(subject.name for subject in work.subjects.all())
        for bisacsh in subjects:
            while bisacsh:
                if bisacsh.full_label not in subject_names:
                    return False
                bisacsh = bisacsh.parent

    return True
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`import csv`
			`import re`
			`import requests`
			`import logging`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`import sys`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00
			`from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject`
			`from regluit.core.isbn import ISBN`
			`from regluit.core.bookloader import add_by_isbn_from_google`
			`from regluit.api.crosswalks import inv_relator_contrib`
			`from regluit.bisac.models import BisacHeading`

			`logger = logging.getLogger(__name__)`

			`def UnicodeDictReader(utf8_data, **kwargs):`
			`csv_reader = csv.DictReader(utf8_data, **kwargs)`
			`for row in csv_reader:`
			`yield {key: unicode(value, 'utf-8') for key, value in row.iteritems()}`

			`def get_authors(book):`
			`authors=[]`
			`for i in range(1,3):`
			`fname=u'Author{}First'.format(i)`
			`lname=u'Author{}Last'.format(i)`
			`role=u'Author{}Role'.format(i)`
			`authname = u'{} {}'.format(book[fname].encode('utf-8'),book[lname])`
			`if authname != u' ':`
			`role = book[role] if book[role]!= u' ' else 'A01'`
			`authors.append((authname,role))`
			`else:`
			`break`
			`authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')`
			`if len(authlist)>3:`
			`for authname in authlist[3:]:`
			`authors.append((authname, 'A01'))`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`return authors`

			`def get_subjects(book):`
			`subjects=[]`
			`for i in range(1,3):`
			`key=u'BISACCode{}'.format(i)`
			`if book[key] != '':`
			`try:`
			`bisac=BisacHeading.objects.get(notation=book[key])`
			`subjects.append(bisac)`
			`except BisacHeading.DoesNotExist:`
			`logger.warning( "Please add BISAC {}".format(book[key]))`
			`return subjects`

code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`def add_subject(subject_name, work, authority=''):`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`try:`
			`subject= Subject.objects.get(name=subject_name)`
			`except Subject.DoesNotExist:`
			`subject=Subject.objects.create(name=subject_name, authority=authority)`
			`subject.works.add(work)`

			`def get_cover(book):`
			`url = book['URL']`
			`if "10.3998" in url:`
			`# code for umich books; can generalize, of course!`
			`idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)`
			`if idmatch:`
			`book_id = idmatch.group(2)`
			`if idmatch.group(1) == 'ohp':`
			`cover_url = "http://quod.lib.umich.edu/o/ohp/images/{}.jpg".format(book_id)`
			`elif idmatch.group(1) == 'ump':`
			`cover_url = "http://quod.lib.umich.edu/u/ump/images/{}.jpg".format(book_id)`
			`else:`
			`cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)`
			`cover = requests.head(cover_url)`
			`if cover.status_code<400:`
			`return cover_url`
			`else:`
			`logger.warning( "bad cover: {} for: {}".format(cover_url, url))`

			`def get_isbns(book):`
			`isbns = []`
			`edition = None`
			`for code in ['eISBN','PaperISBN','ClothISBN']:`
			`if book[code] not in ('','N/A'):`
			`values = book[code].split(',')`
			`for value in values:`
			`isbn = ISBN(value).to_string()`
			`if isbn:`
			`isbns.append(isbn)`
			`for isbn in isbns :`
			`if not edition:`
			`edition = Edition.get_by_isbn(isbn)`
			`return (isbns, edition )`


code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`def _out(args, *kwargs):`

			`sys.stdout.write(args, *kwargs)`
			`sys.stdout.flush()`

code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`def load_from_books(books):`
			`''' books is an iterator of book dicts.`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`each book must have attributes`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,`
			`Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,`
			`Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,`
			`DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,`
			`eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,`
			`SubjectListMARC, , Book-level DOI, URL, License`
			`'''`

code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`# Goal: get or create an Edition and Work for each given book`

first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`results = []`

code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`for (i, book) in enumerate(books):`

			`# try first to get an Edition already in DB with by one of the ISBNs in book`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`(isbns, edition) = get_isbns(book)`
			`title=book['FullTitle']`
			`authors = get_authors(book)`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`# if matching by ISBN doesn't work, then create a Work and Edition`
			`# with a title and the first ISBN`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`if not edition and len(isbns):`
			`work = Work(title=title)`
			`work.save()`
			`edition= Edition(title=title, work=work)`
			`edition.save()`
			`Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`work=edition.work`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`# at this point, work and edition exist`

			`if book.get('URL'):`
			`Identifier.set(type='http', value=book['URL'], edition=edition, work=work)`

			`# make sure each isbn is represented by an Edition`
			`# also associate authors, publication date, cover, publisher`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`for isbn in isbns:`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`edition = add_by_isbn_from_google(isbn)`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`if not edition:`
			`edition= Edition(title=title, work=work)`
			`edition.save()`
			`Identifier.set(type='isbn', value=isbn, edition=edition, work=work)`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`# if isbn == '9780472116713':`
			`# print ('for 9780472116713, edition_id is {}'.format(edition.id))`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`edition.authors.clear()`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`for (author, role) in authors:`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`edition.add_author(author, inv_relator_contrib.get(role, 'aut'))`
			`edition.publication_date = book['CopyrightYear']`
			`edition.cover_image = get_cover(book)`
			`edition.set_publisher(book['Publisher'])`
			`edition.save()`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`# possibly replace work.description`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`description = book['DescriptionBrief']`
			`if len(description)>len (work.description):`
			`work.description = description`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`# add a bisac subject (and ancestors) to work`
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`for bisacsh in get_subjects(book):`
			`while bisacsh:`
			`add_subject(bisacsh.full_label, work, authority="bisacsh")`
			`bisacsh = bisacsh.parent`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`logging.info(u'loaded work {}'.format(work.title))`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`loading_ok = loaded_book_ok(book, work, edition)`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00
			`results.append((book, work, edition))`

code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`try:`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`_out ("{} {} {}\n".format(i, title, loading_ok))`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`except Exception as e:`
			`_out("{} {}\n".format(i, title, str(e) ))`

first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`return results`


code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`def loaded_book_ok(book, work, edition):`

			`isbns = get_isbns(book)[0]`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`authors = get_authors(book)`
			`subjects = get_subjects(book)`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`if (work is None) or (edition is None):`
			`return False`

			`try:`
			`url_id = Identifier.objects.get(type='http', value=book['URL'])`
			`if url_id is None:`
			`print ("url_id problem: work.id {}, url: {}".format(work.id, book['URL']))`
			`return False`
			`except Exception as e:`
			`_out(str(e))`
			`return False`

			`# isbns`
			`# looking at work_isbns too narrow`
			`# work_isbns = set([isbn.value for isbn in Identifier.objects.filter(type='isbn', work=work)])`
			`# if not (set(isbns) <= work_isbns):`
			`# print ("isbn problem: work.id {}, work_isbns: {} isbns: {}".format(work.id, work_isbns, isbns))`
			`# return False`

			`# isbns`
			`for isbn in isbns:`
			`if Identifier.objects.filter(type='isbn', value=isbn).count() <> 1:`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`# print ("isbn problem: work.id {}, isbn: {}".format(work.id, isbn))`
code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00			`return False`
first pass at tests -- some cleanup needed 2016-05-24 00:03:55 +00:00			`else:`
			`try:`
			`edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition`
			`except Exception as e:`
			`print (e)`
			`return False`

			`# authors`
			`# print set([ed.name for ed in edition_for_isbn.authors.all()])`

			`if set([author[0] for author in authors]) != set([ed.name for ed in edition_for_isbn.authors.all()]):`
			`return False`

			`try:`
			`edition_for_isbn.publication_date = book['CopyrightYear']`
			`edition_for_isbn.cover_image = get_cover(book)`
			`edition_for_isbn.set_publisher(book['Publisher'])`
			`except:`
			`return False`

			`# work description`
			`description = book['DescriptionBrief']`
			`if not ((work.description == description) or (len(description) <len (work.description))):`
			`return False`

			`# bisac`

			`for bisacsh in subjects:`
			`while bisacsh:`
			`if bisach not in work.subjects.all():`
			`return False`
			`bisacsh = bisacsh.parent`

code in progress to test https://github.com/Gluejar/regluit/pull/584 2016-05-21 21:51:52 +00:00
			`return True`