regluit/core/loaders/utils.py

import csv
import re
import requests
import logging

from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading

logger = logging.getLogger(__name__)

def UnicodeDictReader(utf8_data, **kwargs):
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        yield {key: unicode(value, 'utf-8') for key, value in row.iteritems()}

def get_authors(book):
    authors=[]
    for i in range(1,3):
        fname=u'Author{}First'.format(i)
        lname=u'Author{}Last'.format(i)
        role=u'Author{}Role'.format(i)
        authname = u'{} {}'.format(book[fname].encode('utf-8'),book[lname])
        if authname != u' ':
            role = book[role] if book[role]!= u' ' else 'A01'
            authors.append((authname,role))
        else:
            break
    authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
    if len(authlist)>3:
        for authname in authlist[3:]:
            authors.append((authname, 'A01'))
    return authors

def get_subjects(book):
    subjects=[]
    for i in range(1,3):
        key=u'BISACCode{}'.format(i)
        if book[key] != '':
            try:
                bisac=BisacHeading.objects.get(notation=book[key])
                subjects.append(bisac)
            except BisacHeading.DoesNotExist:
                logger.warning( "Please add BISAC {}".format(book[key]))
    return subjects

def add_subject(subject_name,work, authority=''):
    try:
        subject= Subject.objects.get(name=subject_name)
    except Subject.DoesNotExist:
        subject=Subject.objects.create(name=subject_name, authority=authority)
    subject.works.add(work)

def get_cover(book):
    url = book['URL']
    if "10.3998" in url:
        # code for umich books; can generalize, of course!
        idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
        if idmatch:
            book_id = idmatch.group(2)
            if idmatch.group(1) == 'ohp':
                cover_url = "http://quod.lib.umich.edu/o/ohp/images/{}.jpg".format(book_id)
            elif idmatch.group(1) == 'ump':
                cover_url = "http://quod.lib.umich.edu/u/ump/images/{}.jpg".format(book_id)
            else:
                cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
            cover = requests.head(cover_url)
            if cover.status_code<400:
                return cover_url
            else:
                logger.warning( "bad cover: {} for: {}".format(cover_url, url))
            
def get_isbns(book):
    isbns = []
    edition = None
    for code in ['eISBN','PaperISBN','ClothISBN']:
        if book[code] not in ('','N/A'):
            values = book[code].split(',')
            for value in values:
                isbn = ISBN(value).to_string()
                if isbn:
                    isbns.append(isbn)
    for isbn in isbns :
        if not edition:
            edition = Edition.get_by_isbn(isbn)
    return (isbns, edition )


def load_from_books(books):
    ''' books is an iterator of book dicts.
        each book mus have attributes
        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList, 
        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last, 
        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, 
        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate, 
        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, 
        SubjectListMARC, , Book-level DOI, URL,	License
        '''

    for book in books:
        (isbns, edition) = get_isbns(book)
        title=book['FullTitle']
        authors = get_authors(book)
        if not edition and len(isbns):
            work = Work(title=title)
            work.save()
            edition= Edition(title=title, work=work) 
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
        work=edition.work
        Identifier.set(type='http', value=book['URL'], edition=edition, work=work)
        for isbn in isbns:
            edition= add_by_isbn_from_google(isbn)
            if not edition:
                edition= Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
            edition.authors.clear()
            for (author,role) in authors:
                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
            edition.publication_date = book['CopyrightYear']
            edition.cover_image = get_cover(book)
            edition.set_publisher(book['Publisher'])
            edition.save()
        description = book['DescriptionBrief']
        if len(description)>len (work.description):
            work.description = description
        for bisacsh in get_subjects(book):
            while bisacsh:
                add_subject(bisacsh.full_label, work, authority="bisacsh")
                bisacsh = bisacsh.parent
        logging.info(u'loaded work {}'.format(work.title))
code for loading umich spreadsheet 2016-05-19 13:17:23 +00:00			`import csv`
			`import re`
			`import requests`
			`import logging`

			`from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject`
			`from regluit.core.isbn import ISBN`
			`from regluit.core.bookloader import add_by_isbn_from_google`
			`from regluit.api.crosswalks import inv_relator_contrib`
			`from regluit.bisac.models import BisacHeading`

			`logger = logging.getLogger(__name__)`

			`def UnicodeDictReader(utf8_data, **kwargs):`
			`csv_reader = csv.DictReader(utf8_data, **kwargs)`
			`for row in csv_reader:`
			`yield {key: unicode(value, 'utf-8') for key, value in row.iteritems()}`

			`def get_authors(book):`
			`authors=[]`
			`for i in range(1,3):`
			`fname=u'Author{}First'.format(i)`
			`lname=u'Author{}Last'.format(i)`
			`role=u'Author{}Role'.format(i)`
			`authname = u'{} {}'.format(book[fname].encode('utf-8'),book[lname])`
			`if authname != u' ':`
			`role = book[role] if book[role]!= u' ' else 'A01'`
			`authors.append((authname,role))`
			`else:`
			`break`
			`authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')`
			`if len(authlist)>3:`
			`for authname in authlist[3:]:`
			`authors.append((authname, 'A01'))`
			`return authors`

			`def get_subjects(book):`
			`subjects=[]`
			`for i in range(1,3):`
			`key=u'BISACCode{}'.format(i)`
			`if book[key] != '':`
			`try:`
			`bisac=BisacHeading.objects.get(notation=book[key])`
			`subjects.append(bisac)`
			`except BisacHeading.DoesNotExist:`
			`logger.warning( "Please add BISAC {}".format(book[key]))`
			`return subjects`

			`def add_subject(subject_name,work, authority=''):`
			`try:`
			`subject= Subject.objects.get(name=subject_name)`
			`except Subject.DoesNotExist:`
			`subject=Subject.objects.create(name=subject_name, authority=authority)`
			`subject.works.add(work)`

			`def get_cover(book):`
			`url = book['URL']`
			`if "10.3998" in url:`
			`# code for umich books; can generalize, of course!`
			`idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)`
			`if idmatch:`
			`book_id = idmatch.group(2)`
			`if idmatch.group(1) == 'ohp':`
			`cover_url = "http://quod.lib.umich.edu/o/ohp/images/{}.jpg".format(book_id)`
			`elif idmatch.group(1) == 'ump':`
			`cover_url = "http://quod.lib.umich.edu/u/ump/images/{}.jpg".format(book_id)`
			`else:`
			`cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)`
			`cover = requests.head(cover_url)`
			`if cover.status_code<400:`
			`return cover_url`
			`else:`
			`logger.warning( "bad cover: {} for: {}".format(cover_url, url))`

			`def get_isbns(book):`
			`isbns = []`
			`edition = None`
			`for code in ['eISBN','PaperISBN','ClothISBN']:`
			`if book[code] not in ('','N/A'):`
			`values = book[code].split(',')`
			`for value in values:`
			`isbn = ISBN(value).to_string()`
			`if isbn:`
			`isbns.append(isbn)`
			`for isbn in isbns :`
			`if not edition:`
			`edition = Edition.get_by_isbn(isbn)`
			`return (isbns, edition )`


			`def load_from_books(books):`
			`''' books is an iterator of book dicts.`
			`each book mus have attributes`
			`eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,`
			`Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,`
			`Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,`
			`DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,`
			`eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,`
			`SubjectListMARC, , Book-level DOI, URL, License`
			`'''`

			`for book in books:`
			`(isbns, edition) = get_isbns(book)`
			`title=book['FullTitle']`
			`authors = get_authors(book)`
			`if not edition and len(isbns):`
			`work = Work(title=title)`
			`work.save()`
			`edition= Edition(title=title, work=work)`
			`edition.save()`
			`Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)`
			`work=edition.work`
			`Identifier.set(type='http', value=book['URL'], edition=edition, work=work)`
			`for isbn in isbns:`
			`edition= add_by_isbn_from_google(isbn)`
			`if not edition:`
			`edition= Edition(title=title, work=work)`
			`edition.save()`
			`Identifier.set(type='isbn', value=isbn, edition=edition, work=work)`
			`edition.authors.clear()`
			`for (author,role) in authors:`
			`edition.add_author(author, inv_relator_contrib.get(role, 'aut'))`
			`edition.publication_date = book['CopyrightYear']`
			`edition.cover_image = get_cover(book)`
			`edition.set_publisher(book['Publisher'])`
			`edition.save()`
			`description = book['DescriptionBrief']`
			`if len(description)>len (work.description):`
			`work.description = description`
			`for bisacsh in get_subjects(book):`
			`while bisacsh:`
			`add_subject(bisacsh.full_label, work, authority="bisacsh")`
			`bisacsh = bisacsh.parent`
			`logging.info(u'loaded work {}'.format(work.title))`