code for loading umich spreadsheet

parent 605d971078
commit cb3581e932

@@ -0,0 +1,134 @@
import csv
import re
import requests
import logging

from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading

logger = logging.getLogger(__name__)

def UnicodeDictReader(utf8_data, **kwargs):
    # wrap csv.DictReader so every value comes back as unicode
    # (the Python 2 csv module yields utf-8 encoded bytestrings)
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        yield {key: unicode(value, 'utf-8') for key, value in row.iteritems()}

def get_authors(book):
    # collect up to three (name, role) pairs from the Author{1,2,3}First/Last/Role columns,
    # then pick up any additional names from the AuthorsList column
    authors = []
    for i in range(1, 4):
        fname = u'Author{}First'.format(i)
        lname = u'Author{}Last'.format(i)
        role = u'Author{}Role'.format(i)
        authname = u'{} {}'.format(book[fname], book[lname])
        if authname != u' ':
            role = book[role] if book[role] != u' ' else 'A01'
            authors.append((authname, role))
        else:
            break
    authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
    if len(authlist) > 3:
        for authname in authlist[3:]:
            authors.append((authname, 'A01'))
    return authors

def get_subjects(book):
    # look up the BISACCode{1,2,3} columns against the BisacHeading table
    subjects = []
    for i in range(1, 4):
        key = u'BISACCode{}'.format(i)
        if book[key] != '':
            try:
                bisac = BisacHeading.objects.get(notation=book[key])
                subjects.append(bisac)
            except BisacHeading.DoesNotExist:
                logger.warning("Please add BISAC {}".format(book[key]))
    return subjects

def add_subject(subject_name, work, authority=''):
    # reuse an existing Subject with this name if there is one, otherwise create it
    try:
        subject = Subject.objects.get(name=subject_name)
    except Subject.DoesNotExist:
        subject = Subject.objects.create(name=subject_name, authority=authority)
    subject.works.add(work)

def get_cover(book):
    # derive a cover image URL from the book's quod.lib.umich.edu URL,
    # and only return it if a HEAD request says the image exists
    url = book['URL']
    if "10.3998" in url:
        # code for umich books; can generalize, of course!
        idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
        if idmatch:
            book_id = idmatch.group(2)
            if idmatch.group(1) == 'ohp':
                cover_url = "http://quod.lib.umich.edu/o/ohp/images/{}.jpg".format(book_id)
            elif idmatch.group(1) == 'ump':
                cover_url = "http://quod.lib.umich.edu/u/ump/images/{}.jpg".format(book_id)
            else:
                cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
            cover = requests.head(cover_url)
            if cover.status_code < 400:
                return cover_url
            else:
                logger.warning("bad cover: {} for: {}".format(cover_url, url))

def get_isbns(book):
    # gather every ISBN from the eISBN, PaperISBN and ClothISBN columns and
    # return them along with the first Edition already known for any of them
    isbns = []
    edition = None
    for code in ['eISBN', 'PaperISBN', 'ClothISBN']:
        if book[code] not in ('', 'N/A'):
            values = book[code].split(',')
            for value in values:
                isbn = ISBN(value).to_string()
                if isbn:
                    isbns.append(isbn)
    for isbn in isbns:
        if not edition:
            edition = Edition.get_by_isbn(isbn)
    return (isbns, edition)

def load_from_books(books):
    ''' books is an iterator of book dicts.
        each book must have attributes
        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
        SubjectListMARC, Book-level DOI, URL, License
    '''

    for book in books:
        (isbns, edition) = get_isbns(book)
        title = book['FullTitle']
        authors = get_authors(book)

        # make sure there is a Work and a primary Edition to attach everything to
        if not edition and len(isbns):
            work = Work(title=title)
            work.save()
            edition = Edition(title=title, work=work)
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
        work = edition.work
        Identifier.set(type='http', value=book['URL'], edition=edition, work=work)

        # create or fetch an Edition for every ISBN on the row, then fill in its metadata
        for isbn in isbns:
            edition = add_by_isbn_from_google(isbn)
            if not edition:
                edition = Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)

            edition.authors.clear()
            for (author, role) in authors:
                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
            edition.publication_date = book['CopyrightYear']
            edition.cover_image = get_cover(book)
            edition.set_publisher(book['Publisher'])
            edition.save()

        # keep whichever description is longer
        description = book['DescriptionBrief']
        if len(description) > len(work.description):
            work.description = description

        # attach the BISAC heading and all of its broader headings as subjects
        for bisacsh in get_subjects(book):
            while bisacsh:
                add_subject(bisacsh.full_label, work, authority="bisacsh")
                bisacsh = bisacsh.parent

        work.save()
        logger.info(u'loaded work {}'.format(work.title))
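
# A minimal usage sketch, assuming the module above lives at
# regluit/core/loaders/utils.py (the dotted path used by the management command
# below) and that a tab-delimited export of the spreadsheet is available;
# 'umich.tsv' is a hypothetical filename used only for illustration.
# From a Django shell:
#
#     import csv
#     from regluit.core.loaders.utils import UnicodeDictReader, load_from_books
#
#     books = UnicodeDictReader(open('umich.tsv', 'rU'), dialect=csv.excel_tab)
#     load_from_books(books)
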
@@ -0,0 +1,13 @@
import csv

from django.core.management.base import BaseCommand

from regluit.core.loaders.utils import UnicodeDictReader, load_from_books

class Command(BaseCommand):
    help = "load books based on a csv spreadsheet of onix data"
    args = "<filename>"

    def handle(self, filename, **options):
        # the spreadsheet is expected to be a tab-delimited (excel_tab) export
        sheetreader = UnicodeDictReader(open(filename, 'rU'), dialect=csv.excel_tab)
        load_from_books(sheetreader)
        print "finished loading"