code for loading umich spreadsheet

2016-05-19 09:17:23 -04:00 · 2016-05-19 09:17:23 -04:00 · cb3581e932
parent 605d971078
commit cb3581e932
3 changed files with 147 additions and 0 deletions
--- a/core/loaders/init.py
+++ b/core/loaders/init.py
--- a/core/loaders/utils.py
+++ b/core/loaders/utils.py
@ -0,0 +1,134 @@
+import csv
+import re
+import requests
+import logging
+
+from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
+from regluit.core.isbn import ISBN
+from regluit.core.bookloader import add_by_isbn_from_google
+from regluit.api.crosswalks import inv_relator_contrib
+from regluit.bisac.models import BisacHeading
+
+# module-level logger, named after this module, used by the loader helpers below
+logger = logging.getLogger(__name__)
+
+def UnicodeDictReader(utf8_data, **kwargs):
+    ''' Generator wrapping csv.DictReader.
+
+        utf8_data and kwargs are passed straight to csv.DictReader; each row is
+        yielded as a new dict with the same keys and every value decoded from
+        UTF-8 into a unicode object.
+        '''
+    for raw_row in csv.DictReader(utf8_data, **kwargs):
+        decoded_row = {}
+        for column, cell in raw_row.iteritems():
+            decoded_row[column] = unicode(cell, 'utf-8')
+        yield decoded_row
+
+def get_authors(book):
+    ''' Return a list of (name, role) tuples for the contributors of a book.
+
+        book is a dict of spreadsheet columns. The numbered columns
+        Author{1,2,3}First / Author{1,2,3}Last / Author{1,2,3}Role are read in
+        order, stopping at the first slot that has no name; a blank role
+        defaults to 'A01'. Any names past the third entry of AuthorsList
+        (split on ', ' and ' and ') are appended with role 'A01'.
+        '''
+    authors = []
+    # three numbered author slots; range(1, 3) would silently drop Author3
+    for i in range(1, 4):
+        first = book.get(u'Author{}First'.format(i), u'').strip()
+        last = book.get(u'Author{}Last'.format(i), u'').strip()
+        # keep everything unicode: mixing in an encoded byte string makes
+        # format() fail on non-ASCII names
+        authname = u'{} {}'.format(first, last).strip()
+        if not authname:
+            break
+        role = book.get(u'Author{}Role'.format(i), u'').strip() or 'A01'
+        authors.append((authname, role))
+    authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
+    # the first three names are covered by the numbered columns above
+    for authname in authlist[3:]:
+        authors.append((authname, 'A01'))
+    return authors
+
+def get_subjects(book):
+    ''' Return the BisacHeading objects matching the book's BISAC codes.
+
+        Reads the columns BISACCode1, BISACCode2 and BISACCode3 from the book
+        dict. Blank codes are skipped; a code with no matching BisacHeading is
+        logged as a warning and left out of the result.
+        '''
+    subjects = []
+    # three BISAC columns; range(1, 3) would never look at BISACCode3
+    for i in range(1, 4):
+        code = book.get(u'BISACCode{}'.format(i), '')
+        if not code:
+            continue
+        try:
+            subjects.append(BisacHeading.objects.get(notation=code))
+        except BisacHeading.DoesNotExist:
+            logger.warning("Please add BISAC {}".format(code))
+    return subjects
+
+def add_subject(subject_name, work, authority=''):
+    ''' Attach the Subject named subject_name to work.
+
+        The Subject is looked up by name; if none exists, one is created with
+        the given authority. An existing Subject keeps its own authority.
+        '''
+    # get_or_create replaces the manual get / except DoesNotExist / create
+    subject, _created = Subject.objects.get_or_create(
+        name=subject_name, defaults={'authority': authority})
+    subject.works.add(work)
+
+def get_cover(book):
+    ''' Return a cover image URL for the book, or None if there isn't one.
+
+        Only URLs containing "10.3998" are handled. The collection name and
+        book id are parsed out of book['URL'] and turned into a
+        quod.lib.umich.edu image URL, which is checked with a HEAD request.
+        The URL is returned when the response status is below 400; otherwise,
+        or when the request itself fails, a warning is logged and None is
+        returned.
+        '''
+    url = book['URL']
+    if "10.3998" not in url:
+        return None
+    # code for umich books; can generalize, of course!
+    idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
+    if not idmatch:
+        return None
+    collection, book_id = idmatch.groups()
+    if collection == 'ohp':
+        cover_url = "http://quod.lib.umich.edu/o/ohp/images/{}.jpg".format(book_id)
+    elif collection == 'ump':
+        cover_url = "http://quod.lib.umich.edu/u/ump/images/{}.jpg".format(book_id)
+    else:
+        cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
+    try:
+        # a timeout keeps one unresponsive server from hanging the whole load
+        cover = requests.head(cover_url, timeout=30)
+    except requests.RequestException:
+        # a network failure is treated like a missing cover, not a fatal error
+        logger.warning("bad cover: {} for: {}".format(cover_url, url))
+        return None
+    if cover.status_code < 400:
+        return cover_url
+    logger.warning("bad cover: {} for: {}".format(cover_url, url))
+    return None
+            
+def get_isbns(book):
+    ''' Collect the book's ISBNs and look for an existing edition.
+
+        Reads the eISBN, PaperISBN and ClothISBN columns (each may hold several
+        comma-separated values; '' and 'N/A' are ignored), passes every value
+        through ISBN(...).to_string() and keeps the non-empty results.
+        Returns (isbns, edition), where edition is the first truthy result of
+        Edition.get_by_isbn over those ISBNs, in order.
+        '''
+    found = []
+    for column in ['eISBN', 'PaperISBN', 'ClothISBN']:
+        raw = book[column]
+        if raw in ('', 'N/A'):
+            continue
+        for piece in raw.split(','):
+            normalized = ISBN(piece).to_string()
+            if normalized:
+                found.append(normalized)
+    existing = None
+    for candidate in found:
+        existing = Edition.get_by_isbn(candidate)
+        if existing:
+            # stop at the first ISBN that already has an edition
+            break
+    return (found, existing)
+
+
+def load_from_books(books):
+    ''' Create or update works and editions from an iterator of book dicts.
+
+        each book must have attributes
+        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
+        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
+        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
+        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
+        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
+        SubjectListMARC, Book-level DOI, URL, License
+
+        For each book: an edition is found by ISBN or a new work and edition
+        are created; every ISBN gets an edition carrying the authors,
+        publication date, cover and publisher; the work description is replaced
+        when DescriptionBrief is longer; BISAC headings and their parents are
+        added as subjects. Books without any usable ISBN are skipped with a
+        warning.
+        '''
+
+    for book in books:
+        (isbns, edition) = get_isbns(book)
+        title = book['FullTitle']
+        if not isbns:
+            # without an ISBN no edition can be found or created; the
+            # edition.work lookup below would fail on None
+            logger.warning(u'no usable ISBN for {}; skipping'.format(title))
+            continue
+        authors = get_authors(book)
+        if not edition:
+            work = Work(title=title)
+            work.save()
+            edition = Edition(title=title, work=work)
+            edition.save()
+            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
+        work = edition.work
+        Identifier.set(type='http', value=book['URL'], edition=edition, work=work)
+        # the cover depends only on the book, so check it once rather than
+        # issuing one HEAD request per ISBN
+        cover_url = get_cover(book)
+        for isbn in isbns:
+            edition = add_by_isbn_from_google(isbn)
+            if not edition:
+                edition = Edition(title=title, work=work)
+                edition.save()
+                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
+            # the spreadsheet is authoritative for contributors
+            edition.authors.clear()
+            for (author, role) in authors:
+                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
+            edition.publication_date = book['CopyrightYear']
+            edition.cover_image = cover_url
+            edition.set_publisher(book['Publisher'])
+            edition.save()
+        description = book['DescriptionBrief']
+        if len(description) > len(work.description or u''):
+            work.description = description
+            # persist the new description; without save() it is lost
+            work.save()
+        for bisacsh in get_subjects(book):
+            # add the heading and each of its ancestors as subjects
+            while bisacsh:
+                add_subject(bisacsh.full_label, work, authority="bisacsh")
+                bisacsh = bisacsh.parent
+        logger.info(u'loaded work {}'.format(work.title))
--- a/core/management/commands/load_books_from_onix_csv.py
+++ b/core/management/commands/load_books_from_onix_csv.py
@ -0,0 +1,13 @@
+import csv
+from django.core.management.base import BaseCommand
+
+from regluit.core.loaders.utils import UnicodeDictReader, load_from_books
+
+class Command(BaseCommand):
+    ''' Management command that loads books from a tab-delimited spreadsheet. '''
+    help = "load books based on a csv spreadsheet of onix data"
+    args = "<filename>"
+
+    def handle(self, filename, **options):
+        ''' Read filename as a tab-delimited sheet and load every row. '''
+        # the reader is a lazy generator, so loading must finish inside the
+        # with block, before the file is closed
+        with open(filename, 'rU') as sheet:
+            sheetreader = UnicodeDictReader(sheet, dialect=csv.excel_tab)
+            load_from_books(sheetreader)
+        print "finished loading"