code for loading umich spreadsheet

pull/1/head
eric 2016-05-19 09:17:23 -04:00
parent 605d971078
commit cb3581e932
3 changed files with 147 additions and 0 deletions

0
core/loaders/__init__.py Executable file
View File

134
core/loaders/utils.py Normal file
View File

@ -0,0 +1,134 @@
import csv
import re
import requests
import logging
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
logger = logging.getLogger(__name__)
def _decode_cell(value, encoding='utf-8'):
    """Return one csv cell as text.

    None becomes an empty string, byte strings are decoded with *encoding*,
    lists are decoded element by element, and anything else is returned
    unchanged.
    """
    if value is None:
        # csv.DictReader fills cells missing from a short row with restval,
        # which defaults to None; treat them like empty cells.
        return u''
    if isinstance(value, bytes):
        return value.decode(encoding)
    if isinstance(value, list):
        # surplus cells of an over-long row are collected in a list (restkey)
        return [_decode_cell(item, encoding) for item in value]
    return value


def UnicodeDictReader(utf8_data, **kwargs):
    """Yield each row of a utf-8 encoded csv source as a dict of text values.

    utf8_data -- an iterable of lines, passed straight to csv.DictReader
    kwargs    -- forwarded unchanged to csv.DictReader (dialect, delimiter...)

    Rows shorter than the header yield empty strings for the missing cells
    instead of raising TypeError while decoding None.
    """
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        yield {key: _decode_cell(value) for key, value in row.items()}
def get_authors(book):
    """Return the contributors of *book* as a list of (name, role) tuples.

    The first three contributors are read from the AuthorNFirst / AuthorNLast /
    AuthorNRole columns (N = 1..3); reading stops at the first slot whose name
    is blank. A blank role defaults to 'A01'. Any names beyond the third are
    taken from the 'AuthorsList' column (names separated by ', ' or ' and ')
    and are given the role 'A01'.
    """
    authors = []
    # 1..3 inclusive: the AuthorsList fallback below skips exactly three names
    for i in range(1, 4):
        first = book.get(u'Author{}First'.format(i), u'')
        last = book.get(u'Author{}Last'.format(i), u'')
        # keep everything as text; mixing encoded bytes into a unicode format
        # fails on non-ASCII names
        authname = u'{} {}'.format(first, last).strip()
        if not authname:
            break
        role = book.get(u'Author{}Role'.format(i), u'').strip() or 'A01'
        authors.append((authname, role))
    authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
    authors.extend((authname, 'A01') for authname in authlist[3:])
    return authors
def get_subjects(book):
    """Return the BisacHeading objects named by the BISACCode1..3 columns.

    Blank or absent codes are skipped. A code with no matching BisacHeading
    is logged as a warning and left out of the result.
    """
    subjects = []
    # 1..3 inclusive, so that BISACCode3 is read as well
    for i in range(1, 4):
        code = book.get(u'BISACCode{}'.format(i), '')
        if not code:
            continue
        try:
            subjects.append(BisacHeading.objects.get(notation=code))
        except BisacHeading.DoesNotExist:
            logger.warning("Please add BISAC %s", code)
    return subjects
def add_subject(subject_name, work, authority=''):
    """Attach the subject called *subject_name* to *work*.

    An existing Subject with that name is reused; otherwise one is created
    with the given *authority*.
    """
    subjects = Subject.objects
    try:
        found = subjects.get(name=subject_name)
    except Subject.DoesNotExist:
        found = subjects.create(name=subject_name, authority=authority)
    found.works.add(work)
def get_cover(book, timeout=10):
    """Return the cover image URL for *book*, or None if none is available.

    Only URLs containing the DOI prefix "10.3998" are handled. The cover
    location is derived from the collection name and numeric id found in
    book['URL'], then checked with an HTTP HEAD request.

    timeout -- seconds to wait for the HEAD request, so an unresponsive
               server cannot stall the import.

    A network failure or an HTTP status of 400 or above is logged as a
    warning and yields None.
    """
    url = book['URL']
    if "10.3998" not in url:
        return None
    # code for umich books; can generalize, of course!
    idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
    if not idmatch:
        return None
    collection, book_id = idmatch.groups()
    if collection == 'ohp':
        path = 'o/ohp'
    elif collection == 'ump':
        path = 'u/ump'
    else:
        path = 'd/dculture'
    cover_url = "http://quod.lib.umich.edu/{}/images/{}.jpg".format(path, book_id)
    try:
        cover = requests.head(cover_url, timeout=timeout)
    except requests.RequestException as err:
        # one unreachable cover should not abort the load
        logger.warning("could not check cover: %s for: %s (%s)", cover_url, url, err)
        return None
    if cover.status_code < 400:
        return cover_url
    logger.warning("bad cover: %s for: %s", cover_url, url)
    return None
def get_isbns(book):
    """Collect the ISBNs of *book* and look up an existing edition for them.

    Reads the 'eISBN', 'PaperISBN' and 'ClothISBN' columns in that order; each
    may hold several comma-separated values, and '' or 'N/A' means no value.
    Returns (isbns, edition): the values for which ISBN(...).to_string() is
    truthy, and the result of the first Edition.get_by_isbn lookup that is
    truthy (a falsy lookup result, or None when there are no ISBNs, if no
    lookup succeeds).
    """
    found = []
    for column in ('eISBN', 'PaperISBN', 'ClothISBN'):
        cell = book[column]
        if cell in ('', 'N/A'):
            continue
        for candidate in cell.split(','):
            normalized = ISBN(candidate).to_string()
            if normalized:
                found.append(normalized)
    edition = None
    for isbn in found:
        edition = Edition.get_by_isbn(isbn)
        if edition:
            # stop at the first ISBN that already has an edition
            break
    return (found, edition)
def load_from_books(books):
    ''' books is an iterator of book dicts.
        each book must have attributes
        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
        SubjectListMARC, Book-level DOI, URL, License

        For every book, an existing edition is looked up by ISBN (or a new work
        and edition are created), the URL and ISBNs are recorded as identifiers,
        and authors, publication date, cover, publisher, description and BISAC
        subjects are updated. Books without any usable ISBN are skipped with a
        warning.
    '''
    for book in books:
        (isbns, edition) = get_isbns(book)
        title = book['FullTitle']
        authors = get_authors(book)
        if not edition:
            if not isbns:
                # nothing to find or create an edition with
                logger.warning(u'no usable ISBN for "%s"; skipping', title)
                continue
            work = Work(title=title)
            work.save()
            edition = Edition(title=title, work=work)
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
        work = edition.work
        Identifier.set(type='http', value=book['URL'], edition=edition, work=work)
        # the cover depends only on the book, so check it once, not once per ISBN
        cover_image = get_cover(book)
        for isbn in isbns:
            edition = add_by_isbn_from_google(isbn)
            if not edition:
                edition = Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
            # replace any existing authors with the ones from the spreadsheet
            edition.authors.clear()
            for (author, role) in authors:
                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
            edition.publication_date = book['CopyrightYear']
            edition.cover_image = cover_image
            edition.set_publisher(book['Publisher'])
            edition.save()
        # keep whichever description is longer
        description = book['DescriptionBrief']
        if len(description) > len(work.description or u''):
            work.description = description
            # persist the change; without this the new description is lost
            work.save()
        for bisacsh in get_subjects(book):
            # add the heading and each of its ancestors as subjects
            while bisacsh:
                add_subject(bisacsh.full_label, work, authority="bisacsh")
                bisacsh = bisacsh.parent
        logger.info(u'loaded work %s', work.title)

View File

@ -0,0 +1,13 @@
import csv
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import UnicodeDictReader, load_from_books
class Command(BaseCommand):
    """Management command that loads books from a tab-delimited spreadsheet."""
    help = "load books based on a csv spreadsheet of onix data"
    args = "<filename>"

    def handle(self, filename, **options):
        """Read *filename* as an excel-tab csv file and load every row."""
        # the reader is a lazy generator, so the file must stay open until
        # load_from_books has consumed it; 'with' then closes it
        with open(filename, 'rU') as sheet:
            sheetreader = UnicodeDictReader(sheet, dialect=csv.excel_tab)
            load_from_books(sheetreader)
        print("finished loading")