regluit/core/loaders/utils.py

135 lines
5.2 KiB
Python
Raw Normal View History

2016-05-19 13:17:23 +00:00
import csv
import re
import requests
import logging
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
logger = logging.getLogger(__name__)
def UnicodeDictReader(utf8_data, **kwargs):
    '''Yield each CSV row of utf8_data as a dict whose values are text.

    utf8_data and any extra keyword arguments are handed straight to
    csv.DictReader. Byte-string values are decoded as UTF-8. Every other
    value is passed through unchanged: that covers the None that
    csv.DictReader fills in for a row shorter than the header, the list it
    stores under restkey for a row longer than the header, and values that
    are already text. Keys are left exactly as csv.DictReader produced them.
    '''
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        yield {
            # Only byte strings can (and need to) be decoded.
            key: value.decode('utf-8') if isinstance(value, bytes) else value
            for key, value in row.items()
        }
def get_authors(book):
    '''Return the contributors of book as a list of (name, role) tuples.

    The first three contributors are read from the numbered columns
    Author<N>First, Author<N>Last and Author<N>Role (N = 1..3). Reading the
    numbered columns stops at the first slot whose name is blank. A blank or
    missing role becomes 'A01'.

    Names beyond the third are taken from the AuthorsList column, whose
    entries are separated by ', ' or ' and '; they are given the role 'A01'.

    Missing columns are treated as blank.
    '''
    max_named = 3  # number of Author<N>* column groups
    default_role = 'A01'
    authors = []
    for i in range(1, max_named + 1):
        first = (book.get(u'Author{}First'.format(i)) or u'').strip()
        last = (book.get(u'Author{}Last'.format(i)) or u'').strip()
        # Join only the parts that are present, so that an author with a
        # single name does not pick up a leading or trailing space. The
        # values stay text: encoding them to bytes here would break
        # non-ASCII names when they are combined with the text separator.
        authname = u' '.join(part for part in (first, last) if part)
        if not authname:
            break
        role = (book.get(u'Author{}Role'.format(i)) or u'').strip()
        authors.append((authname, role or default_role))
    # The numbered columns cover at most max_named people; anyone after
    # that only appears in the free-text list.
    authlist = (book.get('AuthorsList') or u'').replace(' and ', ', ').split(', ')
    for authname in authlist[max_named:]:
        authors.append((authname, default_role))
    return authors
def get_subjects(book):
    '''Return the BisacHeading objects named by the BISAC columns of book.

    The columns BISACCode1, BISACCode2 and BISACCode3 are each looked up
    with BisacHeading.objects.get(notation=...). Blank or missing columns
    are skipped. A code with no matching heading is logged as a warning and
    skipped, so the returned list may be shorter than the number of codes.
    '''
    max_codes = 3  # number of BISACCode<N> columns
    subjects = []
    for i in range(1, max_codes + 1):
        code = (book.get(u'BISACCode{}'.format(i)) or u'').strip()
        if not code:
            continue
        try:
            heading = BisacHeading.objects.get(notation=code)
        except BisacHeading.DoesNotExist:
            logger.warning("Please add BISAC %s", code)
        else:
            subjects.append(heading)
    return subjects
def add_subject(subject_name, work, authority=''):
    '''Attach work to the Subject named subject_name.

    The Subject is looked up by name; if none exists, one is created with
    the given name and authority. The authority argument is only used when
    a new Subject is created.
    '''
    manager = Subject.objects
    try:
        target = manager.get(name=subject_name)
    except Subject.DoesNotExist:
        target = manager.create(name=subject_name, authority=authority)
    target.works.add(work)
def get_cover(book):
    '''Return a cover image URL for book, or None if none can be confirmed.

    Only URLs containing "10.3998" are handled. The collection name and the
    dotted numeric id are parsed from book['URL'] and used to build an image
    URL on quod.lib.umich.edu. That URL is checked with an HTTP HEAD request
    and returned only when the response status is below 400.

    None is returned when the URL is not handled, when the id cannot be
    parsed, when the request fails, or when the status is 400 or above. The
    last two cases are logged as warnings.
    '''
    timeout_seconds = 10  # keeps one unresponsive server from stalling a load
    url = book['URL']
    if "10.3998" not in url:
        return None
    # code for umich books; can generalize, of course!
    idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
    if not idmatch:
        return None
    # Collections with their own image directory; anything else falls back
    # to the dculture directory.
    collection_paths = {'ohp': 'o/ohp', 'ump': 'u/ump'}
    path = collection_paths.get(idmatch.group(1), 'd/dculture')
    book_id = idmatch.group(2)
    cover_url = "http://quod.lib.umich.edu/{}/images/{}.jpg".format(path, book_id)
    try:
        cover = requests.head(cover_url, timeout=timeout_seconds)
    except requests.RequestException as err:
        # A network failure means the cover is unconfirmed, not that the
        # whole load should abort.
        logger.warning("could not check cover: %s for: %s (%s)", cover_url, url, err)
        return None
    if cover.status_code < 400:
        return cover_url
    logger.warning("bad cover: %s for: %s", cover_url, url)
    return None
def get_isbns(book):
    '''Collect the ISBNs of book and look for an edition that has one.

    The eISBN, PaperISBN and ClothISBN columns are read in that order.
    Columns equal to '' or 'N/A' are skipped; the others are split on ','
    and each piece is passed through ISBN(...).to_string(). Pieces for
    which that yields a false value are dropped.

    Returns a tuple (isbns, edition). edition is the first true result of
    Edition.get_by_isbn over the collected ISBNs; if no lookup gives a true
    result it is the last lookup result, or None when no ISBN was collected.
    '''
    found = []
    for column in ('eISBN', 'PaperISBN', 'ClothISBN'):
        if book[column] in ('', 'N/A'):
            continue
        for piece in book[column].split(','):
            normalized = ISBN(piece).to_string()
            if normalized:
                found.append(normalized)
    edition = None
    for candidate in found:
        edition = Edition.get_by_isbn(candidate)
        if edition:
            break
    return (found, edition)
def load_from_books(books):
    '''Create or update works and editions from an iterable of book dicts.

    books is an iterator of book dicts.
    each book must have attributes
    eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
    Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
    Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
    DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
    eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
    SubjectListMARC, Book-level DOI, URL, License

    For each book:
    - get_isbns supplies the ISBNs and any edition already known for them;
      when there is none, a new Work and Edition are created from FullTitle.
    - A book with no usable ISBN and no edition is logged and skipped.
    - URL is recorded as an 'http' identifier.
    - For every ISBN, the edition comes from add_by_isbn_from_google, or is
      newly created when that returns nothing. Its authors, publication
      date, cover and publisher are then set from the book.
    - DescriptionBrief replaces the work description when it is longer.
    - Each BISAC heading and all of its parents are added as subjects.
    '''
    for book in books:
        (isbns, edition) = get_isbns(book)
        title = book['FullTitle']
        if not edition and not isbns:
            # Nothing to attach the metadata to: without this guard the
            # edition.work access below fails on None.
            logger.warning(u'no usable ISBN for %s; skipping', title)
            continue
        authors = get_authors(book)
        if not edition:
            work = Work(title=title)
            work.save()
            edition = Edition(title=title, work=work)
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
        work = edition.work
        Identifier.set(type='http', value=book['URL'], edition=edition, work=work)
        # The cover depends only on the book, so check it once rather than
        # issuing the same HTTP request for every ISBN.
        cover_url = get_cover(book)
        for isbn in isbns:
            edition = add_by_isbn_from_google(isbn)
            if not edition:
                edition = Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
            # Replace whatever authors the edition had with the book's list.
            edition.authors.clear()
            for (author, role) in authors:
                edition.add_author(author, inv_relator_contrib.get(role, 'aut'))
            edition.publication_date = book['CopyrightYear']
            if cover_url:
                # Keep any existing cover when none could be confirmed.
                edition.cover_image = cover_url
            edition.set_publisher(book['Publisher'])
            edition.save()
        description = book['DescriptionBrief'] or u''
        if len(description) > len(work.description or u''):
            work.description = description
            # Persist the change; nothing later in this function saves the work.
            work.save()
        for bisacsh in get_subjects(book):
            # Walk up the heading hierarchy so parent headings are added too.
            while bisacsh:
                add_subject(bisacsh.full_label, work, authority="bisacsh")
                bisacsh = bisacsh.parent
        logger.info(u'loaded work %s', work.title)