pull/91/head
eric 2018-05-11 11:46:04 -04:00
parent a303c08333
commit 05fae60ddb
1 changed files with 113 additions and 94 deletions

View File

@ -1,7 +1,6 @@
import csv
import logging
import re
import sys
import time
import unicodedata
import urlparse
@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -31,7 +30,7 @@ def utf8_general_ci_norm(s):
"""
Normalize a la MySQL utf8_general_ci collation
(As of 2016.05.24, we're using the utf8_general_ci collation for author names)
https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459
* converts to Unicode normalization form D for canonical decomposition
@ -50,78 +49,77 @@ def get_soup(url):
return None
def get_authors(book):
    """
    Return the contributors of a book record as a list of (name, role) tuples.

    Two spreadsheet dialects are recognised:

    * UMich -- chosen when 'AuthorsList' is non-empty. The first three
      contributors are read from the Author{N}First / Author{N}Last /
      Author{N}Role columns; any names beyond the third are taken from
      'AuthorsList' (split on ', ' and ' and ') and given role 'A01'.
    * OBP -- up to five contributors read from 'Contributor {N} first name',
      'Contributor {N} surname' and 'ONIX Role Code (List 17){N}'.

    name is u'<first> <last>'; a blank role defaults to 'A01'. Reading of the
    numbered columns stops at the first slot whose first and last name are
    both empty.
    """
    def cell(key):
        # Missing columns and None cells are treated as empty strings.
        # NOTE(review): None cells presumably come from csv.DictReader padding
        # short rows -- confirm against the caller.
        return book.get(key, '') or u''

    authors = []
    if book.get('AuthorsList', ''):
        #UMich
        # Slots 1-3 are structured columns. This used to be range(1, 3), which
        # skipped Author3 even though the overflow below starts at authlist[3:],
        # so the third author was silently lost.
        for i in range(1, 4):
            authname = u'{} {}'.format(
                cell(u'Author{}First'.format(i)),
                cell(u'Author{}Last'.format(i)),
            )
            if authname == u' ':
                break
            role = cell(u'Author{}Role'.format(i))
            authors.append((authname, role if role.strip() else 'A01'))
        authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
        # anything past the three structured slots only exists in AuthorsList
        authors.extend((authname, 'A01') for authname in authlist[3:])
    else:
        #OBP
        for i in range(1, 6):
            authname = u'{} {}'.format(
                cell(u'Contributor {} first name'.format(i)),
                cell(u'Contributor {} surname'.format(i)),
            )
            if authname == u' ':
                break
            role = cell(u'ONIX Role Code (List 17){}'.format(i))
            authors.append((authname, role if role.strip() else 'A01'))
    return authors
def get_subjects(book):
    """
    Return the BisacHeading objects for the BISAC codes listed in a book record.

    Slots 1-4 are checked; for each slot the UMich column 'BISACCode{N}' is
    preferred and the OBP column 'BISAC subject code {N}' is the fallback.
    Codes with no matching BisacHeading are logged and left out.
    """
    headings = []
    for slot in range(1, 5):
        umich_key = u'BISACCode{}'.format(slot) #UMich dialect
        obp_key = u'BISAC subject code {}'.format(slot) #OBP dialect
        code = book.get(umich_key, '') or book.get(obp_key, '')
        if code == '':
            continue
        try:
            heading = BisacHeading.objects.get(notation=code)
        except BisacHeading.DoesNotExist:
            logger.warning("Please add BISAC {}".format(code))
        else:
            headings.append(heading)
    return headings
def add_subject(subject_name, work, authority=''):
    """
    Attach the Subject called subject_name to work.

    The Subject is looked up by name and created if it doesn't exist yet.
    authority is only applied when a new Subject is created; an existing
    Subject keeps whatever authority it already has.
    """
    # get_or_create is the idiomatic form of the previous get/except/create.
    # NOTE(review): it only protects against a concurrent duplicate insert if
    # the database enforces uniqueness on Subject.name -- confirm on the model.
    subject, _created = Subject.objects.get_or_create(
        name=subject_name,
        defaults={'authority': authority},
    )
    subject.works.add(work)
def get_title(book):
    """
    Return the display title of a book record.

    'FullTitle' (UMich) wins when non-empty; otherwise the title is built from
    'Title' and, if present, 'Subtitle' as u'<Title>: <Subtitle>' (OBP).
    """
    full_title = book.get('FullTitle', '') #UMICH
    if full_title:
        return full_title
    #OBP
    main = book.get('Title', '')
    subtitle = book.get('Subtitle', '')
    return u'{}: {}'.format(main, subtitle) if subtitle else main
def get_cover(book):
cover_url = book.get('Cover URL','') #OBP
cover_url = book.get('Cover URL', '') #OBP
if cover_url:
return cover_url
url = book['URL']
if "10.3998" in url:
# code for umich books; can generalize, of course!
idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
if idmatch:
book_id = idmatch.group(2)
if idmatch.group(1) == 'ohp':
@ -131,74 +129,78 @@ def get_cover(book):
else:
cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
cover = requests.head(cover_url)
if cover.status_code<400:
if cover.status_code < 400:
return cover_url
else:
logger.warning( "bad cover: {} for: {}".format(cover_url, url))
logger.warning("bad cover: {} for: {}".format(cover_url, url))
def get_isbns(book):
    """
    Collect the ISBNs of a book record and look for an Edition already using one.

    Returns a tuple (isbns, edition): isbns is the list of values produced by
    ISBN(...).to_string() for every comma-separated entry in the ISBN columns
    (entries for which to_string() is falsy are dropped); edition is the first
    truthy result of Edition.get_by_isbn over that list, in order.
    """
    #'ISBN 1' is OBP, others are UMICH
    columns = (
        'eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
        'ISBN 1', 'ISBN 4', 'ISBN 5',
    )
    found = []
    for column in columns:
        if book.get(column, '') in ('', 'N/A'):
            continue
        for raw in book[column].split(','):
            normalized = ISBN(raw).to_string()
            if normalized:
                found.append(normalized)

    match = None
    for candidate in found:
        match = Edition.get_by_isbn(candidate)
        if match:
            # first hit wins; later ISBNs are not queried
            break
    return (found, match)
def get_pubdate(book):
    """
    Return the publication date of a book record as a string.

    UMich records carry 'CopyrightYear', which is returned unchanged. OBP
    records carry 'publication year' / 'publication month' /
    'publication day'; these are joined with hyphens to the finest precision
    that is actually available: year, year-month or year-month-day.

    When the year is missing, whatever was found for it (normally '') is
    returned, and a day is ignored unless a month is present too.
    """
    value = book.get('CopyrightYear', '') #UMICH
    if value:
        return value
    year = book.get('publication year', '') #OBP
    month = book.get('publication month', '')
    day = book.get('publication day', '')
    if not year or not month:
        # Without a year there is no date; without a month the day can't be
        # placed. The previous code emitted '-MM' or 'YYYY--DD' in these cases.
        return year
    if not day:
        return u'{}-{}'.format(year, month)
    return u'{}-{}-{}'.format(year, month, day)
def get_publisher(book):
    """
    Return the publisher name for a book record.

    Uses the 'Publisher' column when it is non-empty. Otherwise a
    'DOI prefix' of '10.11647' maps to "Open Book Publishers"; any other
    record yields None.
    """
    name = book.get('Publisher', '')
    if name:
        return name
    is_obp = book.get('DOI prefix', '') == '10.11647'
    return "Open Book Publishers" if is_obp else None
def get_url(book):
    """
    Return the landing URL for a book record.

    The 'URL' column is used when non-empty; otherwise a doi.org link is
    assembled from 'DOI prefix' and 'DOI suffix' (each defaulting to '').
    """
    explicit = book.get('URL', '')
    if explicit:
        return explicit
    prefix = book.get('DOI prefix', '')
    suffix = book.get('DOI suffix', '')
    return u'https://doi.org/{}/{}'.format(prefix, suffix)
def get_description(book):
    """
    Return the description text of a book record.

    'DescriptionBrief' is used when non-empty, falling back to
    'Plain Text Blurb' (which itself defaults to '').
    """
    return book.get('DescriptionBrief', '') or book.get('Plain Text Blurb', '')
def get_language(book):
    """
    Return the value of the 'ISO Language Code' column of a book record,
    or '' when the column is absent.
    """
    return book.get('ISO Language Code', '')
def load_from_books(books):
''' books is an iterator of book dicts.
each book must have attributes
(umich dialect)
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
SubjectListMARC, , Book-level DOI, URL, License
'''
# Goal: get or create an Edition and Work for each given book
@ -209,21 +211,21 @@ def load_from_books(books):
# try first to get an Edition already in DB with by one of the ISBNs in book
(isbns, edition) = get_isbns(book)
if len(isbns)==0:
if not isbns:
continue
title=get_title(book)
title = get_title(book)
authors = get_authors(book)
# if matching by ISBN doesn't work, then create a Work and Edition
# if matching by ISBN doesn't work, then create a Work and Edition
# with a title and the first ISBN
if not edition:
work = Work(title=title)
work.save()
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
work=edition.work
work = edition.work
# at this point, work and edition exist
url = get_url(book)
@ -237,7 +239,7 @@ def load_from_books(books):
if edition and edition.work != work:
work = merge_works(work, edition.work)
if not edition:
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
@ -249,18 +251,18 @@ def load_from_books(books):
edition.save()
edition.set_publisher(get_publisher(book))
# possibly replace work.description
# possibly replace work.description
description = get_description(book)
if len(description)>len (work.description):
if len(description) > len(work.description):
work.description = description
work.save()
# set language
lang= get_language(book)
lang = get_language(book)
if lang:
work.language = lang
work.save()
# add a bisac subject (and ancestors) to work
for bisacsh in get_subjects(book):
while bisacsh:
@ -273,13 +275,13 @@ def load_from_books(books):
results.append((book, work, edition))
try:
logger.info (u"{} {} {}\n".format(i, title, loading_ok))
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
except Exception as e:
logger.info (u"{} {}\n".format(i, title, str(e) ))
logger.info(u"{} {} {}\n".format(i, title, str(e)))
return results
def loaded_book_ok(book, work, edition):
isbns = get_isbns(book)[0]
@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition):
try:
url_id = Identifier.objects.get(type='http', value=get_url(book))
if url_id is None:
logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
return False
except Exception as e:
logger.info (str(e))
logger.info(str(e))
return False
# isbns
@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
try:
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
except Exception as e:
print (e)
logger.info(e)
return False
# authors
# print set([ed.name for ed in edition_for_isbn.authors.all()])
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
print "problem with authors"
if (
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
):
logger.info("problem with authors")
return False
try:
@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition):
# work description
description = get_description(book)
if not ((work.description == description) or (len(description) <len (work.description))):
if not ((work.description == description) or (len(description) < len(work.description))):
return False
# bisac
@ -364,19 +368,23 @@ def ids_from_urls(url):
if id_match:
ids[ident] = id_match.group('id')
return ids
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
if ebook.format != 'online':
return None, False
if ebook.url.find(u'dropbox.com/s/') >= 0:
pass
elif ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
else:
logger.warning('couldn\'t get {}'.format(ebook.url))
else:
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
@ -384,7 +392,13 @@ def dl_online(ebook):
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
else:
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
return None, False
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
return EbookFile.objects.filter(source=ebook.url)[0], False
@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf, True
else:
logger.warning('download format for {} is not ebook'.format(url))
else:
logger.warning('couldn\'t get {}'.format(url))
return None, False
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return ('online')
return 'online'
if Ebook.objects.filter(url=url):
return Ebook.objects.filter(url=url)[0].format
ct = content_type if content_type else contenttyper.calc_type(url)
@ -440,7 +459,7 @@ def type_for_url(url, content_type=None):
elif re.search("mobi", ct):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):