pull/91/head
eric 2018-05-11 11:46:04 -04:00
parent a303c08333
commit 05fae60ddb
1 changed files with 113 additions and 94 deletions

View File

@ -1,7 +1,6 @@
import csv
import logging
import re
import sys
import time
import unicodedata
import urlparse
@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -50,78 +49,77 @@ def get_soup(url):
return None
def get_authors(book):
    """Return the contributors of a metadata row as ``(name, role)`` tuples.

    ``book`` is a dict-like row (presumably one spreadsheet/CSV record --
    confirm against the caller).  Two column dialects are handled, selected
    by whether the row has a non-empty ``AuthorsList`` value:

    * UMich: ``Author{n}First`` / ``Author{n}Last`` / ``Author{n}Role`` for
      n = 1..2, followed by every name past the third entry of
      ``AuthorsList`` (split on ``', '`` and ``' and '``).
    * OBP: ``Contributor {n} first name`` / ``Contributor {n} surname`` /
      ``ONIX Role Code (List 17){n}`` for n = 1..5.

    A blank role falls back to ``'A01'``.  Reading the numbered columns
    stops at the first slot whose first and last names are both empty.

    Raises:
        KeyError: in the UMich dialect, when an ``Author{n}...`` column that
            is consulted is missing from the row.
    """
    authors = []
    if book.get('AuthorsList', ''):
        #UMich
        for i in range(1, 3):
            fname = u'Author{}First'.format(i)
            lname = u'Author{}Last'.format(i)
            role_key = u'Author{}Role'.format(i)
            authname = u'{} {}'.format(book[fname], book[lname])
            if authname == u' ':
                # both name parts empty: no more numbered authors
                break
            role = book[role_key] if book[role_key].strip() else 'A01'
            authors.append((authname, role))
        authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
        # NOTE(review): the numbered columns cover authors 1-2 while this
        # slice starts at the 4th list entry, so a 3rd listed author is never
        # added -- confirm whether that is intended before changing it.
        authors.extend((authname, 'A01') for authname in authlist[3:])
    else:
        #OBP
        for i in range(1, 6):
            fname = book.get(u'Contributor {} first name'.format(i), '')
            lname = book.get(u'Contributor {} surname'.format(i), '')
            role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
            authname = u'{} {}'.format(fname, lname)
            if authname == u' ':
                # both name parts empty: no more contributors
                break
            authors.append((authname, role if role.strip() else 'A01'))
    return authors
def get_subjects(book):
    """Return the ``BisacHeading`` objects named by a row's BISAC columns.

    Slots 1..4 are checked.  For each slot the UMich column
    (``BISACCode{n}``) is preferred and the OBP column
    (``BISAC subject code {n}``) is used when the former is empty or absent.
    Codes with no matching ``BisacHeading.notation`` are skipped after a
    warning is logged.
    """
    subjects = []
    for i in range(1, 5):
        key = u'BISACCode{}'.format(i) #UMich dialect
        key2 = u'BISAC subject code {}'.format(i) #OBP dialect
        code = book.get(key, '') or book.get(key2, '')
        if code != '':
            try:
                subjects.append(BisacHeading.objects.get(notation=code))
            except BisacHeading.DoesNotExist:
                # let the logging framework do the interpolation lazily
                logger.warning("Please add BISAC %s", code)
    return subjects
def add_subject(subject_name, work, authority=''):
    """Attach the subject called ``subject_name`` to ``work``.

    The ``Subject`` is looked up by name; when none exists a new one is
    created with the given ``authority``.  ``authority`` is only applied on
    creation -- an existing subject keeps whatever authority it already has.
    """
    subject, _created = Subject.objects.get_or_create(
        name=subject_name,
        defaults={'authority': authority},
    )
    subject.works.add(work)
def get_title(book):
    """Return the display title for a metadata row.

    A non-empty ``FullTitle`` (UMich dialect) is returned as is.  Otherwise
    the OBP columns are used: ``Title`` alone, or ``Title: Subtitle`` when a
    subtitle is present.  Returns ``''`` when none of the columns has a value.
    """
    title = book.get('FullTitle', '') #UMICH
    if title:
        return title
    title = book.get('Title', '') #OBP
    sub = book.get('Subtitle', '')
    if sub:
        return u'{}: {}'.format(title, sub)
    return title
def get_cover(book):
cover_url = book.get('Cover URL','') #OBP
cover_url = book.get('Cover URL', '') #OBP
if cover_url:
return cover_url
url = book['URL']
if "10.3998" in url:
# code for umich books; can generalize, of course!
idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
if idmatch:
book_id = idmatch.group(2)
if idmatch.group(1) == 'ohp':
@ -131,60 +129,64 @@ def get_cover(book):
else:
cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
cover = requests.head(cover_url)
if cover.status_code<400:
if cover.status_code < 400:
return cover_url
else:
logger.warning( "bad cover: {} for: {}".format(cover_url, url))
logger.warning("bad cover: {} for: {}".format(cover_url, url))
def get_isbns(book):
    """Collect the ISBNs of a metadata row and look for a matching Edition.

    Each ISBN column is read in the priority order listed below; a cell may
    hold several comma-separated values.  Cells that are missing, empty,
    ``None`` or ``'N/A'`` are ignored, as are values for which
    ``ISBN(value).to_string()`` returns something falsy.

    Returns:
        tuple: ``(isbns, edition)`` -- the list of ISBN strings in column
        order, and the first truthy result of ``Edition.get_by_isbn`` over
        that list (``None`` when the list is empty).
    """
    isbns = []
    #'ISBN 1' is OBP, others are UMICH
    isbn_columns = [
        'eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
        'ISBN 1', 'ISBN 4', 'ISBN 5',
    ]
    for code in isbn_columns:
        # `or ''` guards against a cell that is present but None
        raw = book.get(code) or ''
        if raw in ('', 'N/A'):
            continue
        for value in raw.split(','):
            isbn = ISBN(value).to_string()
            if isbn:
                isbns.append(isbn)

    edition = None
    for isbn in isbns:
        edition = Edition.get_by_isbn(isbn)
        if edition:
            # first ISBN already known to the database wins
            break
    return (isbns, edition)
def get_pubdate(book):
    """Return the publication date string for a metadata row.

    A non-empty ``CopyrightYear`` (UMich dialect) is returned unchanged.
    Otherwise the OBP columns ``publication year`` / ``publication month`` /
    ``publication day`` are joined with ``-`` at the finest precision
    available: ``year-month-day``, ``year-month`` or just ``year``.

    A day is only used when a month is also present, so a row with a day but
    no month yields the bare year rather than a malformed ``year--day``.
    """
    value = book.get('CopyrightYear', '') #UMICH
    if value:
        return value
    value = book.get('publication year', '') #OBP
    sub = book.get('publication month', '')
    sub2 = book.get('publication day', '')
    if sub and sub2:
        return u'{}-{}-{}'.format(value, sub, sub2)
    if sub:
        return u'{}-{}'.format(value, sub)
    return value
def get_publisher(book):
    """Return the publisher name for a metadata row, or ``None`` if unknown.

    A non-empty ``Publisher`` column wins.  Failing that, rows whose
    ``DOI prefix`` is ``10.11647`` are attributed to Open Book Publishers.
    """
    value = book.get('Publisher', '')
    if value:
        return value
    if book.get('DOI prefix', '') == '10.11647':
        return "Open Book Publishers"
    return None
def get_url(book):
    """Return the landing URL for a metadata row.

    A non-empty ``URL`` column is returned as is.  Otherwise a
    ``https://doi.org/<prefix>/<suffix>`` URL is built from the
    ``DOI prefix`` and ``DOI suffix`` columns; missing parts are left empty.
    """
    url = book.get('URL', '')
    if url:
        return url
    return u'https://doi.org/{}/{}'.format(
        book.get('DOI prefix', ''),
        book.get('DOI suffix', ''),
    )
def get_description(book):
    """Return the description text for a metadata row.

    ``DescriptionBrief`` is preferred; ``Plain Text Blurb`` is used when the
    former is empty or absent.  Returns ``''`` when neither has a value.
    """
    return book.get('DescriptionBrief', '') or book.get('Plain Text Blurb', '')
def get_language(book):
    """Return the row's ``ISO Language Code`` value, or ``''`` when absent."""
    return book.get('ISO Language Code', '')
@ -209,9 +211,9 @@ def load_from_books(books):
# try first to get an Edition already in DB with by one of the ISBNs in book
(isbns, edition) = get_isbns(book)
if len(isbns)==0:
if not isbns:
continue
title=get_title(book)
title = get_title(book)
authors = get_authors(book)
# if matching by ISBN doesn't work, then create a Work and Edition
@ -219,11 +221,11 @@ def load_from_books(books):
if not edition:
work = Work(title=title)
work.save()
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
work=edition.work
work = edition.work
# at this point, work and edition exist
url = get_url(book)
@ -237,7 +239,7 @@ def load_from_books(books):
if edition and edition.work != work:
work = merge_works(work, edition.work)
if not edition:
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
@ -251,12 +253,12 @@ def load_from_books(books):
# possibly replace work.description
description = get_description(book)
if len(description)>len (work.description):
if len(description) > len(work.description):
work.description = description
work.save()
# set language
lang= get_language(book)
lang = get_language(book)
if lang:
work.language = lang
work.save()
@ -273,9 +275,9 @@ def load_from_books(books):
results.append((book, work, edition))
try:
logger.info (u"{} {} {}\n".format(i, title, loading_ok))
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
except Exception as e:
logger.info (u"{} {}\n".format(i, title, str(e) ))
logger.info(u"{} {} {}\n".format(i, title, str(e)))
return results
@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition):
try:
url_id = Identifier.objects.get(type='http', value=get_url(book))
if url_id is None:
logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
return False
except Exception as e:
logger.info (str(e))
logger.info(str(e))
return False
# isbns
@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
try:
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
except Exception as e:
print (e)
logger.info(e)
return False
# authors
# print set([ed.name for ed in edition_for_isbn.authors.all()])
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
print "problem with authors"
if (
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
):
logger.info("problem with authors")
return False
try:
@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition):
# work description
description = get_description(book)
if not ((work.description == description) or (len(description) <len (work.description))):
if not ((work.description == description) or (len(description) < len(work.description))):
return False
# bisac
@ -369,14 +373,18 @@ DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+
def dl_online(ebook):
if ebook.format != 'online':
return None, False
if ebook.url.find(u'dropbox.com/s/') >= 0:
pass
elif ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
else:
logger.warning('couldn\'t get {}'.format(ebook.url))
else:
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
@ -384,6 +392,12 @@ def dl_online(ebook):
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
else:
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
return None, False
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf, True
else:
logger.warning('download format for {} is not ebook'.format(url))
else:
logger.warning('couldn\'t get {}'.format(url))
return None, False
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return ('online')
return 'online'
if Ebook.objects.filter(url=url):
return Ebook.objects.filter(url=url)[0].format
ct = content_type if content_type else contenttyper.calc_type(url)