commit
71ba8dc9fa
|
@ -1,7 +1,6 @@
|
|||
import csv
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
import urlparse
|
||||
|
@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
|
|||
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
||||
from regluit.core.isbn import ISBN
|
||||
from regluit.core.models import (
|
||||
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
|
||||
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
|
||||
)
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
@ -31,7 +30,7 @@ def utf8_general_ci_norm(s):
|
|||
"""
|
||||
Normalize a la MySQL utf8_general_ci collation
|
||||
(As of 2016.05.24, we're using the utf8_general_ci collation for author names)
|
||||
|
||||
|
||||
https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459
|
||||
|
||||
* converts to Unicode normalization form D for canonical decomposition
|
||||
|
@ -50,78 +49,77 @@ def get_soup(url):
|
|||
return None
|
||||
|
||||
def get_authors(book):
    """Return the contributors of a book row as a list of (name, role) tuples.

    Two spreadsheet dialects are recognised:

    * UMich: rows with a non-empty 'AuthorsList'.  Up to three contributors
      are read from the 'Author{n}First' / 'Author{n}Last' / 'Author{n}Role'
      columns; any names beyond the third are taken from 'AuthorsList'
      itself (split on ', ' and ' and ') and given the default role.
    * OBP: all other rows.  Up to five contributors are read from the
      'Contributor {n} first name' / 'Contributor {n} surname' /
      'ONIX Role Code (List 17){n}' columns.

    In both dialects reading stops at the first contributor whose first and
    last names are both empty, and a blank role is replaced by 'A01'.

    :param book: dict-like row of spreadsheet columns.
    :return: list of (author name, role code) tuples, possibly empty.
    """
    authors = []
    if book.get('AuthorsList', ''):
        # UMich dialect.  The structured columns cover authors 1..3; the
        # loop must include 3 because the AuthorsList fallback below only
        # starts at the fourth name.
        for i in range(1, 4):
            first = book.get(u'Author{}First'.format(i), '')
            last = book.get(u'Author{}Last'.format(i), '')
            authname = u'{} {}'.format(first, last)
            if authname == u' ':
                # both name parts empty: no more structured authors
                break
            role = book.get(u'Author{}Role'.format(i), '')
            authors.append((authname, role if role.strip() else 'A01'))
        authlist = book['AuthorsList'].replace(' and ', ', ').split(', ')
        # names past the third have no structured columns, so no role info
        for authname in authlist[3:]:
            authors.append((authname, 'A01'))
    else:
        # OBP dialect
        for i in range(1, 6):
            first = book.get(u'Contributor {} first name'.format(i), '')
            last = book.get(u'Contributor {} surname'.format(i), '')
            role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
            authname = u'{} {}'.format(first, last)
            if authname == u' ':
                break
            authors.append((authname, role if role.strip() else 'A01'))
    return authors
|
||||
|
||||
def get_subjects(book):
    """Return the BisacHeading objects for the BISAC codes in a book row.

    For n in 1..4 the code is read from 'BISACCode{n}' (UMich dialect) or,
    when that is empty, from 'BISAC subject code {n}' (OBP dialect).  Codes
    with no matching BisacHeading are logged as warnings and skipped.

    :param book: dict-like row of spreadsheet columns.
    :return: list of BisacHeading instances, possibly empty.
    """
    subjects = []
    for i in range(1, 5):
        code = book.get(u'BISACCode{}'.format(i), '')  # UMich dialect
        if not code:
            code = book.get(u'BISAC subject code {}'.format(i), '')  # OBP dialect
        if code == '':
            continue
        try:
            subjects.append(BisacHeading.objects.get(notation=code))
        except BisacHeading.DoesNotExist:
            logger.warning("Please add BISAC {}".format(code))
    return subjects
|
||||
|
||||
def add_subject(subject_name, work, authority=''):
    """Attach the subject called ``subject_name`` to ``work``.

    An existing Subject with that name is reused; otherwise one is created
    with the given ``authority``.  The authority of an existing Subject is
    left as it is.

    :param subject_name: name of the subject to look up or create.
    :param work: the work to add to the subject's ``works``.
    :param authority: authority recorded only when the subject is created.
    """
    try:
        subject = Subject.objects.get(name=subject_name)
    except Subject.DoesNotExist:
        subject = Subject.objects.create(name=subject_name, authority=authority)
    subject.works.add(work)
|
||||
|
||||
def get_title(book):
    """Return the display title for a book row.

    UMich rows supply 'FullTitle', which is returned as-is when non-empty.
    Otherwise (OBP rows) the result is 'Title', followed by ': ' and
    'Subtitle' when a subtitle is present.

    :param book: dict-like row of spreadsheet columns.
    :return: the title string ('' when no title column is filled in).
    """
    full_title = book.get('FullTitle', '')  # UMich
    if full_title:
        return full_title
    title = book.get('Title', '')  # OBP
    subtitle = book.get('Subtitle', '')
    if subtitle:
        return u'{}: {}'.format(title, subtitle)
    return title
|
||||
|
||||
def get_cover(book):
|
||||
cover_url = book.get('Cover URL','') #OBP
|
||||
cover_url = book.get('Cover URL', '') #OBP
|
||||
if cover_url:
|
||||
return cover_url
|
||||
url = book['URL']
|
||||
if "10.3998" in url:
|
||||
# code for umich books; can generalize, of course!
|
||||
idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
|
||||
idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
|
||||
if idmatch:
|
||||
book_id = idmatch.group(2)
|
||||
if idmatch.group(1) == 'ohp':
|
||||
|
@ -131,74 +129,78 @@ def get_cover(book):
|
|||
else:
|
||||
cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
|
||||
cover = requests.head(cover_url)
|
||||
if cover.status_code<400:
|
||||
if cover.status_code < 400:
|
||||
return cover_url
|
||||
else:
|
||||
logger.warning( "bad cover: {} for: {}".format(cover_url, url))
|
||||
|
||||
logger.warning("bad cover: {} for: {}".format(cover_url, url))
|
||||
|
||||
def get_isbns(book):
    """Collect the ISBNs of a book row and look for a matching Edition.

    Each ISBN column that is present and neither '' nor 'N/A' is split on
    commas; every piece is passed through ``ISBN(...).to_string()`` and kept
    when the result is truthy.  The ISBNs are then tried in order with
    ``Edition.get_by_isbn`` until one yields an edition.

    :param book: dict-like row of spreadsheet columns.
    :return: tuple ``(isbns, edition)``; ``isbns`` is a list of ISBN strings
        and ``edition`` is the first edition found, or a falsy value
        (``None`` when there are no ISBNs) if none matched.
    """
    isbns = []
    # 'ISBN 1' is OBP, others are UMICH
    for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
                 'ISBN 1', 'ISBN 4', 'ISBN 5']:
        if book.get(code, '') not in ('', 'N/A'):
            for value in book[code].split(','):
                isbn = ISBN(value).to_string()
                if isbn:
                    isbns.append(isbn)
    edition = None
    for isbn in isbns:
        edition = Edition.get_by_isbn(isbn)
        if edition:
            # the first ISBN that resolves wins; no need to query the rest
            break
    return (isbns, edition)
|
||||
|
||||
def get_pubdate(book):
    """Return the publication date string for a book row.

    UMich rows supply 'CopyrightYear', returned as-is when non-empty.
    Otherwise (OBP rows) the date is built from 'publication year',
    'publication month' and 'publication day': 'Y-M-D' when a day is given,
    'Y-M' when only a month is given, and the bare year otherwise.

    :param book: dict-like row of spreadsheet columns.
    :return: the date string ('' when no date column is filled in).
    """
    year = book.get('CopyrightYear', '')  # UMich
    if year:
        return year
    year = book.get('publication year', '')  # OBP
    month = book.get('publication month', '')
    day = book.get('publication day', '')
    if day:
        return u'{}-{}-{}'.format(year, month, day)
    if month:
        return u'{}-{}'.format(year, month)
    return year
|
||||
|
||||
def get_publisher(book):
    """Return the publisher name for a book row.

    The 'Publisher' column is used when non-empty.  Rows without it whose
    'DOI prefix' is '10.11647' are attributed to "Open Book Publishers".

    :param book: dict-like row of spreadsheet columns.
    :return: the publisher name, or None when it cannot be determined.
    """
    publisher = book.get('Publisher', '')
    if publisher:
        return publisher
    if book.get('DOI prefix', '') == '10.11647':
        return "Open Book Publishers"
    return None
|
||||
|
||||
|
||||
def get_url(book):
    """Return the landing URL for a book row.

    The 'URL' column is used when non-empty; otherwise a doi.org URL is
    built from the 'DOI prefix' and 'DOI suffix' columns (each defaulting
    to '' when absent).

    :param book: dict-like row of spreadsheet columns.
    :return: the URL string.
    """
    url = book.get('URL', '')
    if url:
        return url
    return u'https://doi.org/{}/{}'.format(
        book.get('DOI prefix', ''),
        book.get('DOI suffix', '')
    )
|
||||
|
||||
def get_description(book):
    """Return the description text for a book row.

    'DescriptionBrief' is preferred; when it is empty or absent the
    'Plain Text Blurb' column is used instead.

    :param book: dict-like row of spreadsheet columns.
    :return: the description string ('' when neither column is filled in).
    """
    brief = book.get('DescriptionBrief', '')
    if brief:
        return brief
    return book.get('Plain Text Blurb', '')
|
||||
|
||||
def get_language(book):
    """Return the 'ISO Language Code' column of a book row, or '' if absent.

    :param book: dict-like row of spreadsheet columns.
    :return: the language code string.
    """
    return book.get('ISO Language Code', '')
|
||||
|
||||
|
||||
|
||||
def load_from_books(books):
|
||||
''' books is an iterator of book dicts.
|
||||
each book must have attributes
|
||||
(umich dialect)
|
||||
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
|
||||
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
|
||||
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
|
||||
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
|
||||
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
|
||||
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
|
||||
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
|
||||
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
|
||||
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
|
||||
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
|
||||
SubjectListMARC, , Book-level DOI, URL, License
|
||||
|
||||
|
||||
'''
|
||||
|
||||
# Goal: get or create an Edition and Work for each given book
|
||||
|
@ -209,21 +211,21 @@ def load_from_books(books):
|
|||
|
||||
# try first to get an Edition already in DB with by one of the ISBNs in book
|
||||
(isbns, edition) = get_isbns(book)
|
||||
if len(isbns)==0:
|
||||
if not isbns:
|
||||
continue
|
||||
title=get_title(book)
|
||||
title = get_title(book)
|
||||
authors = get_authors(book)
|
||||
|
||||
# if matching by ISBN doesn't work, then create a Work and Edition
|
||||
# if matching by ISBN doesn't work, then create a Work and Edition
|
||||
# with a title and the first ISBN
|
||||
if not edition:
|
||||
work = Work(title=title)
|
||||
work.save()
|
||||
edition= Edition(title=title, work=work)
|
||||
edition = Edition(title=title, work=work)
|
||||
edition.save()
|
||||
Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
|
||||
|
||||
work=edition.work
|
||||
work = edition.work
|
||||
|
||||
# at this point, work and edition exist
|
||||
url = get_url(book)
|
||||
|
@ -237,7 +239,7 @@ def load_from_books(books):
|
|||
if edition and edition.work != work:
|
||||
work = merge_works(work, edition.work)
|
||||
if not edition:
|
||||
edition= Edition(title=title, work=work)
|
||||
edition = Edition(title=title, work=work)
|
||||
edition.save()
|
||||
Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
|
||||
|
||||
|
@ -249,18 +251,18 @@ def load_from_books(books):
|
|||
edition.save()
|
||||
edition.set_publisher(get_publisher(book))
|
||||
|
||||
# possibly replace work.description
|
||||
# possibly replace work.description
|
||||
description = get_description(book)
|
||||
if len(description)>len (work.description):
|
||||
if len(description) > len(work.description):
|
||||
work.description = description
|
||||
work.save()
|
||||
|
||||
|
||||
# set language
|
||||
lang= get_language(book)
|
||||
lang = get_language(book)
|
||||
if lang:
|
||||
work.language = lang
|
||||
work.save()
|
||||
|
||||
|
||||
# add a bisac subject (and ancestors) to work
|
||||
for bisacsh in get_subjects(book):
|
||||
while bisacsh:
|
||||
|
@ -273,13 +275,13 @@ def load_from_books(books):
|
|||
results.append((book, work, edition))
|
||||
|
||||
try:
|
||||
logger.info (u"{} {} {}\n".format(i, title, loading_ok))
|
||||
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
|
||||
except Exception as e:
|
||||
logger.info (u"{} {}\n".format(i, title, str(e) ))
|
||||
logger.info(u"{} {} {}\n".format(i, title, str(e)))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
|
||||
def loaded_book_ok(book, work, edition):
|
||||
|
||||
isbns = get_isbns(book)[0]
|
||||
|
@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition):
|
|||
try:
|
||||
url_id = Identifier.objects.get(type='http', value=get_url(book))
|
||||
if url_id is None:
|
||||
logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
|
||||
logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.info (str(e))
|
||||
logger.info(str(e))
|
||||
return False
|
||||
|
||||
# isbns
|
||||
|
@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
|
|||
try:
|
||||
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
|
||||
except Exception as e:
|
||||
print (e)
|
||||
logger.info(e)
|
||||
return False
|
||||
|
||||
# authors
|
||||
# print set([ed.name for ed in edition_for_isbn.authors.all()])
|
||||
|
||||
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
|
||||
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
|
||||
print "problem with authors"
|
||||
if (
|
||||
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
|
||||
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
|
||||
):
|
||||
logger.info("problem with authors")
|
||||
return False
|
||||
|
||||
try:
|
||||
|
@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition):
|
|||
|
||||
# work description
|
||||
description = get_description(book)
|
||||
if not ((work.description == description) or (len(description) <len (work.description))):
|
||||
if not ((work.description == description) or (len(description) < len(work.description))):
|
||||
return False
|
||||
|
||||
# bisac
|
||||
|
@ -364,19 +368,23 @@ def ids_from_urls(url):
|
|||
if id_match:
|
||||
ids[ident] = id_match.group('id')
|
||||
return ids
|
||||
|
||||
|
||||
# Matches a double-quoted dl.dropboxusercontent.com content link; group 1 is
# the URL without the quotes.  The dots in the hostname are escaped so that
# they match only literal dots rather than any character.
DROPBOX_DL = re.compile(r'"(https://dl\.dropboxusercontent\.com/content_link/[^"]+)"')
|
||||
|
||||
def dl_online(ebook):
|
||||
if ebook.format != 'online':
|
||||
return None, False
|
||||
|
||||
if ebook.url.find(u'dropbox.com/s/') >= 0:
|
||||
pass
|
||||
elif ebook.url.find(u'dropbox.com/s/') >= 0:
|
||||
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
|
||||
if response.status_code == 200:
|
||||
match_dl = DROPBOX_DL.search(response.content)
|
||||
if match_dl:
|
||||
return make_dl_ebook(match_dl.group(1), ebook)
|
||||
else:
|
||||
logger.warning('couldn\'t get {}'.format(ebook.url))
|
||||
else:
|
||||
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
|
||||
|
||||
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
|
||||
doc = get_soup(ebook.url)
|
||||
if doc:
|
||||
|
@ -384,7 +392,13 @@ def dl_online(ebook):
|
|||
if obj:
|
||||
dl_url = urlparse.urljoin(ebook.url, obj['href'])
|
||||
return make_dl_ebook(dl_url, ebook)
|
||||
|
||||
else:
|
||||
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
|
||||
else:
|
||||
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
|
||||
|
||||
return None, False
|
||||
|
||||
def make_dl_ebook(url, ebook):
|
||||
if EbookFile.objects.filter(source=ebook.url):
|
||||
return EbookFile.objects.filter(source=ebook.url)[0], False
|
||||
|
@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
|
|||
new_ebf.ebook = new_ebook
|
||||
new_ebf.save()
|
||||
return new_ebf, True
|
||||
else:
|
||||
logger.warning('download format for {} is not ebook'.format(url))
|
||||
else:
|
||||
logger.warning('couldn\'t get {}'.format(url))
|
||||
return None, False
|
||||
|
||||
def type_for_url(url, content_type=None):
|
||||
if not url:
|
||||
return ''
|
||||
if url.find('books.openedition.org') >= 0:
|
||||
return ('online')
|
||||
return 'online'
|
||||
if Ebook.objects.filter(url=url):
|
||||
return Ebook.objects.filter(url=url)[0].format
|
||||
ct = content_type if content_type else contenttyper.calc_type(url)
|
||||
|
@ -440,7 +459,7 @@ def type_for_url(url, content_type=None):
|
|||
elif re.search("mobi", ct):
|
||||
return "mobi"
|
||||
return "other"
|
||||
|
||||
|
||||
class ContentTyper(object):
|
||||
""" """
|
||||
def __init__(self):
|
||||
|
|
Loading…
Reference in New Issue