diff --git a/core/loaders/utils.py b/core/loaders/utils.py index 7f54c77c..f559870d 100644 --- a/core/loaders/utils.py +++ b/core/loaders/utils.py @@ -1,7 +1,6 @@ import csv import logging import re -import sys import time import unicodedata import urlparse @@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading from regluit.core.bookloader import add_by_isbn_from_google, merge_works from regluit.core.isbn import ISBN from regluit.core.models import ( - Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work, + Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work, ) logger = logging.getLogger(__name__) @@ -31,7 +30,7 @@ def utf8_general_ci_norm(s): """ Normalize a la MySQL utf8_general_ci collation (As of 2016.05.24, we're using the utf8_general_ci collation for author names) - + https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459 * converts to Unicode normalization form D for canonical decomposition @@ -50,78 +49,77 @@ def get_soup(url): return None def get_authors(book): - authors=[] - if book.get('AuthorsList',''): + authors = [] + if book.get('AuthorsList', ''): #UMich - for i in range(1,3): - fname=u'Author{}First'.format(i) - lname=u'Author{}Last'.format(i) - role=u'Author{}Role'.format(i) - authname = u'{} {}'.format(book[fname],book[lname]) + for i in range(1, 3): + fname = u'Author{}First'.format(i) + lname = u'Author{}Last'.format(i) + role = u'Author{}Role'.format(i) + authname = u'{} {}'.format(book[fname], book[lname]) if authname != u' ': role = book[role] if book[role].strip() else 'A01' - authors.append((authname,role)) + authors.append((authname, role)) else: break authlist = book["AuthorsList"].replace(' and ', ', ').split(', ') - if len(authlist)>3: + if len(authlist) > 3: for authname in authlist[3:]: authors.append((authname, 'A01')) else: #OBP - for i in range(1,6): - fname= book.get(u'Contributor {} first name'.format(i), '') - lname= book.get(u'Contributor {} surname'.format(i), '') - role= book.get(u'ONIX Role Code (List 17){}'.format(i), '') - authname = u'{} {}'.format(fname,lname) + for i in range(1, 6): + fname = book.get(u'Contributor {} first name'.format(i), '') + lname = book.get(u'Contributor {} surname'.format(i), '') + role = book.get(u'ONIX Role Code (List 17){}'.format(i), '') + authname = u'{} {}'.format(fname, lname) if authname != u' ': role = role if role.strip() else 'A01' - authors.append((authname,role)) + authors.append((authname, role)) else: break return authors def get_subjects(book): - subjects=[] - for i in range(1,5): + subjects = [] + for i in range(1, 5): key = u'BISACCode{}'.format(i) #UMich dialect key2 = u'BISAC subject code {}'.format(i) #OBP dialect - code = book.get(key,'') - code = code if code else book.get(key2,'') + code = book.get(key, '') + code = code if code else book.get(key2, '') if code != '': try: - bisac=BisacHeading.objects.get(notation=code) + bisac = BisacHeading.objects.get(notation=code) subjects.append(bisac) except BisacHeading.DoesNotExist: - logger.warning( "Please add BISAC {}".format(code)) + logger.warning("Please add BISAC {}".format(code)) return subjects def add_subject(subject_name, work, authority=''): try: - subject= Subject.objects.get(name=subject_name) + subject = Subject.objects.get(name=subject_name) except Subject.DoesNotExist: - subject=Subject.objects.create(name=subject_name, authority=authority) + subject = Subject.objects.create(name=subject_name, authority=authority) subject.works.add(work) def get_title(book): - title = book.get('FullTitle','') #UMICH + title = book.get('FullTitle', '') #UMICH if title: return title - title = book.get('Title','') #OBP - sub = book.get('Subtitle','') + title = book.get('Title', '') #OBP + sub = book.get('Subtitle', '') if sub: - return u'{}: {}'.format(title,sub) - else: - return title - + return u'{}: {}'.format(title, sub) + return title + def get_cover(book): - cover_url = book.get('Cover URL','') #OBP + cover_url = book.get('Cover URL', '') #OBP if cover_url: return cover_url url = book['URL'] if "10.3998" in url: # code for umich books; can generalize, of course! - idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url) + idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url) if idmatch: book_id = idmatch.group(2) if idmatch.group(1) == 'ohp': @@ -131,74 +129,78 @@ def get_cover(book): else: cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id) cover = requests.head(cover_url) - if cover.status_code<400: + if cover.status_code < 400: return cover_url else: - logger.warning( "bad cover: {} for: {}".format(cover_url, url)) - + logger.warning("bad cover: {} for: {}".format(cover_url, url)) + def get_isbns(book): isbns = [] edition = None #'ISBN 1' is OBP, others are UMICH - for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']: - if book.get(code, '') not in ('','N/A'): + for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN', + 'ISBN 1', 'ISBN 4', 'ISBN 5' + ]: + if book.get(code, '') not in ('', 'N/A'): values = book[code].split(',') for value in values: isbn = ISBN(value).to_string() if isbn: isbns.append(isbn) - for isbn in isbns : + for isbn in isbns: if not edition: edition = Edition.get_by_isbn(isbn) - return (isbns, edition ) + return (isbns, edition) def get_pubdate(book): - value = book.get('CopyrightYear','') #UMICH + value = book.get('CopyrightYear', '') #UMICH if value: return value - value = book.get('publication year','') #OBP - sub = book.get('publication month','') - sub2 = book.get('publication day','') + value = book.get('publication year', '') #OBP + sub = book.get('publication month', '') + sub2 = book.get('publication day', '') if sub2: - return u'{}-{}-{}'.format(value,sub,sub2) + return u'{}-{}-{}'.format(value, sub, sub2) elif sub: - return u'{}-{}'.format(value,sub,sub2) - else: - return value - + return u'{}-{}'.format(value, sub, sub2) + return value + def get_publisher(book): - value = book.get('Publisher','') + value = book.get('Publisher', '') if value: return value - if book.get('DOI prefix','')=='10.11647': + if book.get('DOI prefix', '') == '10.11647': return "Open Book Publishers" - + def get_url(book): - url = book.get('URL','') - url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix','')) + url = book.get('URL', '') + url = url if url else u'https://doi.org/{}/{}'.format( + book.get('DOI prefix', ''), + book.get('DOI suffix', '') + ) return url def get_description(book): - value = book.get('DescriptionBrief','') - value = value if value else book.get('Plain Text Blurb','') + value = book.get('DescriptionBrief', '') + value = value if value else book.get('Plain Text Blurb', '') return value def get_language(book): - value = book.get('ISO Language Code','') + value = book.get('ISO Language Code', '') return value - + def load_from_books(books): ''' books is an iterator of book dicts. each book must have attributes (umich dialect) - eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList, - Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last, - Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, - DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate, - eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, + eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList, + Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last, + Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, + DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate, + eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, SubjectListMARC, , Book-level DOI, URL, License - + ''' # Goal: get or create an Edition and Work for each given book @@ -209,21 +211,21 @@ def load_from_books(books): # try first to get an Edition already in DB with by one of the ISBNs in book (isbns, edition) = get_isbns(book) - if len(isbns)==0: + if not isbns: continue - title=get_title(book) + title = get_title(book) authors = get_authors(book) - # if matching by ISBN doesn't work, then create a Work and Edition + # if matching by ISBN doesn't work, then create a Work and Edition # with a title and the first ISBN if not edition: work = Work(title=title) work.save() - edition= Edition(title=title, work=work) + edition = Edition(title=title, work=work) edition.save() Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work) - work=edition.work + work = edition.work # at this point, work and edition exist url = get_url(book) @@ -237,7 +239,7 @@ def load_from_books(books): if edition and edition.work != work: work = merge_works(work, edition.work) if not edition: - edition= Edition(title=title, work=work) + edition = Edition(title=title, work=work) edition.save() Identifier.set(type='isbn', value=isbn, edition=edition, work=work) @@ -249,18 +251,18 @@ def load_from_books(books): edition.save() edition.set_publisher(get_publisher(book)) - # possibly replace work.description + # possibly replace work.description description = get_description(book) - if len(description)>len (work.description): + if len(description) > len(work.description): work.description = description work.save() - + # set language - lang= get_language(book) + lang = get_language(book) if lang: work.language = lang work.save() - + # add a bisac subject (and ancestors) to work for bisacsh in get_subjects(book): while bisacsh: @@ -273,13 +275,13 @@ def load_from_books(books): results.append((book, work, edition)) try: - logger.info (u"{} {} {}\n".format(i, title, loading_ok)) + logger.info(u"{} {} {}\n".format(i, title, loading_ok)) except Exception as e: - logger.info (u"{} {}\n".format(i, title, str(e) )) + logger.info(u"{} {} {}\n".format(i, title, str(e))) return results - + def loaded_book_ok(book, work, edition): isbns = get_isbns(book)[0] @@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition): try: url_id = Identifier.objects.get(type='http', value=get_url(book)) if url_id is None: - logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book))) + logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book))) return False except Exception as e: - logger.info (str(e)) + logger.info(str(e)) return False # isbns @@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition): try: edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition except Exception as e: - print (e) + logger.info(e) return False # authors # print set([ed.name for ed in edition_for_isbn.authors.all()]) - if (set([utf8_general_ci_norm(author[0]) for author in authors]) != - set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])): - print "problem with authors" + if ( + set([utf8_general_ci_norm(author[0]) for author in authors]) != + set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()]) + ): + logger.info("problem with authors") return False try: @@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition): # work description description = get_description(book) - if not ((work.description == description) or (len(description) = 0: + pass + elif ebook.url.find(u'dropbox.com/s/') >= 0: response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT}) if response.status_code == 200: match_dl = DROPBOX_DL.search(response.content) if match_dl: return make_dl_ebook(match_dl.group(1), ebook) + else: + logger.warning('couldn\'t get {}'.format(ebook.url)) + else: + logger.warning('couldn\'t get dl for {}'.format(ebook.url)) + elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0: doc = get_soup(ebook.url) if doc: @@ -384,7 +392,13 @@ def dl_online(ebook): if obj: dl_url = urlparse.urljoin(ebook.url, obj['href']) return make_dl_ebook(dl_url, ebook) - + else: + logger.warning('couldn\'t get dl_url for {}'.format(ebook.url)) + else: + logger.warning('couldn\'t get soup for {}'.format(ebook.url)) + + return None, False + def make_dl_ebook(url, ebook): if EbookFile.objects.filter(source=ebook.url): return EbookFile.objects.filter(source=ebook.url)[0], False @@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook): new_ebf.ebook = new_ebook new_ebf.save() return new_ebf, True + else: + logger.warning('download format for {} is not ebook'.format(url)) + else: + logger.warning('couldn\'t get {}'.format(url)) + return None, False def type_for_url(url, content_type=None): if not url: return '' if url.find('books.openedition.org') >= 0: - return ('online') + return 'online' if Ebook.objects.filter(url=url): return Ebook.objects.filter(url=url)[0].format ct = content_type if content_type else contenttyper.calc_type(url) @@ -440,7 +459,7 @@ def type_for_url(url, content_type=None): elif re.search("mobi", ct): return "mobi" return "other" - + class ContentTyper(object): """ """ def __init__(self):