Merge pull request #778 from Gluejar/doab-misc

fix bugs and delint
2018-05-11 11:47:48 -04:00 · 2018-05-11 11:47:48 -04:00 · 71ba8dc9fa
parent 95b8276829 05fae60ddb
commit 71ba8dc9fa
1 changed files with 113 additions and 94 deletions
--- a/core/loaders/utils.py
+++ b/core/loaders/utils.py
@@ -1,7 +1,6 @@
 import csv
 import logging
 import re
-import sys
 import time
 import unicodedata
 import urlparse
@@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
 from regluit.core.bookloader import add_by_isbn_from_google, merge_works
 from regluit.core.isbn import ISBN
 from regluit.core.models import (
-    Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
+    Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
 )

 logger = logging.getLogger(__name__)
@@ -31,7 +30,7 @@ def utf8_general_ci_norm(s):
    """
    Normalize a la MySQL utf8_general_ci collation
    (As of 2016.05.24, we're using the utf8_general_ci collation for author names)
-    
+
    https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459

    * converts to Unicode normalization form D for canonical decomposition
@@ -50,78 +49,77 @@ def get_soup(url):
    return None

 def get_authors(book):
-    authors=[]
-    if book.get('AuthorsList',''):
+    authors = []
+    if book.get('AuthorsList', ''):
        #UMich
-        for i in range(1,3):
-            fname=u'Author{}First'.format(i)
-            lname=u'Author{}Last'.format(i)
-            role=u'Author{}Role'.format(i)
-            authname = u'{} {}'.format(book[fname],book[lname])
+        for i in range(1, 3):
+            fname = u'Author{}First'.format(i)
+            lname = u'Author{}Last'.format(i)
+            role = u'Author{}Role'.format(i)
+            authname = u'{} {}'.format(book[fname], book[lname])
            if authname != u' ':
                role = book[role] if book[role].strip() else 'A01'
-                authors.append((authname,role))
+                authors.append((authname, role))
            else:
                break
        authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
-        if len(authlist)>3:
+        if len(authlist) > 3:
            for authname in authlist[3:]:
                authors.append((authname, 'A01'))
    else:
        #OBP
-        for i in range(1,6):
-            fname= book.get(u'Contributor {} first name'.format(i), '')
-            lname= book.get(u'Contributor {} surname'.format(i), '')
-            role= book.get(u'ONIX Role Code (List 17){}'.format(i), '')
-            authname = u'{} {}'.format(fname,lname)
+        for i in range(1, 6):
+            fname = book.get(u'Contributor {} first name'.format(i), '')
+            lname = book.get(u'Contributor {} surname'.format(i), '')
+            role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
+            authname = u'{} {}'.format(fname, lname)
            if authname != u' ':
                role = role if role.strip() else 'A01'
-                authors.append((authname,role))
+                authors.append((authname, role))
            else:
                break
    return authors

 def get_subjects(book):
-    subjects=[]
-    for i in range(1,5):
+    subjects = []
+    for i in range(1, 5):
        key = u'BISACCode{}'.format(i)  #UMich dialect
        key2 = u'BISAC subject code {}'.format(i) #OBP dialect
-        code = book.get(key,'')
-        code = code if code else book.get(key2,'')
+        code = book.get(key, '')
+        code = code if code else book.get(key2, '')
        if code != '':
            try:
-                bisac=BisacHeading.objects.get(notation=code)
+                bisac = BisacHeading.objects.get(notation=code)
                subjects.append(bisac)
            except BisacHeading.DoesNotExist:
-                logger.warning( "Please add BISAC {}".format(code))
+                logger.warning("Please add BISAC {}".format(code))
    return subjects

 def add_subject(subject_name, work, authority=''):
    try:
-        subject= Subject.objects.get(name=subject_name)
+        subject = Subject.objects.get(name=subject_name)
    except Subject.DoesNotExist:
-        subject=Subject.objects.create(name=subject_name, authority=authority)
+        subject = Subject.objects.create(name=subject_name, authority=authority)
    subject.works.add(work)

 def get_title(book):
-    title = book.get('FullTitle','') #UMICH
+    title = book.get('FullTitle', '') #UMICH
    if title:
        return title
-    title = book.get('Title','') #OBP
-    sub = book.get('Subtitle','')
+    title = book.get('Title', '') #OBP
+    sub = book.get('Subtitle', '')
    if sub:
-        return u'{}: {}'.format(title,sub)
-    else:
-        return title
-        
+        return u'{}: {}'.format(title, sub)
+    return title
+
 def get_cover(book):
-    cover_url =  book.get('Cover URL','') #OBP
+    cover_url = book.get('Cover URL', '') #OBP
    if cover_url:
        return cover_url
    url = book['URL']
    if "10.3998" in url:
        # code for umich books; can generalize, of course!
-        idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
+        idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
        if idmatch:
            book_id = idmatch.group(2)
            if idmatch.group(1) == 'ohp':
@@ -131,74 +129,78 @@ def get_cover(book):
            else:
                cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
            cover = requests.head(cover_url)
-            if cover.status_code<400:
+            if cover.status_code < 400:
                return cover_url
            else:
-                logger.warning( "bad cover: {} for: {}".format(cover_url, url))
-            
+                logger.warning("bad cover: {} for: {}".format(cover_url, url))
+
 def get_isbns(book):
    isbns = []
    edition = None
    #'ISBN 1' is OBP, others are UMICH
-    for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
-        if book.get(code, '') not in ('','N/A'):
+    for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
+                 'ISBN 1', 'ISBN 4', 'ISBN 5'
+                ]:
+        if book.get(code, '') not in ('', 'N/A'):
            values = book[code].split(',')
            for value in values:
                isbn = ISBN(value).to_string()
                if isbn:
                    isbns.append(isbn)
-    for isbn in isbns :
+    for isbn in isbns:
        if not edition:
            edition = Edition.get_by_isbn(isbn)
-    return (isbns, edition )
+    return (isbns, edition)

 def get_pubdate(book):
-    value = book.get('CopyrightYear','') #UMICH
+    value = book.get('CopyrightYear', '') #UMICH
    if value:
        return value
-    value = book.get('publication year','') #OBP
-    sub = book.get('publication month','')
-    sub2 = book.get('publication day','')
+    value = book.get('publication year', '') #OBP
+    sub = book.get('publication month', '')
+    sub2 = book.get('publication day', '')
    if sub2:
-        return u'{}-{}-{}'.format(value,sub,sub2)
+        return u'{}-{}-{}'.format(value, sub, sub2)
    elif sub:
-        return u'{}-{}'.format(value,sub,sub2)
-    else:
-        return value
-        
+        return u'{}-{}'.format(value, sub, sub2)
+    return value
+
 def get_publisher(book):
-    value = book.get('Publisher','')
+    value = book.get('Publisher', '')
    if value:
        return value
-    if book.get('DOI prefix','')=='10.11647':
+    if book.get('DOI prefix', '') == '10.11647':
        return "Open Book Publishers"
-        
+
 def get_url(book):
-    url = book.get('URL','')
-    url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix',''))
+    url = book.get('URL', '')
+    url = url if url else u'https://doi.org/{}/{}'.format(
+        book.get('DOI prefix', ''),
+        book.get('DOI suffix', '')
+    )
    return url

 def get_description(book):
-    value = book.get('DescriptionBrief','')
-    value = value if value else book.get('Plain Text Blurb','')
+    value = book.get('DescriptionBrief', '')
+    value = value if value else book.get('Plain Text Blurb', '')
    return value

 def get_language(book):
-    value = book.get('ISO Language Code','')
+    value = book.get('ISO Language Code', '')
    return value

-        
+
 def load_from_books(books):
    ''' books is an iterator of book dicts.
        each book must have attributes
        (umich dialect)
-        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList, 
-        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last, 
-        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, 
-        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate, 
-        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, 
+        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
+        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
+        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
+        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
+        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
        SubjectListMARC, , Book-level DOI, URL,	License
-        
+
        '''

    # Goal: get or create an Edition and Work for each given book
@@ -209,21 +211,21 @@ def load_from_books(books):

        # try first to get an Edition already in DB with by one of the ISBNs in book
        (isbns, edition) = get_isbns(book)
-        if len(isbns)==0:
+        if not isbns:
            continue
-        title=get_title(book)
+        title = get_title(book)
        authors = get_authors(book)

-        # if matching by ISBN doesn't work, then create a Work and Edition 
+        # if matching by ISBN doesn't work, then create a Work and Edition
        # with a title and the first ISBN
        if not edition:
            work = Work(title=title)
            work.save()
-            edition= Edition(title=title, work=work) 
+            edition = Edition(title=title, work=work)
            edition.save()
            Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)

-        work=edition.work
+        work = edition.work

        # at this point, work and edition exist
        url = get_url(book)
@@ -237,7 +239,7 @@ def load_from_books(books):
            if edition and edition.work != work:
                work = merge_works(work, edition.work)
            if not edition:
-                edition= Edition(title=title, work=work)
+                edition = Edition(title=title, work=work)
                edition.save()
                Identifier.set(type='isbn', value=isbn, edition=edition, work=work)

@@ -249,18 +251,18 @@ def load_from_books(books):
            edition.save()
            edition.set_publisher(get_publisher(book))

-        # possibly replace work.description 
+        # possibly replace work.description
        description = get_description(book)
-        if len(description)>len (work.description):
+        if len(description) > len(work.description):
            work.description = description
            work.save()
-        
+
        # set language
-        lang= get_language(book)
+        lang = get_language(book)
        if lang:
            work.language = lang
            work.save()
-        
+
        # add a bisac subject (and ancestors) to work
        for bisacsh in get_subjects(book):
            while bisacsh:
@@ -273,13 +275,13 @@ def load_from_books(books):
        results.append((book, work, edition))

        try:
-            logger.info (u"{} {} {}\n".format(i, title, loading_ok))
+            logger.info(u"{} {} {}\n".format(i, title, loading_ok))
        except Exception as e:
-            logger.info (u"{} {}\n".format(i, title, str(e) ))
+            logger.info(u"{} {} {}\n".format(i, title, str(e)))

    return results

-    
+
 def loaded_book_ok(book, work, edition):

    isbns = get_isbns(book)[0]
@@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition):
    try:
        url_id = Identifier.objects.get(type='http', value=get_url(book))
        if url_id is None:
-            logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
+            logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
            return False
    except Exception as e:
-        logger.info (str(e))
+        logger.info(str(e))
        return False

    # isbns
@@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
            try:
                edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
            except Exception as e:
-                print (e)
+                logger.info(e)
                return False

            # authors
            # print set([ed.name for ed in edition_for_isbn.authors.all()])

-            if (set([utf8_general_ci_norm(author[0]) for author in authors]) != 
-                   set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
-                print "problem with authors"
+            if (
+                    set([utf8_general_ci_norm(author[0]) for author in authors]) !=
+                    set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
+            ):
+                logger.info("problem with authors")
                return False

            try:
@@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition):

    # work description
    description = get_description(book)
-    if not ((work.description == description) or (len(description) <len (work.description))):
+    if not ((work.description == description) or (len(description) < len(work.description))):
        return False

    # bisac
@@ -364,19 +368,23 @@ def ids_from_urls(url):
        if id_match:
            ids[ident] = id_match.group('id')
    return ids
-        
+
 DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')

 def dl_online(ebook):
    if ebook.format != 'online':
-        return None, False
-        
-    if ebook.url.find(u'dropbox.com/s/') >= 0:
+        pass
+    elif ebook.url.find(u'dropbox.com/s/') >= 0:
        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
        if response.status_code == 200:
            match_dl = DROPBOX_DL.search(response.content)
            if match_dl:
                return make_dl_ebook(match_dl.group(1), ebook)
+            else:
+                logger.warning('couldn\'t get {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
+
    elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
        doc = get_soup(ebook.url)
        if doc:
@@ -384,7 +392,13 @@ def dl_online(ebook):
            if obj:
                dl_url = urlparse.urljoin(ebook.url, obj['href'])
                return make_dl_ebook(dl_url, ebook)
-                
+            else:
+                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
+
+    return None, False
+
 def make_dl_ebook(url, ebook):
    if EbookFile.objects.filter(source=ebook.url):
        return EbookFile.objects.filter(source=ebook.url)[0], False
@@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
            new_ebf.ebook = new_ebook
            new_ebf.save()
            return new_ebf, True
+        else:
+            logger.warning('download format for {} is not ebook'.format(url))
+    else:
+        logger.warning('couldn\'t get {}'.format(url))
+    return None, False

 def type_for_url(url, content_type=None):
    if not url:
        return ''
    if url.find('books.openedition.org') >= 0:
-        return ('online')
+        return 'online'
    if Ebook.objects.filter(url=url):
        return Ebook.objects.filter(url=url)[0].format
    ct = content_type if content_type else contenttyper.calc_type(url)
@@ -440,7 +459,7 @@ def type_for_url(url, content_type=None):
    elif re.search("mobi", ct):
        return "mobi"
    return "other"
-   
+
 class ContentTyper(object):
    """ """
    def __init__(self):