From 05fae60ddbf176c63454bc89da72a420a28ca517 Mon Sep 17 00:00:00 2001
From: eric
Date: Fri, 11 May 2018 11:46:04 -0400
Subject: [PATCH] delint

---
 core/loaders/utils.py | 207 +++++++++++++++++++++++-------------
 1 file changed, 113 insertions(+), 94 deletions(-)

diff --git a/core/loaders/utils.py b/core/loaders/utils.py
index 7f54c77c..f559870d 100644
--- a/core/loaders/utils.py
+++ b/core/loaders/utils.py
@@ -1,7 +1,6 @@
 import csv
 import logging
 import re
-import sys
 import time
 import unicodedata
 import urlparse
@@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
 from regluit.core.bookloader import add_by_isbn_from_google, merge_works
 from regluit.core.isbn import ISBN
 from regluit.core.models import (
-    Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
+    Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
 )
 
 logger = logging.getLogger(__name__)
@@ -31,7 +30,7 @@ def utf8_general_ci_norm(s):
     """
     Normalize a la MySQL utf8_general_ci collation
     (As of 2016.05.24, we're using the utf8_general_ci collation for author names)
-    
+
     https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459
 
     * converts to Unicode normalization form D for canonical decomposition
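As a quick illustration of the comparison this helper is meant to support (a sketch only: it assumes, per the docstring above and the linked answer, that the normalization amounts to NFD decomposition, dropping combining marks, and uppercasing; the function body itself is not touched by this patch):

    # illustrative sketch, not part of the patch
    import unicodedata

    def ci_norm(s):
        # NFD-decompose, drop combining marks, uppercase (utf8_general_ci style)
        decomposed = unicodedata.normalize('NFD', s)
        return u''.join(c for c in decomposed if not unicodedata.combining(c)).upper()

    name = u'\xc9milie du Ch\xe2telet'   # u'Émilie du Châtelet'
    assert ci_norm(name) == u'EMILIE DU CHATELET'

loaded_book_ok, further down, leans on this kind of accent- and case-insensitive comparison when it checks the spreadsheet's author names against the names stored on the edition.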
@@ -50,78 +49,77 @@ def get_soup(url):
     return None
 
 def get_authors(book):
-    authors=[]
-    if book.get('AuthorsList',''):
+    authors = []
+    if book.get('AuthorsList', ''):
         #UMich
-        for i in range(1,3):
-            fname=u'Author{}First'.format(i)
-            lname=u'Author{}Last'.format(i)
-            role=u'Author{}Role'.format(i)
-            authname = u'{} {}'.format(book[fname],book[lname])
+        for i in range(1, 3):
+            fname = u'Author{}First'.format(i)
+            lname = u'Author{}Last'.format(i)
+            role = u'Author{}Role'.format(i)
+            authname = u'{} {}'.format(book[fname], book[lname])
             if authname != u' ':
                 role = book[role] if book[role].strip() else 'A01'
-                authors.append((authname,role))
+                authors.append((authname, role))
             else:
                 break
         authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
-        if len(authlist)>3:
+        if len(authlist) > 3:
             for authname in authlist[3:]:
                 authors.append((authname, 'A01'))
     else:
         #OBP
-        for i in range(1,6):
-            fname= book.get(u'Contributor {} first name'.format(i), '')
-            lname= book.get(u'Contributor {} surname'.format(i), '')
-            role= book.get(u'ONIX Role Code (List 17){}'.format(i), '')
-            authname = u'{} {}'.format(fname,lname)
+        for i in range(1, 6):
+            fname = book.get(u'Contributor {} first name'.format(i), '')
+            lname = book.get(u'Contributor {} surname'.format(i), '')
+            role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
+            authname = u'{} {}'.format(fname, lname)
             if authname != u' ':
                 role = role if role.strip() else 'A01'
-                authors.append((authname,role))
+                authors.append((authname, role))
             else:
                 break
     return authors
 
 def get_subjects(book):
-    subjects=[]
-    for i in range(1,5):
+    subjects = []
+    for i in range(1, 5):
         key = u'BISACCode{}'.format(i) #UMich dialect
         key2 = u'BISAC subject code {}'.format(i) #OBP dialect
-        code = book.get(key,'')
-        code = code if code else book.get(key2,'')
+        code = book.get(key, '')
+        code = code if code else book.get(key2, '')
         if code != '':
             try:
-                bisac=BisacHeading.objects.get(notation=code)
+                bisac = BisacHeading.objects.get(notation=code)
                 subjects.append(bisac)
             except BisacHeading.DoesNotExist:
-                logger.warning( "Please add BISAC {}".format(code))
+                logger.warning("Please add BISAC {}".format(code))
     return subjects
 
 def add_subject(subject_name, work, authority=''):
     try:
-        subject= Subject.objects.get(name=subject_name)
+        subject = Subject.objects.get(name=subject_name)
     except Subject.DoesNotExist:
-        subject=Subject.objects.create(name=subject_name, authority=authority)
+        subject = Subject.objects.create(name=subject_name, authority=authority)
     subject.works.add(work)
 
 def get_title(book):
-    title = book.get('FullTitle','') #UMICH
+    title = book.get('FullTitle', '') #UMICH
     if title:
         return title
-    title = book.get('Title','') #OBP
-    sub = book.get('Subtitle','')
+    title = book.get('Title', '') #OBP
+    sub = book.get('Subtitle', '')
     if sub:
-        return u'{}: {}'.format(title,sub)
-    else:
-        return title
-    
+        return u'{}: {}'.format(title, sub)
+    return title
+
 def get_cover(book):
-    cover_url = book.get('Cover URL','') #OBP
+    cover_url = book.get('Cover URL', '') #OBP
     if cover_url:
         return cover_url
     url = book['URL']
     if "10.3998" in url:
         # code for umich books; can generalize, of course!
-        idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
+        idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
         if idmatch:
             book_id = idmatch.group(2)
             if idmatch.group(1) == 'ohp':
@@ -131,74 +129,78 @@ def get_cover(book):
             else:
                 cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
             cover = requests.head(cover_url)
-            if cover.status_code<400:
+            if cover.status_code < 400:
                 return cover_url
             else:
-                logger.warning( "bad cover: {} for: {}".format(cover_url, url))
-    
+                logger.warning("bad cover: {} for: {}".format(cover_url, url))
+
 def get_isbns(book):
     isbns = []
     edition = None
     #'ISBN 1' is OBP, others are UMICH
-    for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
-        if book.get(code, '') not in ('','N/A'):
+    for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
+                 'ISBN 1', 'ISBN 4', 'ISBN 5'
+                ]:
+        if book.get(code, '') not in ('', 'N/A'):
             values = book[code].split(',')
             for value in values:
                 isbn = ISBN(value).to_string()
                 if isbn:
                     isbns.append(isbn)
-    for isbn in isbns :
+    for isbn in isbns:
        if not edition:
            edition = Edition.get_by_isbn(isbn)
-    return (isbns, edition )
+    return (isbns, edition)
 
 def get_pubdate(book):
-    value = book.get('CopyrightYear','') #UMICH
+    value = book.get('CopyrightYear', '') #UMICH
     if value:
         return value
-    value = book.get('publication year','') #OBP
-    sub = book.get('publication month','')
-    sub2 = book.get('publication day','')
+    value = book.get('publication year', '') #OBP
+    sub = book.get('publication month', '')
+    sub2 = book.get('publication day', '')
     if sub2:
-        return u'{}-{}-{}'.format(value,sub,sub2)
+        return u'{}-{}-{}'.format(value, sub, sub2)
     elif sub:
-        return u'{}-{}'.format(value,sub,sub2)
-    else:
-        return value
-    
+        return u'{}-{}'.format(value, sub, sub2)
+    return value
+
 def get_publisher(book):
-    value = book.get('Publisher','')
+    value = book.get('Publisher', '')
     if value:
         return value
-    if book.get('DOI prefix','')=='10.11647':
+    if book.get('DOI prefix', '') == '10.11647':
         return "Open Book Publishers"
-    
+
 def get_url(book):
-    url = book.get('URL','')
-    url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix',''))
+    url = book.get('URL', '')
+    url = url if url else u'https://doi.org/{}/{}'.format(
+        book.get('DOI prefix', ''),
+        book.get('DOI suffix', '')
+    )
     return url
 
 def get_description(book):
-    value = book.get('DescriptionBrief','')
-    value = value if value else book.get('Plain Text Blurb','')
+    value = book.get('DescriptionBrief', '')
+    value = value if value else book.get('Plain Text Blurb', '')
     return value
 
 def get_language(book):
-    value = book.get('ISO Language Code','')
+    value = book.get('ISO Language Code', '')
     return value
-    
+
 def load_from_books(books):
     ''' books is an iterator of book dicts.
         each book must have attributes (umich dialect)
 
-        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList, 
-        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last, 
-        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong, 
-        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate, 
-        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights, 
+        eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
+        Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
+        Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
+        DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
+        eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
         SubjectListMARC, , Book-level DOI, URL, License
-        
+
     '''
 
     # Goal: get or create an Edition and Work for each given book
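To make the umich dialect described in the docstring concrete, a single spreadsheet row reduced to a dict might look like the sketch below. Every value is made up, only a subset of the documented columns is shown, and code outside these hunks may consult further columns, so treat it as an illustration of the column names rather than a guaranteed-minimal record. Rows would normally come from csv.DictReader, and the loader expects to run inside the project (for example a Django manage.py shell), since it touches the database:

    # hypothetical umich-dialect row; values are illustrative only
    book = {
        'eISBN': '9781234567897',
        'PaperISBN': '', 'ClothISBN': '',
        'FullTitle': 'An Example Open Access Monograph',
        'AuthorsList': 'Jane Doe',
        'Author1First': 'Jane', 'Author1Last': 'Doe', 'Author1Role': '',
        'Author2First': '', 'Author2Last': '', 'Author2Role': '',
        'BISACCode1': 'LIT000000',
        'CopyrightYear': '2018',
        'Publisher': 'Example University Press',
        'DescriptionBrief': 'A short blurb.',
        'URL': 'https://doi.org/10.3998/example.1234567.0001.001',
        'License': 'CC BY',
        # ...plus the other columns listed in the docstring, as needed
    }

    from regluit.core.loaders.utils import load_from_books
    results = load_from_books([book])   # list of (book, work, edition) tuples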
@@ -209,21 +211,21 @@ def load_from_books(books):
 
         # try first to get an Edition already in DB with by one of the ISBNs in book
         (isbns, edition) = get_isbns(book)
-        if len(isbns)==0:
+        if not isbns:
             continue
 
-        title=get_title(book)
+        title = get_title(book)
         authors = get_authors(book)
 
-        # if matching by ISBN doesn't work, then create a Work and Edition 
+        # if matching by ISBN doesn't work, then create a Work and Edition
         # with a title and the first ISBN
         if not edition:
             work = Work(title=title)
             work.save()
-            edition= Edition(title=title, work=work)
+            edition = Edition(title=title, work=work)
             edition.save()
             Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
-        work=edition.work
+        work = edition.work
 
         # at this point, work and edition exist
         url = get_url(book)
@@ -237,7 +239,7 @@ def load_from_books(books):
             if edition and edition.work != work:
                 work = merge_works(work, edition.work)
             if not edition:
-                edition= Edition(title=title, work=work)
+                edition = Edition(title=title, work=work)
                 edition.save()
             Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
 
@@ -249,18 +251,18 @@ def load_from_books(books):
         edition.save()
         edition.set_publisher(get_publisher(book))
 
-        # possibly replace work.description 
+        # possibly replace work.description
         description = get_description(book)
-        if len(description)>len (work.description):
+        if len(description) > len(work.description):
             work.description = description
             work.save()
-        
+
         # set language
-        lang= get_language(book)
+        lang = get_language(book)
        if lang:
            work.language = lang
            work.save()
-        
+
         # add a bisac subject (and ancestors) to work
         for bisacsh in get_subjects(book):
             while bisacsh:
@@ -273,13 +275,13 @@ def load_from_books(books):
         results.append((book, work, edition))
 
         try:
-            logger.info (u"{} {} {}\n".format(i, title, loading_ok))
+            logger.info(u"{} {} {}\n".format(i, title, loading_ok))
         except Exception as e:
-            logger.info (u"{} {}\n".format(i, title, str(e) ))
+            logger.info(u"{} {} {}\n".format(i, title, str(e)))
 
     return results
-    
+
 def loaded_book_ok(book, work, edition):
 
     isbns = get_isbns(book)[0]
@@ -292,10 +294,10 @@ def loaded_book_ok(book, work, edition):
     try:
         url_id = Identifier.objects.get(type='http', value=get_url(book))
         if url_id is None:
-            logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
+            logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
             return False
     except Exception as e:
-        logger.info (str(e))
+        logger.info(str(e))
         return False
 
     # isbns
@@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
             try:
                 edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
             except Exception as e:
-                print (e)
+                logger.info(e)
                 return False
 
     # authors
     # print set([ed.name for ed in edition_for_isbn.authors.all()])
 
-    if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
-        set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
-        print "problem with authors"
+    if (
+        set([utf8_general_ci_norm(author[0]) for author in authors]) !=
+        set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
+    ):
+        logger.info("problem with authors")
         return False
 
     try:
@@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition):
 
     # work description
     description = get_description(book)
-    if not ((work.description == description) or (len(description) = 0:
+        pass
+    elif ebook.url.find(u'dropbox.com/s/') >= 0:
         response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
         if response.status_code == 200:
             match_dl = DROPBOX_DL.search(response.content)
             if match_dl:
                 return make_dl_ebook(match_dl.group(1), ebook)
+            else:
+                logger.warning('couldn\'t get {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
+
     elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
         doc = get_soup(ebook.url)
         if doc:
@@ -384,7 +392,13 @@ def dl_online(ebook):
             if obj:
                 dl_url = urlparse.urljoin(ebook.url, obj['href'])
                 return make_dl_ebook(dl_url, ebook)
-    
+            else:
+                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
+
+    return None, False
+
 def make_dl_ebook(url, ebook):
     if EbookFile.objects.filter(source=ebook.url):
         return EbookFile.objects.filter(source=ebook.url)[0], False
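The else branches and fall-through returns added here and in the next hunk make the return shape of dl_online and make_dl_ebook uniform, as far as these hunks show: an already-harvested EbookFile is reused with a False flag, a freshly downloaded file comes back with True, and every failure path logs a warning and ends in (None, False). A calling sketch (the loop and names are assumptions for illustration, not part of the patch):

    # sketch only: harvest files for online ebooks and tally the outcomes
    from regluit.core.loaders.utils import dl_online
    from regluit.core.models import Ebook

    harvested = skipped = 0
    for online_ebook in Ebook.objects.filter(format='online'):
        ebf, created = dl_online(online_ebook)
        if ebf is None:
            skipped += 1     # failure path; a warning has already been logged
        elif created:
            harvested += 1   # new EbookFile downloaded and attached to a new Ebook
        else:
            skipped += 1     # an EbookFile from this source URL already existed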
{}".format(work.id, get_url(book))) + logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book))) return False except Exception as e: - logger.info (str(e)) + logger.info(str(e)) return False # isbns @@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition): try: edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition except Exception as e: - print (e) + logger.info(e) return False # authors # print set([ed.name for ed in edition_for_isbn.authors.all()]) - if (set([utf8_general_ci_norm(author[0]) for author in authors]) != - set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])): - print "problem with authors" + if ( + set([utf8_general_ci_norm(author[0]) for author in authors]) != + set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()]) + ): + logger.info("problem with authors") return False try: @@ -327,7 +331,7 @@ def loaded_book_ok(book, work, edition): # work description description = get_description(book) - if not ((work.description == description) or (len(description) = 0: + pass + elif ebook.url.find(u'dropbox.com/s/') >= 0: response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT}) if response.status_code == 200: match_dl = DROPBOX_DL.search(response.content) if match_dl: return make_dl_ebook(match_dl.group(1), ebook) + else: + logger.warning('couldn\'t get {}'.format(ebook.url)) + else: + logger.warning('couldn\'t get dl for {}'.format(ebook.url)) + elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0: doc = get_soup(ebook.url) if doc: @@ -384,7 +392,13 @@ def dl_online(ebook): if obj: dl_url = urlparse.urljoin(ebook.url, obj['href']) return make_dl_ebook(dl_url, ebook) - + else: + logger.warning('couldn\'t get dl_url for {}'.format(ebook.url)) + else: + logger.warning('couldn\'t get soup for {}'.format(ebook.url)) + + return None, False + def make_dl_ebook(url, ebook): if EbookFile.objects.filter(source=ebook.url): return EbookFile.objects.filter(source=ebook.url)[0], False @@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook): new_ebf.ebook = new_ebook new_ebf.save() return new_ebf, True + else: + logger.warning('download format for {} is not ebook'.format(url)) + else: + logger.warning('couldn\'t get {}'.format(url)) + return None, False def type_for_url(url, content_type=None): if not url: return '' if url.find('books.openedition.org') >= 0: - return ('online') + return 'online' if Ebook.objects.filter(url=url): return Ebook.objects.filter(url=url)[0].format ct = content_type if content_type else contenttyper.calc_type(url) @@ -440,7 +459,7 @@ def type_for_url(url, content_type=None): elif re.search("mobi", ct): return "mobi" return "other" - + class ContentTyper(object): """ """ def __init__(self):