Merge pull request #778 from Gluejar/doab-misc

fix bugs and delint
2018-05-11 11:47:48 -04:00 · 2018-05-11 11:47:48 -04:00 · 71ba8dc9fa
parent 95b8276829 05fae60ddb
commit 71ba8dc9fa
1 changed files with 113 additions and 94 deletions
--- a/core/loaders/utils.py
+++ b/core/loaders/utils.py
@ -1,7 +1,6 @@
 import csv
 import logging
 import re
-import sys
 import time
 import unicodedata
 import urlparse
@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
 from regluit.core.bookloader import add_by_isbn_from_google, merge_works
 from regluit.core.isbn import ISBN
 from regluit.core.models import (
-    Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
+    Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
 )

 logger = logging.getLogger(__name__)
@ -111,7 +110,6 @@ def get_title(book):
    sub = book.get('Subtitle', '')
    if sub:
        return u'{}: {}'.format(title, sub)
-    else:
    return title

 def get_cover(book):
@ -140,7 +138,9 @@ def get_isbns(book):
    isbns = []
    edition = None
    #'ISBN 1' is OBP, others are UMICH
-    for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
+    for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
+                 'ISBN 1', 'ISBN 4', 'ISBN 5'
+                ]:
        if book.get(code, '') not in ('', 'N/A'):
            values = book[code].split(',')
            for value in values:
@ -163,7 +163,6 @@ def get_pubdate(book):
        return u'{}-{}-{}'.format(value, sub, sub2)
    elif sub:
        return u'{}-{}'.format(value, sub, sub2)
-    else:
    return value

 def get_publisher(book):
@ -175,7 +174,10 @@ def get_publisher(book):

 def get_url(book):
    url = book.get('URL', '')
-    url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix',''))
+    url = url if url else u'https://doi.org/{}/{}'.format(
+        book.get('DOI prefix', ''),
+        book.get('DOI suffix', '')
+    )
    return url

 def get_description(book):
@ -209,7 +211,7 @@ def load_from_books(books):

        # first try to get an Edition already in the DB by one of the ISBNs in book
        (isbns, edition) = get_isbns(book)
-        if len(isbns)==0:
+        if not isbns:
            continue
        title = get_title(book)
        authors = get_authors(book)
@ -275,7 +277,7 @@ def load_from_books(books):
        try:
            logger.info(u"{} {} {}\n".format(i, title, loading_ok))
        except Exception as e:
-            logger.info (u"{} {}\n".format(i, title, str(e) ))
+            logger.info(u"{} {} {}\n".format(i, title, str(e)))

    return results

@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
            try:
                edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
            except Exception as e:
-                print (e)
+                logger.info(e)
                return False

            # authors
            # print set([ed.name for ed in edition_for_isbn.authors.all()])

-            if (set([utf8_general_ci_norm(author[0]) for author in authors]) != 
-                   set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
-                print "problem with authors"
+            if (
+                    set([utf8_general_ci_norm(author[0]) for author in authors]) !=
+                    set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
+            ):
+                logger.info("problem with authors")
                return False

            try:
@ -369,14 +373,18 @@ DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+

 def dl_online(ebook):
    if ebook.format != 'online':
-        return None, False
-        
-    if ebook.url.find(u'dropbox.com/s/') >= 0:
+        pass
+    elif ebook.url.find(u'dropbox.com/s/') >= 0:
        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
        if response.status_code == 200:
            match_dl = DROPBOX_DL.search(response.content)
            if match_dl:
                return make_dl_ebook(match_dl.group(1), ebook)
+            else:
+                logger.warning('couldn\'t get {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
+
    elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
        doc = get_soup(ebook.url)
        if doc:
@ -384,6 +392,12 @@ def dl_online(ebook):
            if obj:
                dl_url = urlparse.urljoin(ebook.url, obj['href'])
                return make_dl_ebook(dl_url, ebook)
+            else:
+                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
+
+    return None, False

 def make_dl_ebook(url, ebook):
    if EbookFile.objects.filter(source=ebook.url):
@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
            new_ebf.ebook = new_ebook
            new_ebf.save()
            return new_ebf, True
+        else:
+            logger.warning('download format for {} is not ebook'.format(url))
+    else:
+        logger.warning('couldn\'t get {}'.format(url))
+    return None, False

 def type_for_url(url, content_type=None):
    if not url:
        return ''
    if url.find('books.openedition.org') >= 0:
-        return ('online')
+        return 'online'
    if Ebook.objects.filter(url=url):
        return Ebook.objects.filter(url=url)[0].format
    ct = content_type if content_type else contenttyper.calc_type(url)