Merge pull request #778 from Gluejar/doab-misc

fix bugs and delint
pull/91/head
eshellman 2018-05-11 11:47:48 -04:00 committed by GitHub
commit 71ba8dc9fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 113 additions and 94 deletions

View File

@ -1,7 +1,6 @@
import csv
import logging
import re
import sys
import time
import unicodedata
import urlparse
@ -17,7 +16,7 @@ from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -111,7 +110,6 @@ def get_title(book):
sub = book.get('Subtitle', '')
if sub:
return u'{}: {}'.format(title, sub)
else:
return title
def get_cover(book):
@ -140,7 +138,9 @@ def get_isbns(book):
isbns = []
edition = None
#'ISBN 1' is OBP, others are UMICH
for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
'ISBN 1', 'ISBN 4', 'ISBN 5'
]:
if book.get(code, '') not in ('', 'N/A'):
values = book[code].split(',')
for value in values:
@ -163,7 +163,6 @@ def get_pubdate(book):
return u'{}-{}-{}'.format(value, sub, sub2)
elif sub:
return u'{}-{}'.format(value, sub, sub2)
else:
return value
def get_publisher(book):
@ -175,7 +174,10 @@ def get_publisher(book):
def get_url(book):
    """Return the URL to use for ``book``.

    ``book`` is read only through ``.get``, so any mapping works.  The
    explicit ``'URL'`` value wins when it is non-empty; otherwise a
    ``https://doi.org/<prefix>/<suffix>`` link is built from the
    ``'DOI prefix'`` and ``'DOI suffix'`` values.

    Returns an empty string when neither a URL nor a complete DOI is
    present, instead of the unusable ``https://doi.org//``.
    """
    url = book.get('URL', '')
    if url:
        return url

    prefix = book.get('DOI prefix', '')
    suffix = book.get('DOI suffix', '')
    # A DOI needs both halves; half a DOI would resolve to nothing useful.
    if not (prefix and suffix):
        return u''
    return u'https://doi.org/{}/{}'.format(prefix, suffix)
def get_description(book):
@ -209,7 +211,7 @@ def load_from_books(books):
# try first to get an Edition already in DB by one of the ISBNs in book
(isbns, edition) = get_isbns(book)
if len(isbns)==0:
if not isbns:
continue
title = get_title(book)
authors = get_authors(book)
@ -275,7 +277,7 @@ def load_from_books(books):
try:
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
except Exception as e:
logger.info (u"{} {}\n".format(i, title, str(e) ))
logger.info(u"{} {} {}\n".format(i, title, str(e)))
return results
@ -307,15 +309,17 @@ def loaded_book_ok(book, work, edition):
try:
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
except Exception as e:
print (e)
logger.info(e)
return False
# authors
# print set([ed.name for ed in edition_for_isbn.authors.all()])
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
print "problem with authors"
if (
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
):
logger.info("problem with authors")
return False
try:
@ -369,14 +373,18 @@ DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+
def dl_online(ebook):
if ebook.format != 'online':
return None, False
if ebook.url.find(u'dropbox.com/s/') >= 0:
pass
elif ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
else:
logger.warning('couldn\'t get {}'.format(ebook.url))
else:
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
@ -384,6 +392,12 @@ def dl_online(ebook):
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
else:
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
return None, False
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
@ -414,12 +428,17 @@ def make_dl_ebook(url, ebook):
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf, True
else:
logger.warning('download format for {} is not ebook'.format(url))
else:
logger.warning('couldn\'t get {}'.format(url))
return None, False
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return ('online')
return 'online'
if Ebook.objects.filter(url=url):
return Ebook.objects.filter(url=url)[0].format
ct = content_type if content_type else contenttyper.calc_type(url)