commit 89b4221dcd
Merge remote-tracking branch 'Gluejar/master' into production
@@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
     editions = facet.facet_object.filter_model("Edition",editions).distinct()
     for edition in editions:
         edition_prod = product(edition, facet.facet_object)
-        if edition_prod:
+        if edition_prod is not None:
             feed.append(edition_prod)
     return etree.tostring(feed, pretty_print=True)
 
@@ -34,7 +34,7 @@ def onix_feed_for_work(work):
     feed.append(header(work))
     for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
         edition_prod = product(edition)
-        if edition_prod:
+        if edition_prod is not None:
             feed.append(product(edition))
     return etree.tostring(feed, pretty_print=True)
 
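The `is not None` changes above are not cosmetic: product() returns either None or an lxml element, and lxml elements with no children evaluate as falsy, so a plain truth test could silently drop a valid but empty <Product>. A minimal sketch of that pitfall (assuming lxml, which the pretty_print=True keyword to etree.tostring implies):

    from lxml import etree

    prod = etree.Element('Product')   # a real element with no children yet
    print(prod is not None)           # True: the element exists
    print(bool(prod))                 # False under lxml's len()-based truth test
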
bookdata/doab_auths.json (54230 changes; diff suppressed: file too large and lines too long)
@@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
 
 def add_by_oclc_from_google(oclc):
     if oclc:
-        logger.info("adding book by oclc %s", oclc)
+        logger.info(u"adding book by oclc %s", oclc)
     else:
         return None
     try:
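This hunk starts a long series of u'...' conversions on log templates. Under Python 2 (which the `except LookupFailure, e:` syntax dates this code to), %-formatting that mixes byte and unicode strings forces an implicit ASCII decode, which fails on non-ASCII metadata such as author or title strings. A sketch of the failure mode these conversions guard against:

    # Python 2 semantics
    template = 'lookup failure for caf\xc3\xa9 %s'   # byte string holding UTF-8
    try:
        template % u'0000000000'     # mixed types -> implicit ASCII decode
    except UnicodeDecodeError:
        print('implicit decode failed')
    print(u'lookup failure for caf\xe9 %s' % u'0000000000')  # all-unicode is safe
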
@@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
     try:
         results = _get_json(url, {"q": '"OCLC%s"' % oclc})
     except LookupFailure, e:
-        logger.exception("lookup failure for %s", oclc)
+        logger.exception(u"lookup failure for %s", oclc)
         return None
     if not results.has_key('items') or not results['items']:
-        logger.warn("no google hits for %s", oclc)
+        logger.warn(u"no google hits for %s", oclc)
         return None
 
     try:
@@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
         models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
         return e
     except LookupFailure, e:
-        logger.exception("failed to add edition for %s", oclc)
+        logger.exception(u"failed to add edition for %s", oclc)
     except IntegrityError, e:
-        logger.exception("google books data for %s didn't fit our db", oclc)
+        logger.exception(u"google books data for %s didn't fit our db", oclc)
     return None
 
 def valid_isbn(isbn):
     try:
         return identifier_cleaner('isbn')(isbn)
     except:
-        logger.exception("invalid isbn: %s", isbn)
+        logger.exception(u"invalid isbn: %s", isbn)
     return None
 
 def add_by_isbn(isbn, work=None, language='xx', title=''):
@@ -88,13 +88,13 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
     try:
         e = add_by_isbn_from_google(isbn, work=work)
     except LookupFailure:
-        logger.exception("failed google lookup for %s", isbn)
+        logger.exception(u"failed google lookup for %s", isbn)
         # try again some other time
         return None
     if e:
         return e
 
-    logger.info("null came back from add_by_isbn_from_google: %s", isbn)
+    logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
 
     # if there's a a title, we want to create stub editions and
     # works, even if google doesn't know about it # but if it's not valid,
@@ -129,10 +129,10 @@ def get_google_isbn_results(isbn):
     try:
         results = _get_json(url, {"q": "isbn:%s" % isbn})
     except LookupFailure:
-        logger.exception("lookup failure for %s", isbn)
+        logger.exception(u"lookup failure for %s", isbn)
         return None
     if not results.has_key('items') or not results['items']:
-        logger.warn("no google hits for %s", isbn)
+        logger.warn(u"no google hits for %s", isbn)
         return None
     return results
 
@@ -201,7 +201,7 @@ def update_edition(edition):
     # if the language of the edition no longer matches that of the parent work,
     # attach edition to the
     if edition.work.language != language:
-        logger.info("reconnecting %s since it is %s instead of %s",
+        logger.info(u"reconnecting %s since it is %s instead of %s",
                     googlebooks_id, language, edition.work.language)
         old_work = edition.work
 
@@ -210,7 +210,7 @@ def update_edition(edition):
         edition.work = new_work
         edition.save()
         for identifier in edition.identifiers.all():
-            logger.info("moving identifier %s", identifier.value)
+            logger.info(u"moving identifier %s", identifier.value)
             identifier.work = new_work
             identifier.save()
         if old_work and old_work.editions.count() == 0:
@@ -256,7 +256,7 @@ def add_by_isbn_from_google(isbn, work=None):
             edition.new = False
             return edition
 
-    logger.info("adding new book by isbn %s", isbn)
+    logger.info(u"adding new book by isbn %s", isbn)
     results = get_google_isbn_results(isbn)
     if results:
         try:
@@ -267,9 +267,9 @@ def add_by_isbn_from_google(isbn, work=None):
                 isbn=isbn
             )
         except LookupFailure, e:
-            logger.exception("failed to add edition for %s", isbn)
+            logger.exception(u"failed to add edition for %s", isbn)
         except IntegrityError, e:
-            logger.exception("google books data for %s didn't fit our db", isbn)
+            logger.exception(u"google books data for %s didn't fit our db", isbn)
             return None
     return None
 
@@ -320,7 +320,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
     if results:
         item = results
     else:
-        logger.info("loading metadata from google for %s", googlebooks_id)
+        logger.info(u"loading metadata from google for %s", googlebooks_id)
         url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
         item = _get_json(url)
     d = item['volumeInfo']
@@ -343,7 +343,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
     if len(language) > 5:
         language = language[0:5]
     if work and work.language != language:
-        logger.info("not connecting %s since it is %s instead of %s",
+        logger.info(u"not connecting %s since it is %s instead of %s",
                     googlebooks_id, language, work.language)
         work = None
         # isbn = None
@@ -371,7 +371,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
     try:
         e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
         e.new = False
-        logger.warning(" whoa nellie, somebody else created an edition while we were working.")
+        logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
         if work.new:
             work.delete()
         return e
@@ -404,19 +404,19 @@ def relate_isbn(isbn, cluster_size=1):
     """add a book by isbn and then see if there's an existing work to add it to so as to make a
     cluster bigger than cluster_size.
     """
-    logger.info("finding a related work for %s", isbn)
+    logger.info(u"finding a related work for %s", isbn)
 
     edition = add_by_isbn(isbn)
     if edition is None:
         return None
     if edition.work is None:
-        logger.info("didn't add related to null work")
+        logger.info(u"didn't add related to null work")
         return None
     if edition.work.editions.count() > cluster_size:
         return edition.work
     for other_isbn in thingisbn(isbn):
         # 979's come back as 13
-        logger.debug("other_isbn: %s", other_isbn)
+        logger.debug(u"other_isbn: %s", other_isbn)
         if len(other_isbn) == 10:
             other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
         related_edition = add_by_isbn(other_isbn, work=edition.work)
@@ -427,7 +427,7 @@ def relate_isbn(isbn, cluster_size=1):
             related_edition.work = edition.work
             related_edition.save()
         elif related_edition.work_id != edition.work_id:
-            logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
+            logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
             merge_works(related_edition.work, edition.work)
         if related_edition.work.editions.count() > cluster_size:
             return related_edition.work
@@ -438,7 +438,7 @@ def add_related(isbn):
     The initial seed ISBN will be added if it's not already there.
     """
     # make sure the seed edition is there
-    logger.info("adding related editions for %s", isbn)
+    logger.info(u"adding related editions for %s", isbn)
 
     new_editions = []
 
@@ -446,14 +446,14 @@ def add_related(isbn):
     if edition is None:
         return new_editions
     if edition.work is None:
-        logger.warning("didn't add related to null work")
+        logger.warning(u"didn't add related to null work")
         return new_editions
     # this is the work everything will hang off
     work = edition.work
     other_editions = {}
     for other_isbn in thingisbn(isbn):
         # 979's come back as 13
-        logger.debug("other_isbn: %s", other_isbn)
+        logger.debug(u"other_isbn: %s", other_isbn)
         if len(other_isbn) == 10:
             other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
         related_edition = add_by_isbn(other_isbn, work=work)
@@ -466,7 +466,7 @@ def add_related(isbn):
             related_edition.work = work
             related_edition.save()
         elif related_edition.work_id != work.id:
-            logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id)
+            logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
             work = merge_works(work, related_edition.work)
         else:
             if other_editions.has_key(related_language):
@@ -476,14 +476,14 @@ def add_related(isbn):
 
     # group the other language editions together
     for lang_group in other_editions.itervalues():
-        logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
+        logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
         if len(lang_group) > 1:
             lang_edition = lang_group[0]
-            logger.debug("lang_edition.id: %s", lang_edition.id)
+            logger.debug(u"lang_edition.id: %s", lang_edition.id)
             # compute the distinct set of works to merge into lang_edition.work
             works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
             for w in works_to_merge:
-                logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id)
+                logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
                 merged_work = merge_works(lang_edition.work, w)
         models.WorkRelation.objects.get_or_create(
             to_work=lang_group[0].work,
@@ -498,17 +498,21 @@ def thingisbn(isbn):
     Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
     which come back as isbn_13')
     """
-    logger.info("looking up %s at ThingISBN", isbn)
+    logger.info(u"looking up %s at ThingISBN", isbn)
     url = "https://www.librarything.com/api/thingISBN/%s" % isbn
     xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
-    doc = ElementTree.fromstring(xml)
-    return [e.text for e in doc.findall('isbn')]
+    try:
+        doc = ElementTree.fromstring(xml)
+        return [e.text for e in doc.findall('isbn')]
+    except SyntaxError:
+        # LibraryThing down
+        return []
 
 
 def merge_works(w1, w2, user=None):
     """will merge the second work (w2) into the first (w1)
     """
-    logger.info("merging work %s into %s", w2.id, w1.id)
+    logger.info(u"merging work %s into %s", w2.id, w1.id)
     # don't merge if the works are the same or at least one of the works has no id
     #(for example, when w2 has already been deleted)
     if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
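The new try/except in thingisbn() works because xml.etree.ElementTree raises ParseError, a SyntaxError subclass, on malformed input; an HTML error page served while LibraryThing is down is therefore caught and turned into an empty result instead of a crash. A quick check of that assumption:

    from xml.etree import ElementTree

    try:
        ElementTree.fromstring('<html>503 Service Unavailable')  # not well-formed XML
    except SyntaxError as e:   # catches ElementTree.ParseError
        print('caught: %r' % e)
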
@@ -583,7 +587,7 @@ def detach_edition(e):
     will detach edition from its work, creating a new stub work. if remerge=true, will see if
     there's another work to attach to
     """
-    logger.info("splitting edition %s from %s", e, e.work)
+    logger.info(u"splitting edition %s from %s", e, e.work)
     w = models.Work(title=e.title, language=e.work.language)
     w.save()
 
@@ -618,7 +622,7 @@ def add_openlibrary(work, hard_refresh=False):
         work.save()
 
     # find the first ISBN match in OpenLibrary
-    logger.info("looking up openlibrary data for work %s", work.id)
+    logger.info(u"looking up openlibrary data for work %s", work.id)
 
     e = None # openlibrary edition json
     w = None # openlibrary work json
@@ -633,7 +637,7 @@ def add_openlibrary(work, hard_refresh=False):
         try:
             e = _get_json(url, params, type='ol')
         except LookupFailure:
-            logger.exception("OL lookup failed for %s", isbn_key)
+            logger.exception(u"OL lookup failed for %s", isbn_key)
             e = {}
         if e.has_key(isbn_key):
             if e[isbn_key].has_key('details'):
@@ -673,7 +677,7 @@ def add_openlibrary(work, hard_refresh=False):
                 )
             if e[isbn_key]['details'].has_key('works'):
                 work_key = e[isbn_key]['details']['works'].pop(0)['key']
-                logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
+                logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
                 models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
                 try:
                     w = _get_json("https://openlibrary.org" + work_key, type='ol')
@@ -691,14 +695,14 @@ def add_openlibrary(work, hard_refresh=False):
                     if w.has_key('subjects') and len(w['subjects']) > len(subjects):
                         subjects = w['subjects']
                 except LookupFailure:
-                    logger.exception("OL lookup failed for %s", work_key)
+                    logger.exception(u"OL lookup failed for %s", work_key)
     if not subjects:
-        logger.warn("unable to find work %s at openlibrary", work.id)
+        logger.warn(u"unable to find work %s at openlibrary", work.id)
         return
 
     # add the subjects to the Work
     for s in subjects:
-        logger.info("adding subject %s to work %s", s, work.id)
+        logger.info(u"adding subject %s to work %s", s, work.id)
         subject = models.Subject.set_by_name(s, work=work)
 
     work.save()
@@ -716,9 +720,9 @@ def _get_json(url, params={}, type='gb'):
     if response.status_code == 200:
         return json.loads(response.content)
     else:
-        logger.error("unexpected HTTP response: %s", response)
+        logger.error(u"unexpected HTTP response: %s", response)
         if response.content:
-            logger.error("response content: %s", response.content)
+            logger.error(u"response content: %s", response.content)
         raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
 
 
@@ -766,7 +770,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
         ebook = models.Ebook()
 
     if len(ebooks) > 1:
-        logger.warning("There is more than one Ebook matching url {0}".format(url))
+        logger.warning(u"There is more than one Ebook matching url {0}".format(url))
 
 
     ebook.format = format
@@ -826,8 +830,6 @@ def edition_for_etype(etype, metadata, default=None):
         for key in metadata.edition_identifiers.keys():
             return edition_for_ident(key, metadata.identifiers[key])
 
-MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
-
 def load_ebookfile(url, etype):
     '''
     return a ContentFile if a new ebook has been loaded
@@ -960,8 +962,7 @@ class BasePandataLoader(object):
                 if contentfile:
                     contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
                     path = default_storage.save(contentfile_name, contentfile)
-                    lic = MATCH_LICENSE.search(metadata.rights_url)
-                    license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
+                    license = cc.license_from_cc_url(metadata.rights_url)
                     ebf = models.EbookFile.objects.create(
                         format=key,
                         edition=edition,
core/cc.py (17 changes)
@@ -1,8 +1,11 @@
 # coding=utf-8
-# mostly constants related to Creative Commons
+''' mostly constants related to Creative Commons
 # let's be DRY with these parameters
 
 ## need to add versioned CC entries
+'''
 
+import re
+
 INFO_CC = (
     ('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
@@ -162,3 +165,15 @@ def match_license(license_string):
     except ValueError:
         pass
     return RIGHTS_ALIAS.get(license_string, None)
+
+MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
+def license_from_cc_url(rights_url):
+    if not rights_url:
+        return None
+    lic = MATCH_LICENSE.search(rights_url)
+    if lic:
+        return 'CC {}'.format(lic.group(1).upper())
+    if rights_url.find('openedition.org') >= 0:
+        return 'OPENEDITION'
+    return ''
+
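license_from_cc_url centralizes the regex match that BasePandataLoader previously inlined (see the -960,8 hunk above) and adds an OpenEdition special case. Its expected behavior follows directly from the branches above:

    print(license_from_cc_url('https://creativecommons.org/licenses/by-nc-nd/3.0/'))
    # 'CC BY-NC-ND'
    print(license_from_cc_url('http://books.openedition.org/some/title'))
    # 'OPENEDITION'
    print(license_from_cc_url(None))                    # None -- missing rights URL
    print(license_from_cc_url('http://example.com/t'))  # ''   -- no match
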
@@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
 
 def add_by_sitemap(url, maxnum=None):
     return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
+
+def scrape_language(url):
+    scraper = get_scraper(url)
+    return scraper.metadata.get('language')
+
+
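scrape_language simply asks whichever scraper get_scraper() selects for the page's declared language; the doab.py changes below use it to repair records whose DOAB language is the 'xx' placeholder. A sketch of the intended call pattern, mirroring the hunk added further down:

    # inside a loader, after DOAB metadata yields language == 'xx'
    if language == 'xx' and format == 'online':
        language = scrape_language(url)   # may be None if the page omits it
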
@@ -1,42 +1,54 @@
 #!/usr/bin/env python
 # encoding: utf-8
-import logging
+import datetime
 import json
+import logging
 import re
 
-from itertools import islice
 
 import requests
 
-from django.db.models import (Q, F)
+from django.db.models import Q
 
-from django.core.files.storage import default_storage
 from django.core.files.base import ContentFile
+from django.core.files.storage import default_storage
 
-import regluit
+from oaipmh.client import Client
+from oaipmh.error import IdDoesNotExistError
+from oaipmh.metadata import MetadataRegistry, oai_dc_reader
+
+from regluit.core import bookloader, cc
 from regluit.core import models, tasks
-from regluit.core import bookloader
-from regluit.core.bookloader import add_by_isbn, merge_works
+from regluit.core.bookloader import merge_works
 from regluit.core.isbn import ISBN
+from regluit.core.loaders.utils import type_for_url
 from regluit.core.validation import valid_subject
 
+from . import scrape_language
+from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
+
 logger = logging.getLogger(__name__)
 
-springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
+def unlist(alist):
+    if not alist:
+        return None
+    return alist[0]
+
+
+SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
+SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
 def store_doab_cover(doab_id, redo=False):
 
     """
     returns tuple: 1) cover URL, 2) whether newly created (boolean)
     """
 
-    cover_file_name= '/doab/%s/cover' % (doab_id)
+    cover_file_name = '/doab/%s/cover' % (doab_id)
 
     # if we don't want to redo and the cover exists, return the URL of the cover
 
     if not redo and default_storage.exists(cover_file_name):
         return (default_storage.url(cover_file_name), False)
 
     # download cover image to cover_file
     url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
     try:
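unlist exists because OAI Dublin Core fields arrive as lists even when a single value is expected; the load_doab_edition changes below pipe metadata.pop(...) results through it. Behavior follows directly from its three lines:

    print(unlist([u'Athabasca University Press']))   # u'Athabasca University Press'
    print(unlist([]))     # None -- empty list is falsy
    print(unlist(None))   # None
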
@@ -44,16 +56,16 @@ def store_doab_cover(doab_id, redo=False):
         if r.status_code == 302:
             redirurl = r.headers['Location']
             if redirurl.startswith(u'ftp'):
-                springerftp = springercover.match(redirurl)
+                springerftp = SPRINGER_COVER.match(redirurl)
                 if springerftp:
-                    redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1))
+                    redirurl = SPRINGER_IMAGE.format(springerftp.groups(1))
                     r = requests.get(redirurl)
         else:
             r = requests.get(url)
         cover_file = ContentFile(r.content)
         cover_file.content_type = r.headers.get('content-type', '')
 
-        path = default_storage.save(cover_file_name, cover_file)
+        default_storage.save(cover_file_name, cover_file)
         return (default_storage.url(cover_file_name), True)
     except Exception, e:
         # if there is a problem, return None for cover URL
@@ -74,52 +86,51 @@ def update_cover_doab(doab_id, edition, store_cover=True):
         edition.cover_image = cover_url
         edition.save()
         return cover_url
-    else:
-        return None
+    return None
 
 def attach_more_doab_metadata(edition, description, subjects,
                               publication_date, publisher_name=None, language=None, authors=u''):
 
     """
     for given edition, attach description, subjects, publication date to
     corresponding Edition and Work
     """
     # if edition doesn't have a publication date, update it
     if not edition.publication_date:
         edition.publication_date = publication_date
 
     # if edition.publisher_name is empty, set it
     if not edition.publisher_name:
         edition.set_publisher(publisher_name)
 
     edition.save()
 
     # attach description to work if it's not empty
     work = edition.work
     if not work.description:
         work.description = description
 
     # update subjects
     for s in subjects:
         if valid_subject(s):
             models.Subject.set_by_name(s, work=work)
 
     # set reading level of work if it's empty; doab is for adults.
     if not work.age_level:
         work.age_level = '18-'
 
-    if language:
+    if language and language != 'xx':
         work.language = language
     work.save()
 
     if authors and authors == authors: # test for authors != NaN
         authlist = creator_list(authors)
         if edition.authors.all().count() < len(authlist):
             edition.authors.clear()
             if authlist is not None:
-                for [rel,auth] in authlist:
+                for [rel, auth] in authlist:
                     edition.add_author(auth, rel)
 
     return edition
 
 def add_all_isbns(isbns, work, language=None, title=None):
@@ -128,69 +139,73 @@ def add_all_isbns(isbns, work, language=None, title=None):
     first_edition = None
         edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
         if edition:
            first_edition = first_edition if first_edition else edition
            if work and (edition.work_id != work.id):
                if work.created < edition.work.created:
                    work = merge_works(work, edition.work)
                else:
                    work = merge_works(edition.work, work)
            else:
                work = edition.work
     return first_edition
 
 def load_doab_edition(title, doab_id, url, format, rights,
                       language, isbns,
                       provider, **kwargs):
 
     """
     load a record from doabooks.org represented by input parameters and return an ebook
     """
+    logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
     if language and isinstance(language, list):
         language = language[0]
+    if language == 'xx' and format == 'online':
+        language = scrape_language(url)
     # check to see whether the Edition hasn't already been loaded first
     # search by url
     ebooks = models.Ebook.objects.filter(url=url)
 
     # 1 match
     # > 1 matches
     # 0 match
 
     # simplest case -- if match (1 or more), we could check whether any
     # ebook.edition.work has a doab id matching given doab_id
 
     # put a migration to force Ebook.url to be unique id
 
     # if yes, then return one of the Edition(s) whose work is doab_id
     # if no, then
     ebook = None
     if len(ebooks) > 1:
         raise Exception("There is more than one Ebook matching url {0}".format(url))
     elif len(ebooks) == 1:
         ebook = ebooks[0]
-        doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
+        doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
                                                       work=ebook.edition.work)
         # update the cover id
         cover_url = update_cover_doab(doab_id, ebook.edition)
 
         # attach more metadata
-        attach_more_doab_metadata(ebook.edition,
-                                  description=kwargs.get('description'),
-                                  subjects=kwargs.get('subject'),
-                                  publication_date=kwargs.get('date'),
-                                  publisher_name=kwargs.get('publisher'),
-                                  language=language,
-                                  authors=kwargs.get('authors'),)
+        attach_more_doab_metadata(
+            ebook.edition,
+            description=unlist(kwargs.get('description')),
+            subjects=kwargs.get('subject'),
+            publication_date=unlist(kwargs.get('date')),
+            publisher_name=unlist(kwargs.get('publisher')),
+            language=language,
+            authors=kwargs.get('creator'),
+        )
         # make sure all isbns are added
-        add_all_isbns(isbns, None, language=language, title=title)
-        return ebook
+        add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
+        return ebook.edition
 
     # remaining case --> no ebook, load record, create ebook if there is one.
-    assert len(ebooks) == 0
+    assert not ebooks
 
 
     # we need to find the right Edition/Work to tie Ebook to...
 
     # look for the Edition with which to associate ebook.
     # loop through the isbns to see whether we get one that is not None
     work = None
@@ -206,16 +221,16 @@ def load_doab_edition(title, doab_id, url, format, rights,
             edition = ident.work.preferred_edition
             work = edition.work
             break
 
     if edition is not None:
         # if this is a new edition, then add related editions asynchronously
-        if getattr(edition,'new', False):
+        if getattr(edition, 'new', False):
             tasks.populate_edition.delay(edition.isbn_13)
         doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
                                                       work=edition.work)
 
     # we need to create Edition(s) de novo
     else:
         # if there is a Work with doab_id already, attach any new Edition(s)
         try:
             work = models.Identifier.objects.get(type='doab', value=doab_id).work
@@ -226,11 +241,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
             work = models.Work(language='xx', title=title, age_level='18-')
             work.save()
         doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
                                                       work=work)
 
     # if work has any ebooks already, attach the ebook to the corresponding edition
     # otherwise pick the first one
     # pick the first edition as the one to tie ebook to
     editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
                                                          Q(ebooks__isnull=False)).distinct()
     if editions_with_ebooks:
@@ -240,73 +255,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
     else:
         edition = models.Edition(work=work, title=title)
         edition.save()
 
         # make the edition the selected_edition of the work
         work.selected_edition = edition
         work.save()
 
-    if format in ('pdf', 'epub', 'mobi'):
+    if format in ('pdf', 'epub', 'mobi', 'html', 'online'):
         ebook = models.Ebook()
         ebook.format = format
         ebook.provider = provider
         ebook.url = url
         ebook.rights = rights
         # tie the edition to ebook
         ebook.edition = edition
+        if format == "online":
+            ebook.active = False
         ebook.save()
 
     # update the cover id (could be done separately)
     cover_url = update_cover_doab(doab_id, edition)
 
     # attach more metadata
-    attach_more_doab_metadata(edition,
-                              description=kwargs.get('description'),
-                              subjects=kwargs.get('subject'),
-                              publication_date=kwargs.get('date'),
-                              publisher_name=kwargs.get('publisher'),
-                              authors=kwargs.get('authors'),)
-    return ebook
+    attach_more_doab_metadata(
+        edition,
+        description=unlist(kwargs.get('description')),
+        subjects=kwargs.get('subject'),
+        publication_date=unlist(kwargs.get('date')),
+        publisher_name=unlist(kwargs.get('publisher')),
+        authors=kwargs.get('creator'),
+    )
+    return edition
 
-def load_doab_records(fname, limit=None):
-
-    success_count = 0
-    ebook_count = 0
-
-    records = json.load(open(fname))
-
-    for (i, book) in enumerate(islice(records,limit)):
-        d = dict(book)
-        d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
-        try:
-            ebook = load_doab_edition(**d)
-            success_count += 1
-            if ebook:
-                ebook_count +=1
-        except Exception, e:
-            logger.error(e)
-            logger.error(book)
-
-    logger.info("Number of records processed: " + str(success_count))
-    logger.info("Number of ebooks processed: " + str(ebook_count))
-
-"""
+#
 #tools to parse the author lists in doab.csv
-from pandas import DataFrame
-url = "http://www.doabooks.org/doab?func=csv"
-df_csv = DataFrame.from_csv(url)
-
-out=[]
-for val in df_csv.values:
-    isbn = split_isbns(val[0])
-    if isbn:
-        auths = []
-        if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
-            auths = creator_list(val[2])
-        out.append(( isbn[0], auths))
-open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
-"""
+#
 
 au = re.compile(r'\(Authors?\)', flags=re.U)
 ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
 tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
@@ -326,14 +309,14 @@ def fnf(auth):
     if len(parts) == 1:
         return parts[0].strip()
     elif len(parts) == 2:
-        return u'{} {}'.format(parts[1].strip(),parts[0].strip())
+        return u'{} {}'.format(parts[1].strip(), parts[0].strip())
     else:
-        if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
-            return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
+        if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
+            return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
         #print auth
         #print re.search(namelist,auth).group(0)
-        return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
+        return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
 
 
 def creator(auth, editor=False):
     auth = auth.strip()
@@ -349,68 +332,88 @@ def creator(auth, editor=False):
         return [u'dsr', fnf(ds.sub(u'', auth))]
     if re.search(cm, auth):
         return [u'com', fnf(cm.sub(u'', auth))]
 
     auth = au.sub('', auth)
     return ['aut', fnf(auth)]
 
-def split_auths(auths):
-    if ';' in auths or '/' in auths:
-        return namesep2.split(auths)
-    else:
-        nl = namelist.match(auths.strip())
-        if nl:
-            if nl.group(3).endswith(' de') \
-                or ' de ' in nl.group(3) \
-                or nl.group(3).endswith(' da') \
-                or nl.group(1).endswith(' Jr.') \
-                or ' e ' in nl.group(1):
-                return [auths]
-            else:
-                return namesep.split(auths)
-        else :
-            return [auths]
-
-def split_isbns(isbns):
-    result = []
-    for isbn in isbnsep.split(isbns):
-        isbn = ISBN(isbn)
-        if isbn.valid:
-            result.append(isbn.to_string())
-    return result
-
 def creator_list(creators):
     auths = []
-    if re.search(edlist, creators):
-        for auth in split_auths(edlist.sub(u'', creators)):
-            if auth:
-                auths.append(creator(auth, editor=True))
-    else:
-        for auth in split_auths(unicode(creators)):
-            if auth:
-                auths.append(creator(auth))
+    for auth in creators:
+        auths.append(creator(auth))
     return auths
 
-def load_doab_auths(fname, limit=None):
-    doab_auths = json.load(open(fname))
-    recnum = 0
-    failed = 0
-    for [isbnraw, authlist] in doab_auths:
-        isbn = ISBN(isbnraw).to_string()
-        try:
-            work = models.Identifier.objects.get(type='isbn',value=isbn).work
-        except models.Identifier.DoesNotExist:
-            print 'isbn = {} not found'.format(isbnraw)
-            failed += 1
-        if work.preferred_edition.authors.all().count() < len(authlist):
-            work.preferred_edition.authors.clear()
-            if authlist is None:
-                print "null authlist; isbn={}".format(isbn)
-                continue
-            for [rel,auth] in authlist:
-                work.preferred_edition.add_author(auth, rel)
-        recnum +=1
-        if limit and recnum > limit:
-            break
-    logger.info("Number of records processed: " + str(recnum))
-    logger.info("Number of missing isbns: " + str(failed))
+DOAB_OAIURL = 'https://www.doabooks.org/oai'
+DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
+mdregistry = MetadataRegistry()
+mdregistry.registerReader('oai_dc', oai_dc_reader)
+doab_client = Client(DOAB_OAIURL, mdregistry)
+
+def add_by_doab(doab_id, record=None):
+    try:
+        record = record if record else doab_client.getRecord(
+            metadataPrefix='oai_dc',
+            identifier='oai:doab-books:{}'.format(doab_id)
+        )
+        metadata = record[1].getMap()
+        isbns = []
+        url = None
+        for ident in metadata.pop('identifier', []):
+            if ident.startswith('ISBN: '):
+                isbn = ISBN(ident[6:])
+                if isbn.error:
+                    continue
+                isbn.validate()
+                isbns.append(isbn.to_string())
+            elif ident.find('doabooks.org') >= 0:
+                # should already know the doab_id
+                continue
+            else:
+                url = ident
+        language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
+        urls = online_to_download(url)
+        edition = None
+        for dl_url in urls:
+            format = type_for_url(dl_url)
+            if 'format' in metadata:
+                del metadata['format']
+            edition = load_doab_edition(
+                unlist(metadata.pop('title', None)),
+                doab_id,
+                dl_url,
+                format,
+                cc.license_from_cc_url(unlist(metadata.pop('rights', None))),
+                language,
+                isbns,
+                url_to_provider(dl_url) if dl_url else None,
+                **metadata
+            )
+        return edition
+    except IdDoesNotExistError:
+        return None
+
+
+def getdoab(url):
+    id_match = DOAB_PATT.search(url)
+    if id_match:
+        return id_match.group(1)
+    return False
+
+def load_doab_oai(from_year=2000, limit=100000):
+    '''
+    use oai feed to get oai updates
+    '''
+    from_ = datetime.datetime(year=from_year, month=1, day=1)
+    doab_ids = []
+    for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
+        if not record[1]:
+            continue
+        idents = record[1].getMap()['identifier']
+        if idents:
+            for ident in idents:
+                doab = getdoab(ident)
+                if doab:
+                    doab_ids.append(doab)
+                    e = add_by_doab(doab, record=record)
+                    logger.info(u'updated:\t{}\t{}'.format(doab, e.title))
+        if len(doab_ids) > limit:
+            break
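getdoab extracts the numeric record id that DOAB_PATT captures from a doabooks.org URL, and add_by_doab then drives the whole pipeline: OAI record fetch, ISBN harvesting, language and license normalization, and finally load_doab_edition. A sketch with an illustrative (not real) record id:

    print(getdoab('http://www.doabooks.org/doab?func=fulltext&rid:12345'))  # '12345'
    print(getdoab('http://example.org/unrelated'))   # False
    edition = add_by_doab('12345')   # an Edition, or None if the OAI id is unknown
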
@@ -0,0 +1,126 @@
+"""
+doab_utils.py
+
+"""
+
+import re
+import urlparse
+
+import requests
+
+from regluit.utils.lang import get_language_code
+from .utils import get_soup
+
+# utility functions for converting lists of individual items into individual items
+
+# let's do a mapping of the DOAB languages into the language codes used
+# mostly, we just handle mispellings
+# also null -> xx
+
+EXTRA_LANG_MAP = dict([
+    (u'chinese', 'de'),
+    (u'deutsch', 'de'),
+    (u'eng', 'en'),
+    (u'englilsh', 'en'),
+    (u'englilsh', 'en'),
+    (u'englisch', 'en'),
+    (u'espanol', 'es'),
+    (u'ger', 'de'),
+    (u'fra', 'fr'),
+    (u'fre', 'fr'),
+    (u'francese', 'fr'),
+    (u'ita', 'it'),
+    (u'italiano', 'it'),
+    (u'norwegian', 'no'),
+    (u'por', 'pt'),
+    (u'portugese', 'pt'),
+    (u'slovene', 'sl'),
+    (u'spa', 'es'),
+    (u'spagnolo', 'es'),
+])
+
+sep = re.compile(r'[ \-;^,/]+')
+def doab_lang_to_iso_639_1(lang):
+    if lang is None or not lang:
+        return "xx"
+    else:
+        lang = sep.split(lang)[0]
+        code = get_language_code(lang)
+        if code:
+            return code
+        else:
+            return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
+
+
+DOMAIN_TO_PROVIDER = dict([
+    [u'www.doabooks.org', u'Directory of Open Access Books'],
+    [u'www.oapen.org', u'OAPEN Library'],
+    [u'books.openedition.org', u'OpenEdition Books'],
+    [u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'],
+    [u'www.aupress.ca', u'Athabasca University Press'],
+    [u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'],
+    [u'www.degruyter.com', u'De Gruyter Online'],
+    [u'dx.doi.org', u'DOI Resolver'],
+    [u'www.openbookpublishers.com', u'Open Book Publishers'],
+    [u'www.adelaide.edu.au', u'University of Adelaide'],
+    [u'hdl.handle.net', u'Handle Proxy'],
+    [u'link.springer.com', u'Springer'],
+    [u'www.bloomsburyacademic.com', u'Bloomsbury Academic'],
+    [u'www.ledizioni.it', u'Ledizioni'],
+    [u'ccdigitalpress.org', u'Computers and Composition Digital Press'],
+    [u'leo.cilea.it', u'LEO '],
+    [u'www.springerlink.com', u'Springer'],
+    [u'www.palgraveconnect.com', u'Palgrave Connect'],
+    [u'www.ubiquitypress.com', u'Ubiquity Press'],
+    [u'ebooks.iospress.nl', u'IOS Press Ebooks'],
+    [u'antropologie.zcu.cz', u'AntropoWeb'],
+    [u'www.unito.it', u"University of Turin"],
+    [u'leo.cineca.it', u'Letteratura Elettronica Online'],
+    [u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'],
+    [u'www.co-action.net', u'Co-Action Publishing'],
+    [u'www.aliprandi.org', u'Simone Aliprandi'],
+    [u'www.maestrantonella.it', u'maestrantonella.it'],
+    [u'www.antilia.to.it', u'antilia.to.it'],
+    [u'www.scribd.com', u'Scribd'],
+    [u'ledibooks.com', u'LediBooks'],
+    [u'press.openedition.org', u'OpenEdition Press'],
+    [u'oapen.org', u'OAPEN Library'],
+    [u'www.ebooks.iospress.nl', u'IOS Press Ebooks'],
+    [u'windsor.scholarsportal.info', u'Scholars Portal'],
+    [u'www.unimib.it', u'University of Milano-Bicocca'],
+    [u'books.mdpi.com', u'MDPI Books'],
+    [u'www.dropbox.com', u'Dropbox'],
+    [u'dl.dropboxusercontent.com', u'Dropbox'],
+])
+
+def url_to_provider(url):
+    netloc = urlparse.urlparse(url).netloc
+    return DOMAIN_TO_PROVIDER.get(netloc, netloc)
+
+FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')
+
+def online_to_download(url):
+    urls = []
+    if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
+        doc = get_soup(url)
+        if doc:
+            obj = doc.find('object', type='application/pdf')
+            if obj:
+                urls.append(obj['data'].split('#')[0])
+    elif url.find(u'books.scielo.org/') >= 0:
+        doc = get_soup(url)
+        if doc:
+            obj = doc.find('a', class_='pdf_file')
+            if obj:
+                urls.append(urlparse.urljoin(url, obj['href']))
+            obj = doc.find('a', class_='epub_file')
+            if obj:
+                urls.append(urlparse.urljoin(url, obj['href']))
+    elif FRONTIERSIN.search(url):
+        booknum = FRONTIERSIN.search(url).group(1)
+        urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
+        urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
+    else:
+        urls.append(url)
+    return urls
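Quick behavior sketch for the new helpers (the language example assumes get_language_code does not already resolve 'fre'; either path lands on 'fr', via the code table or via EXTRA_LANG_MAP):

    print(doab_lang_to_iso_639_1(None))    # 'xx'
    print(doab_lang_to_iso_639_1(u'fre'))  # 'fr'
    print(url_to_provider('http://www.oapen.org/record/1234'))  # u'OAPEN Library'
    print(url_to_provider('http://unknown.example.org/x'))      # falls back to netloc
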
@@ -0,0 +1,28 @@
+from django.conf import settings
+from django.test import TestCase
+from regluit.core.models import Ebook, Edition, Work
+from .utils import dl_online
+
+class LoaderTests(TestCase):
+    def setUp(self):
+        pass
+
+    def test_downloads(self):
+        if not (settings.TEST_INTEGRATION):
+            return
+
+        work = Work(title="online work")
+        work.save()
+
+        edition = Edition(work=work)
+        edition.save()
+
+        dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0'
+        dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition)
+        dropbox_ebf = dl_online(dropbox_ebook)
+        self.assertTrue(dropbox_ebf.ebook.filesize)
+
+        jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
+        jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
+        jbe_ebf = dl_online(jbe_ebook)
+        self.assertTrue(jbe_ebf.ebook.filesize)

@ -1,15 +1,24 @@
 import csv
-import re
-import requests
 import logging
+import re
 import sys
+import time
 import unicodedata
+import urlparse
 
+from bs4 import BeautifulSoup
+import requests
+
+from django.conf import settings
+from django.core.files.base import ContentFile
 
-from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
-from regluit.core.isbn import ISBN
-from regluit.core.bookloader import add_by_isbn_from_google, merge_works
 from regluit.api.crosswalks import inv_relator_contrib
 from regluit.bisac.models import BisacHeading
+from regluit.core.bookloader import add_by_isbn_from_google, merge_works
+from regluit.core.isbn import ISBN
+from regluit.core.models import (
+    Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
+)
 
 logger = logging.getLogger(__name__)

@ -34,6 +43,12 @@ def utf8_general_ci_norm(s):
     s1 = unicodedata.normalize('NFD', s)
     return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
 
+def get_soup(url):
+    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+    if response.status_code == 200:
+        return BeautifulSoup(response.content, 'lxml')
+    return None
+
 def get_authors(book):
     authors=[]
     if book.get('AuthorsList',''):

@ -331,14 +346,15 @@ def loaded_book_ok(book, work, edition):
     return True
 
 ID_URLPATTERNS = {
-    'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'),
-    'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'),
-    'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'),
-    'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'),
-    'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'),
-    'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'),
-    'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'),
-    'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'),
+    'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
+    'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
+    'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
+    'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
+    'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
+    'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
+    'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
+    'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
+    'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
 }
 
 def ids_from_urls(url):
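
The switch to named groups means every pattern exposes its capture as 'id' regardless of its position, which is what lets the new 'doab' pattern put a second alternation in front of it. A minimal sketch, with an invented DOAB URL (the import path is the one shown later in this diff):

from regluit.core.loaders.utils import ids_from_urls

ids = ids_from_urls('https://www.doabooks.org/doab?func=search&query=rid:12345')
assert ids.get('doab') == '12345'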

@ -346,7 +362,111 @@ def ids_from_urls(url):
     for ident in ID_URLPATTERNS.keys():
         id_match = ID_URLPATTERNS[ident].search(url)
         if id_match:
-            ids[ident] = id_match.group(1)
+            ids[ident] = id_match.group('id')
     return ids
 
+DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
+
+def dl_online(ebook):
+    if ebook.format != 'online':
+        return
+
+    if ebook.url.find(u'dropbox.com/s/') >= 0:
+        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
+        if response.status_code == 200:
+            match_dl = DROPBOX_DL.search(response.content)
+            if match_dl:
+                return make_dl_ebook(match_dl.group(1), ebook)
+    elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
+        doc = get_soup(ebook.url)
+        if doc:
+            obj = doc.select_one('div.fulltexticoncontainer-PDF a')
+            if obj:
+                dl_url = urlparse.urljoin(ebook.url, obj['href'])
+                return make_dl_ebook(dl_url, ebook)
+
+def make_dl_ebook(url, ebook):
+    if EbookFile.objects.filter(source=ebook.url):
+        return EbookFile.objects.filter(source=ebook.url)[0]
+    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+    if response.status_code == 200:
+        filesize = int(response.headers.get("Content-Length", 0))
+        filesize = filesize if filesize else None
+        format = type_for_url(url, content_type=response.headers.get('content-type'))
+        if format != 'online':
+            new_ebf = EbookFile.objects.create(
+                edition=ebook.edition,
+                format=format,
+                source=ebook.url,
+            )
+            new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
+            new_ebf.save()
+            new_ebook = Ebook.objects.create(
+                edition=ebook.edition,
+                format=format,
+                provider='Unglue.it',
+                url=new_ebf.file.url,
+                rights=ebook.rights,
+                filesize=filesize,
+                version_label=ebook.version_label,
+                version_iter=ebook.version_iter,
+            )
+            new_ebf.ebook = new_ebook
+            new_ebf.save()
+            return new_ebf
+
+def type_for_url(url, content_type=None):
+    if not url:
+        return ''
+    if url.find('books.openedition.org') >= 0:
+        return 'online'
+    ct = content_type if content_type else contenttyper.calc_type(url)
+    if re.search("pdf", ct):
+        return "pdf"
+    elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
+        return "pdf"
+    elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
+        return "epub"
+    elif re.search("text/plain", ct):
+        return "text"
+    elif re.search("text/html", ct):
+        if url.find('oapen.org/view') >= 0:
+            return "html"
+        return "online"
+    elif re.search("epub", ct):
+        return "epub"
+    elif re.search("mobi", ct):
+        return "mobi"
+    return "other"
+
+class ContentTyper(object):
+    """ """
+    def __init__(self):
+        self.last_call = dict()
+
+    def content_type(self, url):
+        try:
+            r = requests.head(url)
+            return r.headers.get('content-type')
+        except:
+            return None
+
+    def calc_type(self, url):
+        delay = 1
+        # is there a delay associated with the url
+        netloc = urlparse.urlparse(url).netloc
+
+        # wait if necessary
+        last_call = self.last_call.get(netloc)
+        if last_call is not None:
+            now = time.time()
+            min_time_next_call = last_call + delay
+            if min_time_next_call > now:
+                time.sleep(min_time_next_call - now)
+
+        self.last_call[netloc] = time.time()
+
+        # compute the content-type
+        return self.content_type(url)
+
+contenttyper = ContentTyper()
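
For illustration, a minimal sketch of type_for_url when the caller already has a content type; the URLs and types below are invented. When no content_type is supplied, it falls back to contenttyper.calc_type, which issues a HEAD request and throttles itself to one request per second per host:

from regluit.core.loaders.utils import type_for_url

# an explicit content-type wins over URL sniffing
assert type_for_url('https://example.org/dl/book.pdf', content_type='application/pdf') == 'pdf'
# octet-stream responses fall back to matching the extension in the URL
assert type_for_url('https://example.org/dl/book.epub', content_type='application/octet-stream') == 'epub'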

@ -1,17 +0,0 @@
import os

from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand

from regluit.core.loaders import doab

class Command(BaseCommand):
    help = "load doab books"
    args = "<limit> <file_name>"

    def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):

        command_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(command_dir, file_name)
        doab.load_doab_records(file_path, limit=int(limit))

@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand

from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook

class Command(BaseCommand):
    help = "harvest downloadable ebooks from 'online' ebooks"
    args = "<limit>"

    def handle(self, limit=0, **options):
        limit = int(limit) if limit else 0
        onlines = Ebook.objects.filter(format='online')
        done = 0
        for online in onlines:
            new_ebf = dl_online(online)
            if new_ebf:
                done += 1
                if done > limit:
                    break
        print 'harvested {} ebooks'.format(done)
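
If this command file were saved as, say, core/management/commands/harvest_online_ebooks.py (a hypothetical name; file paths are not shown in this diff), it would be invoked through manage.py in the usual way:

# python manage.py harvest_online_ebooks 100   (hypothetical command name)
# Harvesting stops once the count of converted ebooks exceeds the given limit.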

@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand

from regluit.core.loaders import doab

class Command(BaseCommand):
    help = "load doab books by doab_id via oai"
    args = "<doab_id>"

    def handle(self, doab_id, **options):
        doab.add_by_doab(doab_id)

@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand

from regluit.core.loaders import doab

class Command(BaseCommand):
    help = "load doab books via oai"
    args = "<from_year> <limit>"

    def handle(self, from_year=None, limit=None, **options):
        from_year = int(from_year) if from_year else None
        limit = int(limit) if limit else None
        if limit:
            doab.load_doab_oai(from_year=from_year, limit=limit)
        else:
            if from_year:
                doab.load_doab_oai(from_year=from_year)
            else:
                doab.load_doab_oai()

@ -1083,7 +1083,7 @@ class EbookFile(models.Model):
             source=self.file.url
         )
 
-        new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf)
+        new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
         new_mobi_ebf.save()
         if self.ebook:
             new_ebook = Ebook.objects.create(

830 core/tests.py
File diff suppressed because it is too large

@ -21,6 +21,7 @@ from regluit.core.bookloader import (
 from regluit.core.parameters import WORK_IDENTIFIERS
 
 from regluit.core.loaders import add_by_webpage
+from regluit.core.loaders.doab import add_by_doab
 from regluit.core.loaders.utils import ids_from_urls
 from regluit.frontend.forms import EditionForm, IdentifierForm
 
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
     if edition:
         return user_edition(edition, user)
 
+    if identifiers.has_key('doab'):
+        edition = add_by_doab(identifiers['doab'])
+        if edition:
+            return user_edition(edition, user)
+
     if identifiers.has_key('oclc'):
         edition = add_by_oclc(identifiers['oclc'])
         if edition:

@ -6,16 +6,11 @@ PyJWT==1.4.1
 PyPDF2==1.23
 PyGithub==1.15.0
 PyYAML==3.11
-git+git://github.com/urschrei/pyzotero.git@v0.9.51
-SPARQLWrapper==1.6.4
-WebOb==1.2.3
-WebTest==1.4.0
 amqp==1.4.9
 anyjson==0.3.3
 billiard==3.3.0.23
 awscli==1.10.26
 boto==2.42.0
-#git+ssh://git@github.com/Gluejar/boto.git@2.3.0
 celery==3.1.23
 certifi==2016.2.28
 # pip installing pillow seems to delete distribute
@ -33,7 +28,6 @@ django-jsonfield==1.0.0
 #django-kombu==0.9.4
 django-maintenancemode==0.11.2
 django-mptt==0.8.5
-#django-nose-selenium==0.7.3
 #django-notification==0.2
 git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6
 django-registration==2.1.2
@ -42,9 +36,7 @@ django-smtp-ssl==1.0
 django-storages==1.4.1
 django-tastypie==0.13.3
 django-transmeta==0.7.3
-feedparser==5.1.2
 fef-questionnaire==4.0.1
-freebase==1.0.8
 #gitenberg.metadata==0.1.6
 git+https://github.com/gitenberg-dev/gitberg-build
 #git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
@ -53,7 +45,7 @@ html5lib==1.0b3
 httplib2==0.7.5
 isodate==0.5.1
 kombu==3.0.35
-lxml==2.3.5
+lxml==4.2.1
 defusedxml==0.4.1
 mechanize==0.2.5
 mimeparse==0.1.3
@ -66,6 +58,7 @@ paramiko==1.14.1
 postmonkey==1.0b
 pycrypto==2.6
 pymarc==3.0.2
+pyoai==2.5.0
 pyparsing==2.0.3
 python-dateutil==2.5.3
 python-mimeparse==0.1.4
@ -80,7 +73,7 @@ requests==2.10.0
 requests-mock==1.2.0
 requests-oauthlib==0.6.2
 selenium==2.53.1
-six==1.9.0
+six==1.11.0
 sorl-thumbnail==12.3
 ssh==1.7.14
 stevedore==1.12.0

@ -1,6 +1,10 @@
 from django.conf.global_settings import LANGUAGES
 
 lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
+code2lang = dict(LANGUAGES)
 
 def get_language_code(language):
-    return lang2code.get(language.lower().strip(), '')
+    language = language.lower().strip()
+    if language in code2lang:
+        return language
+    return lang2code.get(language, '')
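
A sketch of the new lookup behavior, assuming Django's global LANGUAGES includes the pair ('en', 'English'):

# >>> get_language_code('English')
# 'en'
# >>> get_language_code('en')       # already a valid code: passed through unchanged
# 'en'
# >>> get_language_code('no such language')
# ''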