Merge remote-tracking branch 'Gluejar/master' into production
commit
89b4221dcd
|
@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
|
|||
editions = facet.facet_object.filter_model("Edition",editions).distinct()
|
||||
for edition in editions:
|
||||
edition_prod = product(edition, facet.facet_object)
|
||||
if edition_prod:
|
||||
if edition_prod is not None:
|
||||
feed.append(edition_prod)
|
||||
return etree.tostring(feed, pretty_print=True)
|
||||
|
||||
|
@ -34,7 +34,7 @@ def onix_feed_for_work(work):
|
|||
feed.append(header(work))
|
||||
for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
|
||||
edition_prod = product(edition)
|
||||
if edition_prod:
|
||||
if edition_prod is not None:
|
||||
feed.append(product(edition))
|
||||
return etree.tostring(feed, pretty_print=True)
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
54230
bookdata/doab_auths.json
54230
bookdata/doab_auths.json
File diff suppressed because it is too large
Load Diff
|
@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
|
|||
|
||||
def add_by_oclc_from_google(oclc):
|
||||
if oclc:
|
||||
logger.info("adding book by oclc %s", oclc)
|
||||
logger.info(u"adding book by oclc %s", oclc)
|
||||
else:
|
||||
return None
|
||||
try:
|
||||
|
@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
|
|||
try:
|
||||
results = _get_json(url, {"q": '"OCLC%s"' % oclc})
|
||||
except LookupFailure, e:
|
||||
logger.exception("lookup failure for %s", oclc)
|
||||
logger.exception(u"lookup failure for %s", oclc)
|
||||
return None
|
||||
if not results.has_key('items') or not results['items']:
|
||||
logger.warn("no google hits for %s", oclc)
|
||||
logger.warn(u"no google hits for %s", oclc)
|
||||
return None
|
||||
|
||||
try:
|
||||
|
@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
|
|||
models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
|
||||
return e
|
||||
except LookupFailure, e:
|
||||
logger.exception("failed to add edition for %s", oclc)
|
||||
logger.exception(u"failed to add edition for %s", oclc)
|
||||
except IntegrityError, e:
|
||||
logger.exception("google books data for %s didn't fit our db", oclc)
|
||||
logger.exception(u"google books data for %s didn't fit our db", oclc)
|
||||
return None
|
||||
|
||||
def valid_isbn(isbn):
    """Return the cleaned form of isbn, or None if cleaning fails.

    The value is passed through identifier_cleaner('isbn'); any error raised
    by the cleaner is logged with its traceback and reported as None.
    """
    try:
        return identifier_cleaner('isbn')(isbn)
    except Exception:
        # Best-effort validation: swallow cleaner errors, but (unlike a bare
        # except) let KeyboardInterrupt and SystemExit propagate.
        logger.exception(u"invalid isbn: %s", isbn)
        return None
|
||||
|
||||
def add_by_isbn(isbn, work=None, language='xx', title=''):
|
||||
|
@ -88,13 +88,13 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
|
|||
try:
|
||||
e = add_by_isbn_from_google(isbn, work=work)
|
||||
except LookupFailure:
|
||||
logger.exception("failed google lookup for %s", isbn)
|
||||
logger.exception(u"failed google lookup for %s", isbn)
|
||||
# try again some other time
|
||||
return None
|
||||
if e:
|
||||
return e
|
||||
|
||||
logger.info("null came back from add_by_isbn_from_google: %s", isbn)
|
||||
logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
|
||||
|
||||
# if there's a title, we want to create stub editions and
|
||||
# works, even if google doesn't know about it # but if it's not valid,
|
||||
|
@ -129,10 +129,10 @@ def get_google_isbn_results(isbn):
|
|||
try:
|
||||
results = _get_json(url, {"q": "isbn:%s" % isbn})
|
||||
except LookupFailure:
|
||||
logger.exception("lookup failure for %s", isbn)
|
||||
logger.exception(u"lookup failure for %s", isbn)
|
||||
return None
|
||||
if not results.has_key('items') or not results['items']:
|
||||
logger.warn("no google hits for %s", isbn)
|
||||
logger.warn(u"no google hits for %s", isbn)
|
||||
return None
|
||||
return results
|
||||
|
||||
|
@ -201,7 +201,7 @@ def update_edition(edition):
|
|||
# if the language of the edition no longer matches that of the parent work,
|
||||
# attach edition to the
|
||||
if edition.work.language != language:
|
||||
logger.info("reconnecting %s since it is %s instead of %s",
|
||||
logger.info(u"reconnecting %s since it is %s instead of %s",
|
||||
googlebooks_id, language, edition.work.language)
|
||||
old_work = edition.work
|
||||
|
||||
|
@ -210,7 +210,7 @@ def update_edition(edition):
|
|||
edition.work = new_work
|
||||
edition.save()
|
||||
for identifier in edition.identifiers.all():
|
||||
logger.info("moving identifier %s", identifier.value)
|
||||
logger.info(u"moving identifier %s", identifier.value)
|
||||
identifier.work = new_work
|
||||
identifier.save()
|
||||
if old_work and old_work.editions.count() == 0:
|
||||
|
@ -256,7 +256,7 @@ def add_by_isbn_from_google(isbn, work=None):
|
|||
edition.new = False
|
||||
return edition
|
||||
|
||||
logger.info("adding new book by isbn %s", isbn)
|
||||
logger.info(u"adding new book by isbn %s", isbn)
|
||||
results = get_google_isbn_results(isbn)
|
||||
if results:
|
||||
try:
|
||||
|
@ -267,9 +267,9 @@ def add_by_isbn_from_google(isbn, work=None):
|
|||
isbn=isbn
|
||||
)
|
||||
except LookupFailure, e:
|
||||
logger.exception("failed to add edition for %s", isbn)
|
||||
logger.exception(u"failed to add edition for %s", isbn)
|
||||
except IntegrityError, e:
|
||||
logger.exception("google books data for %s didn't fit our db", isbn)
|
||||
logger.exception(u"google books data for %s didn't fit our db", isbn)
|
||||
return None
|
||||
return None
|
||||
|
||||
|
@ -320,7 +320,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
|
|||
if results:
|
||||
item = results
|
||||
else:
|
||||
logger.info("loading metadata from google for %s", googlebooks_id)
|
||||
logger.info(u"loading metadata from google for %s", googlebooks_id)
|
||||
url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
|
||||
item = _get_json(url)
|
||||
d = item['volumeInfo']
|
||||
|
@ -343,7 +343,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
|
|||
if len(language) > 5:
|
||||
language = language[0:5]
|
||||
if work and work.language != language:
|
||||
logger.info("not connecting %s since it is %s instead of %s",
|
||||
logger.info(u"not connecting %s since it is %s instead of %s",
|
||||
googlebooks_id, language, work.language)
|
||||
work = None
|
||||
# isbn = None
|
||||
|
@ -371,7 +371,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
|
|||
try:
|
||||
e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
|
||||
e.new = False
|
||||
logger.warning(" whoa nellie, somebody else created an edition while we were working.")
|
||||
logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
|
||||
if work.new:
|
||||
work.delete()
|
||||
return e
|
||||
|
@ -404,19 +404,19 @@ def relate_isbn(isbn, cluster_size=1):
|
|||
"""add a book by isbn and then see if there's an existing work to add it to so as to make a
|
||||
cluster bigger than cluster_size.
|
||||
"""
|
||||
logger.info("finding a related work for %s", isbn)
|
||||
logger.info(u"finding a related work for %s", isbn)
|
||||
|
||||
edition = add_by_isbn(isbn)
|
||||
if edition is None:
|
||||
return None
|
||||
if edition.work is None:
|
||||
logger.info("didn't add related to null work")
|
||||
logger.info(u"didn't add related to null work")
|
||||
return None
|
||||
if edition.work.editions.count() > cluster_size:
|
||||
return edition.work
|
||||
for other_isbn in thingisbn(isbn):
|
||||
# 979's come back as 13
|
||||
logger.debug("other_isbn: %s", other_isbn)
|
||||
logger.debug(u"other_isbn: %s", other_isbn)
|
||||
if len(other_isbn) == 10:
|
||||
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
|
||||
related_edition = add_by_isbn(other_isbn, work=edition.work)
|
||||
|
@ -427,7 +427,7 @@ def relate_isbn(isbn, cluster_size=1):
|
|||
related_edition.work = edition.work
|
||||
related_edition.save()
|
||||
elif related_edition.work_id != edition.work_id:
|
||||
logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
|
||||
logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
|
||||
merge_works(related_edition.work, edition.work)
|
||||
if related_edition.work.editions.count() > cluster_size:
|
||||
return related_edition.work
|
||||
|
@ -438,7 +438,7 @@ def add_related(isbn):
|
|||
The initial seed ISBN will be added if it's not already there.
|
||||
"""
|
||||
# make sure the seed edition is there
|
||||
logger.info("adding related editions for %s", isbn)
|
||||
logger.info(u"adding related editions for %s", isbn)
|
||||
|
||||
new_editions = []
|
||||
|
||||
|
@ -446,14 +446,14 @@ def add_related(isbn):
|
|||
if edition is None:
|
||||
return new_editions
|
||||
if edition.work is None:
|
||||
logger.warning("didn't add related to null work")
|
||||
logger.warning(u"didn't add related to null work")
|
||||
return new_editions
|
||||
# this is the work everything will hang off
|
||||
work = edition.work
|
||||
other_editions = {}
|
||||
for other_isbn in thingisbn(isbn):
|
||||
# 979's come back as 13
|
||||
logger.debug("other_isbn: %s", other_isbn)
|
||||
logger.debug(u"other_isbn: %s", other_isbn)
|
||||
if len(other_isbn) == 10:
|
||||
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
|
||||
related_edition = add_by_isbn(other_isbn, work=work)
|
||||
|
@ -466,7 +466,7 @@ def add_related(isbn):
|
|||
related_edition.work = work
|
||||
related_edition.save()
|
||||
elif related_edition.work_id != work.id:
|
||||
logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id)
|
||||
logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
|
||||
work = merge_works(work, related_edition.work)
|
||||
else:
|
||||
if other_editions.has_key(related_language):
|
||||
|
@ -476,14 +476,14 @@ def add_related(isbn):
|
|||
|
||||
# group the other language editions together
|
||||
for lang_group in other_editions.itervalues():
|
||||
logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
|
||||
logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
|
||||
if len(lang_group) > 1:
|
||||
lang_edition = lang_group[0]
|
||||
logger.debug("lang_edition.id: %s", lang_edition.id)
|
||||
logger.debug(u"lang_edition.id: %s", lang_edition.id)
|
||||
# compute the distinct set of works to merge into lang_edition.work
|
||||
works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
|
||||
for w in works_to_merge:
|
||||
logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id)
|
||||
logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
|
||||
merged_work = merge_works(lang_edition.work, w)
|
||||
models.WorkRelation.objects.get_or_create(
|
||||
to_work=lang_group[0].work,
|
||||
|
@ -498,17 +498,21 @@ def thingisbn(isbn):
|
|||
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
|
||||
which come back as isbn_13')
|
||||
"""
|
||||
logger.info("looking up %s at ThingISBN", isbn)
|
||||
logger.info(u"looking up %s at ThingISBN", isbn)
|
||||
url = "https://www.librarything.com/api/thingISBN/%s" % isbn
|
||||
xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
|
||||
doc = ElementTree.fromstring(xml)
|
||||
return [e.text for e in doc.findall('isbn')]
|
||||
try:
|
||||
doc = ElementTree.fromstring(xml)
|
||||
return [e.text for e in doc.findall('isbn')]
|
||||
except SyntaxError:
|
||||
# LibraryThing down
|
||||
return []
|
||||
|
||||
|
||||
def merge_works(w1, w2, user=None):
|
||||
"""will merge the second work (w2) into the first (w1)
|
||||
"""
|
||||
logger.info("merging work %s into %s", w2.id, w1.id)
|
||||
logger.info(u"merging work %s into %s", w2.id, w1.id)
|
||||
# don't merge if the works are the same or at least one of the works has no id
|
||||
#(for example, when w2 has already been deleted)
|
||||
if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
|
||||
|
@ -583,7 +587,7 @@ def detach_edition(e):
|
|||
will detach edition from its work, creating a new stub work. if remerge=true, will see if
|
||||
there's another work to attach to
|
||||
"""
|
||||
logger.info("splitting edition %s from %s", e, e.work)
|
||||
logger.info(u"splitting edition %s from %s", e, e.work)
|
||||
w = models.Work(title=e.title, language=e.work.language)
|
||||
w.save()
|
||||
|
||||
|
@ -618,7 +622,7 @@ def add_openlibrary(work, hard_refresh=False):
|
|||
work.save()
|
||||
|
||||
# find the first ISBN match in OpenLibrary
|
||||
logger.info("looking up openlibrary data for work %s", work.id)
|
||||
logger.info(u"looking up openlibrary data for work %s", work.id)
|
||||
|
||||
e = None # openlibrary edition json
|
||||
w = None # openlibrary work json
|
||||
|
@ -633,7 +637,7 @@ def add_openlibrary(work, hard_refresh=False):
|
|||
try:
|
||||
e = _get_json(url, params, type='ol')
|
||||
except LookupFailure:
|
||||
logger.exception("OL lookup failed for %s", isbn_key)
|
||||
logger.exception(u"OL lookup failed for %s", isbn_key)
|
||||
e = {}
|
||||
if e.has_key(isbn_key):
|
||||
if e[isbn_key].has_key('details'):
|
||||
|
@ -673,7 +677,7 @@ def add_openlibrary(work, hard_refresh=False):
|
|||
)
|
||||
if e[isbn_key]['details'].has_key('works'):
|
||||
work_key = e[isbn_key]['details']['works'].pop(0)['key']
|
||||
logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
|
||||
logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
|
||||
models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
|
||||
try:
|
||||
w = _get_json("https://openlibrary.org" + work_key, type='ol')
|
||||
|
@ -691,14 +695,14 @@ def add_openlibrary(work, hard_refresh=False):
|
|||
if w.has_key('subjects') and len(w['subjects']) > len(subjects):
|
||||
subjects = w['subjects']
|
||||
except LookupFailure:
|
||||
logger.exception("OL lookup failed for %s", work_key)
|
||||
logger.exception(u"OL lookup failed for %s", work_key)
|
||||
if not subjects:
|
||||
logger.warn("unable to find work %s at openlibrary", work.id)
|
||||
logger.warn(u"unable to find work %s at openlibrary", work.id)
|
||||
return
|
||||
|
||||
# add the subjects to the Work
|
||||
for s in subjects:
|
||||
logger.info("adding subject %s to work %s", s, work.id)
|
||||
logger.info(u"adding subject %s to work %s", s, work.id)
|
||||
subject = models.Subject.set_by_name(s, work=work)
|
||||
|
||||
work.save()
|
||||
|
@ -716,9 +720,9 @@ def _get_json(url, params={}, type='gb'):
|
|||
if response.status_code == 200:
|
||||
return json.loads(response.content)
|
||||
else:
|
||||
logger.error("unexpected HTTP response: %s", response)
|
||||
logger.error(u"unexpected HTTP response: %s", response)
|
||||
if response.content:
|
||||
logger.error("response content: %s", response.content)
|
||||
logger.error(u"response content: %s", response.content)
|
||||
raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
|
||||
|
||||
|
||||
|
@ -766,7 +770,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
|
|||
ebook = models.Ebook()
|
||||
|
||||
if len(ebooks) > 1:
|
||||
logger.warning("There is more than one Ebook matching url {0}".format(url))
|
||||
logger.warning(u"There is more than one Ebook matching url {0}".format(url))
|
||||
|
||||
|
||||
ebook.format = format
|
||||
|
@ -826,8 +830,6 @@ def edition_for_etype(etype, metadata, default=None):
|
|||
for key in metadata.edition_identifiers.keys():
|
||||
return edition_for_ident(key, metadata.identifiers[key])
|
||||
|
||||
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
|
||||
|
||||
def load_ebookfile(url, etype):
|
||||
'''
|
||||
return a ContentFile if a new ebook has been loaded
|
||||
|
@ -960,8 +962,7 @@ class BasePandataLoader(object):
|
|||
if contentfile:
|
||||
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
|
||||
path = default_storage.save(contentfile_name, contentfile)
|
||||
lic = MATCH_LICENSE.search(metadata.rights_url)
|
||||
license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
|
||||
license = cc.license_from_cc_url(metadata.rights_url)
|
||||
ebf = models.EbookFile.objects.create(
|
||||
format=key,
|
||||
edition=edition,
|
||||
|
|
17
core/cc.py
17
core/cc.py
|
@ -1,8 +1,11 @@
|
|||
# coding=utf-8
|
||||
# mostly constants related to Creative Commons
|
||||
''' mostly constants related to Creative Commons
|
||||
# let's be DRY with these parameters
|
||||
|
||||
## need to add versioned CC entries
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
INFO_CC = (
|
||||
('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
|
||||
|
@ -162,3 +165,15 @@ def match_license(license_string):
|
|||
except ValueError:
|
||||
pass
|
||||
return RIGHTS_ALIAS.get(license_string, None)
|
||||
|
||||
# Captures the license code (e.g. "by-nc-nd") from a Creative Commons license
# URL. The dots are escaped so only the literal host "creativecommons.org"
# matches.
MATCH_LICENSE = re.compile(r'creativecommons\.org/licenses/([^/]+)/')


def license_from_cc_url(rights_url):
    """Map a rights URL to a short license label.

    Returns:
        None          when rights_url is empty or None
        'CC <CODE>'   for a creativecommons.org/licenses/<code>/ URL, with the
                      code upper-cased (e.g. 'CC BY-NC-ND')
        'OPENEDITION' for any other URL containing 'openedition.org'
        ''            for every other URL
    """
    if not rights_url:
        return None
    lic = MATCH_LICENSE.search(rights_url)
    if lic:
        return 'CC {}'.format(lic.group(1).upper())
    if 'openedition.org' in rights_url:
        return 'OPENEDITION'
    return ''
|
||||
|
||||
|
|
|
@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
|
|||
|
||||
def add_by_sitemap(url, maxnum=None):
    """Scrape the sitemap at url and load the book data it yields.

    maxnum is forwarded unchanged to scrape_sitemap; the return value is
    whatever add_from_bookdatas returns.
    """
    bookdatas = scrape_sitemap(url, maxnum=maxnum)
    return add_from_bookdatas(bookdatas)
|
||||
|
||||
def scrape_language(url):
    """Return the 'language' entry of the metadata scraped from url.

    The value is read with metadata.get('language'); presumably that is None
    when the scraper found no language -- confirm against the scraper class.
    """
    return get_scraper(url).metadata.get('language')
|
||||
|
||||
|
||||
|
|
|
@ -1,42 +1,54 @@
|
|||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
import logging
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
from itertools import islice
|
||||
|
||||
import requests
|
||||
|
||||
from django.db.models import (Q, F)
|
||||
from django.db.models import Q
|
||||
|
||||
from django.core.files.storage import default_storage
|
||||
from django.core.files.base import ContentFile
|
||||
from django.core.files.storage import default_storage
|
||||
|
||||
import regluit
|
||||
from oaipmh.client import Client
|
||||
from oaipmh.error import IdDoesNotExistError
|
||||
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
|
||||
|
||||
from regluit.core import bookloader, cc
|
||||
from regluit.core import models, tasks
|
||||
from regluit.core import bookloader
|
||||
from regluit.core.bookloader import add_by_isbn, merge_works
|
||||
from regluit.core.bookloader import merge_works
|
||||
from regluit.core.isbn import ISBN
|
||||
from regluit.core.loaders.utils import type_for_url
|
||||
from regluit.core.validation import valid_subject
|
||||
|
||||
from . import scrape_language
|
||||
from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
|
||||
def unlist(alist):
    """Return the first item of alist, or None when alist is empty or None."""
    return alist[0] if alist else None
|
||||
|
||||
|
||||
SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
|
||||
SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
|
||||
def store_doab_cover(doab_id, redo=False):
|
||||
|
||||
|
||||
"""
|
||||
returns tuple: 1) cover URL, 2) whether newly created (boolean)
|
||||
"""
|
||||
|
||||
cover_file_name= '/doab/%s/cover' % (doab_id)
|
||||
|
||||
|
||||
cover_file_name = '/doab/%s/cover' % (doab_id)
|
||||
|
||||
# if we don't want to redo and the cover exists, return the URL of the cover
|
||||
|
||||
|
||||
if not redo and default_storage.exists(cover_file_name):
|
||||
return (default_storage.url(cover_file_name), False)
|
||||
|
||||
|
||||
# download cover image to cover_file
|
||||
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
|
||||
try:
|
||||
|
@ -44,16 +56,16 @@ def store_doab_cover(doab_id, redo=False):
|
|||
if r.status_code == 302:
|
||||
redirurl = r.headers['Location']
|
||||
if redirurl.startswith(u'ftp'):
|
||||
springerftp = springercover.match(redirurl)
|
||||
springerftp = SPRINGER_COVER.match(redirurl)
|
||||
if springerftp:
|
||||
redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1))
|
||||
redirurl = SPRINGER_IMAGE.format(springerftp.groups(1))
|
||||
r = requests.get(redirurl)
|
||||
else:
|
||||
r = requests.get(url)
|
||||
r = requests.get(url)
|
||||
cover_file = ContentFile(r.content)
|
||||
cover_file.content_type = r.headers.get('content-type', '')
|
||||
|
||||
path = default_storage.save(cover_file_name, cover_file)
|
||||
default_storage.save(cover_file_name, cover_file)
|
||||
return (default_storage.url(cover_file_name), True)
|
||||
except Exception, e:
|
||||
# if there is a problem, return None for cover URL
|
||||
|
@ -74,52 +86,51 @@ def update_cover_doab(doab_id, edition, store_cover=True):
|
|||
edition.cover_image = cover_url
|
||||
edition.save()
|
||||
return cover_url
|
||||
else:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
def attach_more_doab_metadata(edition, description, subjects,
|
||||
publication_date, publisher_name=None, language=None, authors=u''):
|
||||
|
||||
|
||||
"""
|
||||
for given edition, attach description, subjects, publication date to
|
||||
corresponding Edition and Work
|
||||
"""
|
||||
# if edition doesn't have a publication date, update it
|
||||
# if edition doesn't have a publication date, update it
|
||||
if not edition.publication_date:
|
||||
edition.publication_date = publication_date
|
||||
|
||||
|
||||
# if edition.publisher_name is empty, set it
|
||||
if not edition.publisher_name:
|
||||
edition.set_publisher(publisher_name)
|
||||
|
||||
|
||||
edition.save()
|
||||
|
||||
|
||||
# attach description to work if it's not empty
|
||||
work = edition.work
|
||||
if not work.description:
|
||||
work.description = description
|
||||
|
||||
|
||||
# update subjects
|
||||
for s in subjects:
|
||||
if valid_subject(s):
|
||||
models.Subject.set_by_name(s, work=work)
|
||||
|
||||
|
||||
# set reading level of work if it's empty; doab is for adults.
|
||||
if not work.age_level:
|
||||
work.age_level = '18-'
|
||||
|
||||
if language:
|
||||
|
||||
if language and language != 'xx':
|
||||
work.language = language
|
||||
work.save()
|
||||
|
||||
|
||||
if authors and authors == authors: # test for authors != NaN
|
||||
authlist = creator_list(authors)
|
||||
if edition.authors.all().count() < len(authlist):
|
||||
edition.authors.clear()
|
||||
if authlist is not None:
|
||||
for [rel,auth] in authlist:
|
||||
for [rel, auth] in authlist:
|
||||
edition.add_author(auth, rel)
|
||||
|
||||
|
||||
return edition
|
||||
|
||||
def add_all_isbns(isbns, work, language=None, title=None):
|
||||
|
@ -128,69 +139,73 @@ def add_all_isbns(isbns, work, language=None, title=None):
|
|||
first_edition = None
|
||||
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
|
||||
if edition:
|
||||
first_edition = first_edition if first_edition else edition
|
||||
if work and (edition.work_id != work.id):
|
||||
first_edition = first_edition if first_edition else edition
|
||||
if work and (edition.work_id != work.id):
|
||||
if work.created < edition.work.created:
|
||||
work = merge_works(work, edition.work)
|
||||
else:
|
||||
work = merge_works(edition.work, work)
|
||||
else:
|
||||
work = edition.work
|
||||
return first_edition
|
||||
return first_edition
|
||||
|
||||
def load_doab_edition(title, doab_id, url, format, rights,
|
||||
language, isbns,
|
||||
provider, **kwargs):
|
||||
|
||||
|
||||
"""
|
||||
load a record from doabooks.org represented by input parameters and return an ebook
|
||||
"""
|
||||
logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
|
||||
if language and isinstance(language, list):
|
||||
language = language[0]
|
||||
|
||||
if language == 'xx' and format == 'online':
|
||||
language = scrape_language(url)
|
||||
# check to see whether the Edition hasn't already been loaded first
|
||||
# search by url
|
||||
ebooks = models.Ebook.objects.filter(url=url)
|
||||
|
||||
|
||||
# 1 match
|
||||
# > 1 matches
|
||||
# 0 match
|
||||
|
||||
# simplest case -- if match (1 or more), we could check whether any
|
||||
# ebook.edition.work has a doab id matching given doab_id
|
||||
|
||||
|
||||
# put a migration to force Ebook.url to be unique id
|
||||
|
||||
|
||||
# if yes, then return one of the Edition(s) whose work is doab_id
|
||||
# if no, then
|
||||
# if no, then
|
||||
ebook = None
|
||||
if len(ebooks) > 1:
|
||||
raise Exception("There is more than one Ebook matching url {0}".format(url))
|
||||
elif len(ebooks) == 1:
|
||||
raise Exception("There is more than one Ebook matching url {0}".format(url))
|
||||
elif len(ebooks) == 1:
|
||||
ebook = ebooks[0]
|
||||
doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
|
||||
work=ebook.edition.work)
|
||||
# update the cover id
|
||||
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
|
||||
work=ebook.edition.work)
|
||||
# update the cover id
|
||||
cover_url = update_cover_doab(doab_id, ebook.edition)
|
||||
|
||||
|
||||
# attach more metadata
|
||||
attach_more_doab_metadata(ebook.edition,
|
||||
description=kwargs.get('description'),
|
||||
subjects=kwargs.get('subject'),
|
||||
publication_date=kwargs.get('date'),
|
||||
publisher_name=kwargs.get('publisher'),
|
||||
language=language,
|
||||
authors=kwargs.get('authors'),)
|
||||
attach_more_doab_metadata(
|
||||
ebook.edition,
|
||||
description=unlist(kwargs.get('description')),
|
||||
subjects=kwargs.get('subject'),
|
||||
publication_date=unlist(kwargs.get('date')),
|
||||
publisher_name=unlist(kwargs.get('publisher')),
|
||||
language=language,
|
||||
authors=kwargs.get('creator'),
|
||||
)
|
||||
# make sure all isbns are added
|
||||
add_all_isbns(isbns, None, language=language, title=title)
|
||||
return ebook
|
||||
|
||||
add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
|
||||
return ebook.edition
|
||||
|
||||
# remaining case --> no ebook, load record, create ebook if there is one.
|
||||
assert len(ebooks) == 0
|
||||
|
||||
assert not ebooks
|
||||
|
||||
|
||||
# we need to find the right Edition/Work to tie Ebook to...
|
||||
|
||||
|
||||
# look for the Edition with which to associate ebook.
|
||||
# loop through the isbns to see whether we get one that is not None
|
||||
work = None
|
||||
|
@ -206,16 +221,16 @@ def load_doab_edition(title, doab_id, url, format, rights,
|
|||
edition = ident.work.preferred_edition
|
||||
work = edition.work
|
||||
break
|
||||
|
||||
|
||||
if edition is not None:
|
||||
# if this is a new edition, then add related editions asynchronously
|
||||
if getattr(edition,'new', False):
|
||||
if getattr(edition, 'new', False):
|
||||
tasks.populate_edition.delay(edition.isbn_13)
|
||||
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
|
||||
work=edition.work)
|
||||
work=edition.work)
|
||||
|
||||
# we need to create Edition(s) de novo
|
||||
else:
|
||||
# we need to create Edition(s) de novo
|
||||
else:
|
||||
# if there is a Work with doab_id already, attach any new Edition(s)
|
||||
try:
|
||||
work = models.Identifier.objects.get(type='doab', value=doab_id).work
|
||||
|
@ -226,11 +241,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
|
|||
work = models.Work(language='xx', title=title, age_level='18-')
|
||||
work.save()
|
||||
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
|
||||
work=work)
|
||||
|
||||
work=work)
|
||||
|
||||
# if work has any ebooks already, attach the ebook to the corresponding edition
|
||||
# otherwise pick the first one
|
||||
# pick the first edition as the one to tie ebook to
|
||||
# pick the first edition as the one to tie ebook to
|
||||
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
|
||||
Q(ebooks__isnull=False)).distinct()
|
||||
if editions_with_ebooks:
|
||||
|
@ -240,73 +255,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
|
|||
else:
|
||||
edition = models.Edition(work=work, title=title)
|
||||
edition.save()
|
||||
|
||||
|
||||
# make the edition the selected_edition of the work
|
||||
work.selected_edition = edition
|
||||
work.save()
|
||||
|
||||
if format in ('pdf', 'epub', 'mobi'):
|
||||
|
||||
if format in ('pdf', 'epub', 'mobi', 'html', 'online'):
|
||||
ebook = models.Ebook()
|
||||
ebook.format = format
|
||||
ebook.provider = provider
|
||||
ebook.url = url
|
||||
ebook.url = url
|
||||
ebook.rights = rights
|
||||
# tie the edition to ebook
|
||||
ebook.edition = edition
|
||||
if format == "online":
|
||||
ebook.active = False
|
||||
ebook.save()
|
||||
|
||||
|
||||
# update the cover id (could be done separately)
|
||||
cover_url = update_cover_doab(doab_id, edition)
|
||||
|
||||
|
||||
# attach more metadata
|
||||
attach_more_doab_metadata(edition,
|
||||
description=kwargs.get('description'),
|
||||
subjects=kwargs.get('subject'),
|
||||
publication_date=kwargs.get('date'),
|
||||
publisher_name=kwargs.get('publisher'),
|
||||
authors=kwargs.get('authors'),)
|
||||
return ebook
|
||||
attach_more_doab_metadata(
|
||||
edition,
|
||||
description=unlist(kwargs.get('description')),
|
||||
subjects=kwargs.get('subject'),
|
||||
publication_date=unlist(kwargs.get('date')),
|
||||
publisher_name=unlist(kwargs.get('publisher')),
|
||||
authors=kwargs.get('creator'),
|
||||
)
|
||||
return edition
|
||||
|
||||
|
||||
def load_doab_records(fname, limit=None):
|
||||
|
||||
success_count = 0
|
||||
ebook_count = 0
|
||||
|
||||
records = json.load(open(fname))
|
||||
|
||||
for (i, book) in enumerate(islice(records,limit)):
|
||||
d = dict(book)
|
||||
d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
|
||||
try:
|
||||
ebook = load_doab_edition(**d)
|
||||
success_count += 1
|
||||
if ebook:
|
||||
ebook_count +=1
|
||||
except Exception, e:
|
||||
logger.error(e)
|
||||
logger.error(book)
|
||||
|
||||
logger.info("Number of records processed: " + str(success_count))
|
||||
logger.info("Number of ebooks processed: " + str(ebook_count))
|
||||
|
||||
"""
|
||||
#
|
||||
#tools to parse the author lists in doab.csv
|
||||
from pandas import DataFrame
|
||||
url = "http://www.doabooks.org/doab?func=csv"
|
||||
df_csv = DataFrame.from_csv(url)
|
||||
#
|
||||
|
||||
out=[]
|
||||
for val in df_csv.values:
|
||||
isbn = split_isbns(val[0])
|
||||
if isbn:
|
||||
auths = []
|
||||
if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
|
||||
auths = creator_list(val[2])
|
||||
out.append(( isbn[0], auths))
|
||||
open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
|
||||
"""
|
||||
|
||||
au = re.compile(r'\(Authors?\)', flags=re.U)
|
||||
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
|
||||
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
|
||||
|
@ -326,14 +309,14 @@ def fnf(auth):
|
|||
if len(parts) == 1:
|
||||
return parts[0].strip()
|
||||
elif len(parts) == 2:
|
||||
return u'{} {}'.format(parts[1].strip(),parts[0].strip())
|
||||
return u'{} {}'.format(parts[1].strip(), parts[0].strip())
|
||||
else:
|
||||
if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
|
||||
return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
|
||||
if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
|
||||
return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
|
||||
#print auth
|
||||
#print re.search(namelist,auth).group(0)
|
||||
return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
|
||||
|
||||
return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
|
||||
|
||||
|
||||
def creator(auth, editor=False):
|
||||
auth = auth.strip()
|
||||
|
@ -349,68 +332,88 @@ def creator(auth, editor=False):
|
|||
return [u'dsr', fnf(ds.sub(u'', auth))]
|
||||
if re.search(cm, auth):
|
||||
return [u'com', fnf(cm.sub(u'', auth))]
|
||||
|
||||
|
||||
auth = au.sub('', auth)
|
||||
return ['aut', fnf(auth)]
|
||||
|
||||
def split_auths(auths):
    """Split a raw creator string into a list of individual name strings.

    Strings containing ';' or '/' are split with ``namesep2``.  Otherwise the
    stripped string is matched against ``namelist``; when it matches and none
    of the "keep together" markers below is present, it is split with
    ``namesep``.  In every other case the input is returned unsplit as a
    one-element list.
    """
    # Explicit delimiters take precedence over any heuristics.
    if ';' in auths or '/' in auths:
        return namesep2.split(auths)

    matched = namelist.match(auths.strip())
    if not matched:
        return [auths]

    third = matched.group(3)
    first = matched.group(1)
    # Particles ('de', 'da'), a 'Jr.' suffix or an ' e ' conjunction mean the
    # string should not be broken up.
    keep_whole = (
        third.endswith(' de')
        or ' de ' in third
        or third.endswith(' da')
        or first.endswith(' Jr.')
        or ' e ' in first
    )
    if keep_whole:
        return [auths]
    return namesep.split(auths)
|
||||
|
||||
def split_isbns(isbns):
    """Return the string form of every valid ISBN found in *isbns*.

    The input is split with ``isbnsep``; each piece is wrapped in ``ISBN`` and
    kept (via ``to_string()``) only when its ``valid`` attribute is truthy.
    Input order is preserved.
    """
    candidates = (ISBN(piece) for piece in isbnsep.split(isbns))
    return [candidate.to_string() for candidate in candidates if candidate.valid]
|
||||
|
||||
def creator_list(creators):
|
||||
auths = []
|
||||
if re.search(edlist, creators):
|
||||
for auth in split_auths(edlist.sub(u'', creators)):
|
||||
if auth:
|
||||
auths.append(creator(auth, editor=True))
|
||||
else:
|
||||
for auth in split_auths(unicode(creators)):
|
||||
if auth:
|
||||
auths.append(creator(auth))
|
||||
for auth in creators:
|
||||
auths.append(creator(auth))
|
||||
return auths
|
||||
|
||||
def load_doab_auths(fname, limit=None):
|
||||
doab_auths = json.load(open(fname))
|
||||
recnum = 0
|
||||
failed = 0
|
||||
for [isbnraw, authlist] in doab_auths:
|
||||
isbn = ISBN(isbnraw).to_string()
|
||||
try:
|
||||
work = models.Identifier.objects.get(type='isbn',value=isbn).work
|
||||
except models.Identifier.DoesNotExist:
|
||||
print 'isbn = {} not found'.format(isbnraw)
|
||||
failed += 1
|
||||
if work.preferred_edition.authors.all().count() < len(authlist):
|
||||
work.preferred_edition.authors.clear()
|
||||
if authlist is None:
|
||||
print "null authlist; isbn={}".format(isbn)
|
||||
DOAB_OAIURL = 'https://www.doabooks.org/oai'
|
||||
DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
|
||||
mdregistry = MetadataRegistry()
|
||||
mdregistry.registerReader('oai_dc', oai_dc_reader)
|
||||
doab_client = Client(DOAB_OAIURL, mdregistry)
|
||||
|
||||
def add_by_doab(doab_id, record=None):
|
||||
try:
|
||||
record = record if record else doab_client.getRecord(
|
||||
metadataPrefix='oai_dc',
|
||||
identifier='oai:doab-books:{}'.format(doab_id)
|
||||
)
|
||||
metadata = record[1].getMap()
|
||||
isbns = []
|
||||
url = None
|
||||
for ident in metadata.pop('identifier', []):
|
||||
if ident.startswith('ISBN: '):
|
||||
isbn = ISBN(ident[6:])
|
||||
if isbn.error:
|
||||
continue
|
||||
isbn.validate()
|
||||
isbns.append(isbn.to_string())
|
||||
elif ident.find('doabooks.org') >= 0:
|
||||
# should already know the doab_id
|
||||
continue
|
||||
for [rel,auth] in authlist:
|
||||
work.preferred_edition.add_author(auth, rel)
|
||||
recnum +=1
|
||||
if limit and recnum > limit:
|
||||
break
|
||||
logger.info("Number of records processed: " + str(recnum))
|
||||
logger.info("Number of missing isbns: " + str(failed))
|
||||
|
||||
else:
|
||||
url = ident
|
||||
language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
|
||||
urls = online_to_download(url)
|
||||
edition = None
|
||||
for dl_url in urls:
|
||||
format = type_for_url(dl_url)
|
||||
if 'format' in metadata:
|
||||
del metadata['format']
|
||||
edition = load_doab_edition(
|
||||
unlist(metadata.pop('title', None)),
|
||||
doab_id,
|
||||
dl_url,
|
||||
format,
|
||||
cc.license_from_cc_url(unlist(metadata.pop('rights', None))),
|
||||
language,
|
||||
isbns,
|
||||
url_to_provider(dl_url) if dl_url else None,
|
||||
**metadata
|
||||
)
|
||||
return edition
|
||||
except IdDoesNotExistError:
|
||||
return None
|
||||
|
||||
|
||||
def getdoab(url):
    """Return the first group captured by ``DOAB_PATT`` in *url*, else False."""
    found = DOAB_PATT.search(url)
    return found.group(1) if found else False
|
||||
|
||||
def load_doab_oai(from_year=2000, limit=100000):
    '''
    use oai feed to get oai updates

    Lists ``oai_dc`` records from ``doab_client`` starting at 1 January of
    *from_year* and calls ``add_by_doab`` for every identifier in which
    ``getdoab`` finds a DOAB id.

    The limit is checked once per record, after all of that record's
    identifiers have been handled; the loop stops when more than *limit* ids
    have been collected.
    '''
    from_ = datetime.datetime(year=from_year, month=1, day=1)
    doab_ids = []
    for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
        # records without a metadata part carry nothing to load
        if not record[1]:
            continue
        idents = record[1].getMap()['identifier']
        for ident in idents or []:
            doab = getdoab(ident)
            if not doab:
                continue
            doab_ids.append(doab)
            e = add_by_doab(doab, record=record)
            if e is None:
                # add_by_doab returns None when nothing could be loaded;
                # log it and keep going instead of failing on e.title
                logger.warning(u'not loaded:\t{}'.format(doab))
            else:
                logger.info(u'updated:\t{}\t{}'.format(doab, e.title))
        if len(doab_ids) > limit:
            break
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
"""
|
||||
doab_utils.py
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from regluit.utils.lang import get_language_code
|
||||
from .utils import get_soup
|
||||
|
||||
# utility functions for converting lists of individual items into individual items
|
||||
|
||||
# let's do a mapping of the DOAB languages into the language codes used
|
||||
# mostly, we just handle misspellings
|
||||
# also null -> xx
|
||||
|
||||
# Lower-cased language labels (mostly misspellings and non-English names)
# mapped to ISO 639-1 codes.  Consulted by doab_lang_to_iso_639_1 only when
# get_language_code finds nothing.
EXTRA_LANG_MAP = dict([
    (u'chinese', 'zh'),  # was 'de' (German); ISO 639-1 for Chinese is 'zh'
    (u'deutsch', 'de'),
    (u'eng', 'en'),
    (u'englilsh', 'en'),
    (u'englisch', 'en'),
    (u'espanol', 'es'),
    (u'ger', 'de'),
    (u'fra', 'fr'),
    (u'fre', 'fr'),
    (u'francese', 'fr'),
    (u'ita', 'it'),
    (u'italiano', 'it'),
    (u'norwegian', 'no'),
    (u'por', 'pt'),
    (u'portugese', 'pt'),
    (u'slovene', 'sl'),
    (u'spa', 'es'),
    (u'spagnolo', 'es'),
])
|
||||
|
||||
sep = re.compile(r'[ \-;^,/]+')
|
||||
def doab_lang_to_iso_639_1(lang):
    """Map a language label to a language code, or "xx" when unknown.

    Only the first non-empty token of *lang* (tokens are delimited by ``sep``)
    is considered.  It is looked up with ``get_language_code`` first and, if
    that yields nothing, lower-cased and looked up in ``EXTRA_LANG_MAP``.

    Returns "xx" for None, an empty value, a value made only of separators,
    or an unrecognised label.
    """
    if not lang:
        return "xx"

    # Skip empty tokens so a value that starts with a separator (e.g. a
    # leading space) still resolves to its first real word.
    tokens = [token for token in sep.split(lang) if token]
    if not tokens:
        return "xx"

    lang = tokens[0]
    code = get_language_code(lang)
    if code:
        return code
    return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
|
||||
|
||||
|
||||
# Host name (URL netloc) -> human-readable provider name.
DOMAIN_TO_PROVIDER = {
    u'www.doabooks.org': u'Directory of Open Access Books',
    u'www.oapen.org': u'OAPEN Library',
    u'books.openedition.org': u'OpenEdition Books',
    u'digitalcommons.usu.edu': u'DigitalCommons, Utah State University',
    u'www.aupress.ca': u'Athabasca University Press',
    u'dspace.ucalgary.ca': u'Institutional Repository at the University of Calgary',
    u'www.degruyter.com': u'De Gruyter Online',
    u'dx.doi.org': u'DOI Resolver',
    u'www.openbookpublishers.com': u'Open Book Publishers',
    u'www.adelaide.edu.au': u'University of Adelaide',
    u'hdl.handle.net': u'Handle Proxy',
    u'link.springer.com': u'Springer',
    u'www.bloomsburyacademic.com': u'Bloomsbury Academic',
    u'www.ledizioni.it': u'Ledizioni',
    u'ccdigitalpress.org': u'Computers and Composition Digital Press',
    u'leo.cilea.it': u'LEO ',
    u'www.springerlink.com': u'Springer',
    u'www.palgraveconnect.com': u'Palgrave Connect',
    u'www.ubiquitypress.com': u'Ubiquity Press',
    u'ebooks.iospress.nl': u'IOS Press Ebooks',
    u'antropologie.zcu.cz': u'AntropoWeb',
    u'www.unito.it': u"University of Turin",
    u'leo.cineca.it': u'Letteratura Elettronica Online',
    u'hw.oeaw.ac.at': u'Austrian Academy of Sciences',
    u'www.co-action.net': u'Co-Action Publishing',
    u'www.aliprandi.org': u'Simone Aliprandi',
    u'www.maestrantonella.it': u'maestrantonella.it',
    u'www.antilia.to.it': u'antilia.to.it',
    u'www.scribd.com': u'Scribd',
    u'ledibooks.com': u'LediBooks',
    u'press.openedition.org': u'OpenEdition Press',
    u'oapen.org': u'OAPEN Library',
    u'www.ebooks.iospress.nl': u'IOS Press Ebooks',
    u'windsor.scholarsportal.info': u'Scholars Portal',
    u'www.unimib.it': u'University of Milano-Bicocca',
    u'books.mdpi.com': u'MDPI Books',
    u'www.dropbox.com': u'Dropbox',
    u'dl.dropboxusercontent.com': u'Dropbox',
}
|
||||
|
||||
def url_to_provider(url):
    """Return the provider name for *url*'s host, or the host itself if unlisted."""
    host = urlparse.urlparse(url).netloc
    if host in DOMAIN_TO_PROVIDER:
        return DOMAIN_TO_PROVIDER[host]
    return host
|
||||
|
||||
# Captures the numeric book id from a frontiersin.org book page URL.
FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')


def online_to_download(url):
    """Turn a landing-page URL into a list of download URLs.

    * MDPI pdf viewer pages: the ``data`` attribute of the embedded PDF
      ``<object>``, with any ``#fragment`` removed.
    * books.scielo.org pages: the ``pdf_file`` and ``epub_file`` links, made
      absolute against *url*.
    * frontiersin.org book pages: the EPUB and PDF ``GetFile.aspx`` URLs for
      the captured book number.
    * anything else: *url* itself.

    Returns an empty list when *url* is None, or when a page that has to be
    scraped cannot be fetched or holds no matching element.
    """
    urls = []
    # Callers may have no landing URL at all (add_by_doab starts from
    # url = None); report "nothing to download" instead of failing on
    # None.find().
    if url is None:
        return urls

    if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
        doc = get_soup(url)
        if doc:
            obj = doc.find('object', type='application/pdf')
            if obj:
                urls.append(obj['data'].split('#')[0])
    elif url.find(u'books.scielo.org/') >= 0:
        doc = get_soup(url)
        if doc:
            obj = doc.find('a', class_='pdf_file')
            if obj:
                urls.append(urlparse.urljoin(url, obj['href']))
            obj = doc.find('a', class_='epub_file')
            if obj:
                urls.append(urlparse.urljoin(url, obj['href']))
    else:
        # search once and reuse the match object
        frontiers = FRONTIERSIN.search(url)
        if frontiers:
            booknum = frontiers.group(1)
            urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
            urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
        else:
            urls.append(url)
    return urls
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
from regluit.core.models import Ebook, Edition, Work
|
||||
from .utils import dl_online
|
||||
|
||||
class LoaderTests(TestCase):
    """Integration test for turning 'online' ebooks into downloaded files."""

    def setUp(self):
        pass

    def test_downloads(self):
        # Only runs when integration testing is switched on in settings.
        if not settings.TEST_INTEGRATION:
            return

        online_work = Work(title="online work")
        online_work.save()

        online_edition = Edition(work=online_work)
        online_edition.save()

        # one Dropbox share link and one jbe-platform landing page
        source_urls = (
            'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0',
            'http://www.jbe-platform.com/content/books/9789027295958',
        )
        for source_url in source_urls:
            online_ebook = Ebook.objects.create(
                format='online', url=source_url, edition=online_edition
            )
            harvested = dl_online(online_ebook)
            self.assertTrue(harvested.ebook.filesize)
|
|
@ -1,15 +1,24 @@
|
|||
import csv
|
||||
import re
|
||||
import requests
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.files.base import ContentFile
|
||||
|
||||
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
|
||||
from regluit.core.isbn import ISBN
|
||||
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
||||
from regluit.api.crosswalks import inv_relator_contrib
|
||||
from regluit.bisac.models import BisacHeading
|
||||
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
||||
from regluit.core.isbn import ISBN
|
||||
from regluit.core.models import (
|
||||
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -34,6 +43,12 @@ def utf8_general_ci_norm(s):
|
|||
s1 = unicodedata.normalize('NFD', s)
|
||||
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
|
||||
|
||||
def get_soup(url):
    """Fetch *url* and return its body parsed by BeautifulSoup with 'lxml'.

    The request carries ``settings.USER_AGENT``.  Returns None for any status
    code other than 200.
    """
    page = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
    if page.status_code != 200:
        return None
    return BeautifulSoup(page.content, 'lxml')
|
||||
|
||||
def get_authors(book):
|
||||
authors=[]
|
||||
if book.get('AuthorsList',''):
|
||||
|
@ -331,14 +346,15 @@ def loaded_book_ok(book, work, edition):
|
|||
return True
|
||||
|
||||
ID_URLPATTERNS = {
|
||||
'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'),
|
||||
'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'),
|
||||
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'),
|
||||
'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'),
|
||||
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'),
|
||||
'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'),
|
||||
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'),
|
||||
'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'),
|
||||
'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
|
||||
'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
|
||||
'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
|
||||
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
|
||||
'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
|
||||
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
|
||||
'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
|
||||
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
|
||||
'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
|
||||
}
|
||||
|
||||
def ids_from_urls(url):
|
||||
|
@ -346,7 +362,111 @@ def ids_from_urls(url):
|
|||
for ident in ID_URLPATTERNS.keys():
|
||||
id_match = ID_URLPATTERNS[ident].search(url)
|
||||
if id_match:
|
||||
ids[ident] = id_match.group(1)
|
||||
ids[ident] = id_match.group('id')
|
||||
return ids
|
||||
|
||||
|
||||
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
|
||||
|
||||
def dl_online(ebook):
    """Try to harvest a downloadable file for an 'online' ebook.

    Handles Dropbox share links (``dropbox.com/s/``) and jbe-platform book
    pages.  Returns whatever ``make_dl_ebook`` returns for the download URL it
    finds, or None when the ebook is not 'online', the URL is of neither
    kind, or no download link could be found.
    """
    if ebook.format != 'online':
        return None

    source = ebook.url
    if source.find(u'dropbox.com/s/') >= 0:
        response = requests.get(source, headers={"User-Agent": settings.USER_AGENT})
        if response.status_code != 200:
            return None
        # the direct-download link is embedded in the share page
        match_dl = DROPBOX_DL.search(response.content)
        if not match_dl:
            return None
        return make_dl_ebook(match_dl.group(1), ebook)

    if source.find(u'jbe-platform.com/content/books/') >= 0:
        doc = get_soup(source)
        if not doc:
            return None
        pdf_link = doc.select_one('div.fulltexticoncontainer-PDF a')
        if not pdf_link:
            return None
        return make_dl_ebook(urlparse.urljoin(source, pdf_link['href']), ebook)

    return None
|
||||
|
||||
def make_dl_ebook(url, ebook):
    """Download *url* and store it as an EbookFile/Ebook pair for *ebook*'s edition.

    If an EbookFile whose ``source`` equals ``ebook.url`` already exists, the
    first such file is returned and nothing is downloaded.  Otherwise the
    content is fetched; when the response is 200 and its type is anything but
    'online', the file is saved, a new Ebook (provider 'Unglue.it') pointing
    at it is created, and the new EbookFile is returned.  Returns None in all
    other cases.
    """
    already = EbookFile.objects.filter(source=ebook.url)
    if already:
        return already[0]

    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
    if response.status_code != 200:
        return None

    # a missing or zero Content-Length is recorded as "unknown" (None)
    filesize = int(response.headers.get("Content-Length", 0)) or None
    file_format = type_for_url(url, content_type=response.headers.get('content-type'))
    if file_format == 'online':
        return None

    new_ebf = EbookFile.objects.create(
        edition=ebook.edition,
        format=file_format,
        source=ebook.url,
    )
    new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
    new_ebf.save()

    # the new Ebook copies rights and version info from the 'online' one
    new_ebf.ebook = Ebook.objects.create(
        edition=ebook.edition,
        format=file_format,
        provider='Unglue.it',
        url=new_ebf.file.url,
        rights=ebook.rights,
        filesize=filesize,
        version_label=ebook.version_label,
        version_iter=ebook.version_iter,
    )
    new_ebf.save()
    return new_ebf
|
||||
|
||||
def type_for_url(url, content_type=None):
    """Classify *url* as an ebook format name.

    Returns '' for an empty url, 'online' for any books.openedition.org url,
    and otherwise one of 'pdf', 'epub', 'text', 'html', 'online', 'mobi' or
    'other', decided from the content type.  When *content_type* is not
    given it is obtained from ``contenttyper.calc_type(url)``.
    """
    if not url:
        return ''
    if url.find('books.openedition.org') >= 0:
        return 'online'
    ct = content_type if content_type else contenttyper.calc_type(url)
    # The content type may be unavailable (failed request, missing header);
    # treat that as "no information" so the regex tests below do not raise
    # TypeError on None.
    ct = ct or ''
    if re.search("pdf", ct):
        return "pdf"
    elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
        return "pdf"
    elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
        return "epub"
    elif re.search("text/plain", ct):
        return "text"
    elif re.search("text/html", ct):
        # oapen.org viewer pages count as html; other html is a landing page
        if url.find('oapen.org/view') >= 0:
            return "html"
        return "online"
    elif re.search("epub", ct):
        return "epub"
    elif re.search("mobi", ct):
        return "mobi"
    return "other"
|
||||
|
||||
class ContentTyper(object):
    """Look up the Content-Type of URLs, spacing out calls to the same host."""

    def __init__(self):
        # netloc -> time.time() of the most recent calc_type call for it
        self.last_call = dict()

    def content_type(self, url):
        """Return the Content-Type header from a HEAD request, or None on failure."""
        try:
            r = requests.head(url)
            return r.headers.get('content-type')
        except Exception:
            # best effort: any failure means "unknown", but interpreter-level
            # exits (KeyboardInterrupt, SystemExit) are no longer swallowed
            return None

    def calc_type(self, url):
        """Return the content type of *url*, waiting first if its host was hit under a second ago."""
        delay = 1
        # is there a delay associated with the url
        netloc = urlparse.urlparse(url).netloc

        # wait if necessary
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            now = time.time()
            min_time_next_call = last_call + delay
            if min_time_next_call > now:
                time.sleep(min_time_next_call-now)

        self.last_call[netloc] = time.time()

        # compute the content-type
        return self.content_type(url)
|
||||
|
||||
contenttyper = ContentTyper()
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core.loaders import doab
|
||||
|
||||
class Command(BaseCommand):
    """Load DOAB records from a JSON file via ``doab.load_doab_records``."""
    help = "load doab books"
    args = "<limit> <file_name>"

    def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):
        """Resolve *file_name* relative to this module's directory and load it.

        *limit* is converted to int when given; when omitted, None is passed
        on (the original called int(None), which raises TypeError).
        """
        command_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(command_dir, file_name)
        doab.load_doab_records(file_path, limit=int(limit) if limit else None)
|
|
@ -0,0 +1,21 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core.loaders.utils import dl_online
|
||||
from regluit.core.models import Ebook
|
||||
|
||||
class Command(BaseCommand):
    """Harvest downloadable files for ebooks whose format is 'online'."""
    help = "harvest downloadable ebooks from 'online' ebooks"
    args = "<limit>"

    def handle(self, limit=0, **options):
        """Run ``dl_online`` over 'online' ebooks until *limit* files are harvested.

        The loop stops once *limit* ebooks have been harvested; the original
        ``done > limit`` test let one extra through.  A limit that is 0 or
        omitted still stops after the first harvested ebook, as before.
        """
        limit = int(limit) if limit else 0
        # a limit of 0 keeps its old meaning: stop after one harvested ebook
        stop_at = max(limit, 1)
        onlines = Ebook.objects.filter(format='online')
        done = 0
        for online in onlines:
            new_ebf = dl_online(online)
            if new_ebf:
                done += 1
                if done >= stop_at:
                    break
        print('harvested {} ebooks'.format(done))
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core.loaders import doab
|
||||
|
||||
class Command(BaseCommand):
    """Load a single DOAB book, identified by its doab id, through the OAI loader."""
    help = "load doab books by doab_id via oai"
    args = "<doab_id>"

    def handle(self, doab_id, **options):
        """Pass *doab_id* to ``doab.add_by_doab``; the result is discarded."""
        load_one = doab.add_by_doab
        load_one(doab_id)
|
|
@ -0,0 +1,18 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core.loaders import doab
|
||||
|
||||
class Command(BaseCommand):
    """Load DOAB books through the OAI feed via ``doab.load_doab_oai``."""
    help = "load doab books via oai"
    args = "<from_year> <limit>"

    def handle(self, from_year= None, limit=None, **options):
        """Call ``doab.load_doab_oai`` with only the arguments that were supplied.

        Omitted (or zero) values are left out of the call so the loader's own
        defaults apply.  The original forwarded ``from_year=None`` whenever a
        limit was given without a year, overriding the loader's default year.
        """
        from_year = int(from_year) if from_year else None
        limit = int(limit) if limit else None

        kwargs = {}
        if from_year:
            kwargs['from_year'] = from_year
        if limit:
            kwargs['limit'] = limit
        doab.load_doab_oai(**kwargs)
|
|
@ -1083,7 +1083,7 @@ class EbookFile(models.Model):
|
|||
source=self.file.url
|
||||
)
|
||||
|
||||
new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf)
|
||||
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
|
||||
new_mobi_ebf.save()
|
||||
if self.ebook:
|
||||
new_ebook = Ebook.objects.create(
|
||||
|
|
830
core/tests.py
830
core/tests.py
File diff suppressed because it is too large
Load Diff
|
@ -21,6 +21,7 @@ from regluit.core.bookloader import (
|
|||
from regluit.core.parameters import WORK_IDENTIFIERS
|
||||
|
||||
from regluit.core.loaders import add_by_webpage
|
||||
from regluit.core.loaders.doab import add_by_doab
|
||||
from regluit.core.loaders.utils import ids_from_urls
|
||||
from regluit.frontend.forms import EditionForm, IdentifierForm
|
||||
|
||||
|
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
|
|||
if edition:
|
||||
return user_edition(edition, user)
|
||||
|
||||
if identifiers.has_key('doab'):
|
||||
edition = add_by_doab(identifiers['doab'])
|
||||
if edition:
|
||||
return user_edition(edition, user)
|
||||
|
||||
if identifiers.has_key('oclc'):
|
||||
edition = add_by_oclc(identifiers['oclc'])
|
||||
if edition:
|
||||
|
|
|
@ -6,16 +6,11 @@ PyJWT==1.4.1
|
|||
PyPDF2==1.23
|
||||
PyGithub==1.15.0
|
||||
PyYAML==3.11
|
||||
git+git://github.com/urschrei/pyzotero.git@v0.9.51
|
||||
SPARQLWrapper==1.6.4
|
||||
WebOb==1.2.3
|
||||
WebTest==1.4.0
|
||||
amqp==1.4.9
|
||||
anyjson==0.3.3
|
||||
billiard==3.3.0.23
|
||||
awscli==1.10.26
|
||||
boto==2.42.0
|
||||
#git+ssh://git@github.com/Gluejar/boto.git@2.3.0
|
||||
celery==3.1.23
|
||||
certifi==2016.2.28
|
||||
# pip installing pillow seems to delete distribute
|
||||
|
@ -33,7 +28,6 @@ django-jsonfield==1.0.0
|
|||
#django-kombu==0.9.4
|
||||
django-maintenancemode==0.11.2
|
||||
django-mptt==0.8.5
|
||||
#django-nose-selenium==0.7.3
|
||||
#django-notification==0.2
|
||||
git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6
|
||||
django-registration==2.1.2
|
||||
|
@ -42,9 +36,7 @@ django-smtp-ssl==1.0
|
|||
django-storages==1.4.1
|
||||
django-tastypie==0.13.3
|
||||
django-transmeta==0.7.3
|
||||
feedparser==5.1.2
|
||||
fef-questionnaire==4.0.1
|
||||
freebase==1.0.8
|
||||
#gitenberg.metadata==0.1.6
|
||||
git+https://github.com/gitenberg-dev/gitberg-build
|
||||
#git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
|
||||
|
@ -53,7 +45,7 @@ html5lib==1.0b3
|
|||
httplib2==0.7.5
|
||||
isodate==0.5.1
|
||||
kombu==3.0.35
|
||||
lxml==2.3.5
|
||||
lxml==4.2.1
|
||||
defusedxml==0.4.1
|
||||
mechanize==0.2.5
|
||||
mimeparse==0.1.3
|
||||
|
@ -66,6 +58,7 @@ paramiko==1.14.1
|
|||
postmonkey==1.0b
|
||||
pycrypto==2.6
|
||||
pymarc==3.0.2
|
||||
pyoai==2.5.0
|
||||
pyparsing==2.0.3
|
||||
python-dateutil==2.5.3
|
||||
python-mimeparse==0.1.4
|
||||
|
@ -80,7 +73,7 @@ requests==2.10.0
|
|||
requests-mock==1.2.0
|
||||
requests-oauthlib==0.6.2
|
||||
selenium==2.53.1
|
||||
six==1.9.0
|
||||
six==1.11.0
|
||||
sorl-thumbnail==12.3
|
||||
ssh==1.7.14
|
||||
stevedore==1.12.0
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
from django.conf.global_settings import LANGUAGES
|
||||
|
||||
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
|
||||
code2lang = dict(LANGUAGES)
|
||||
|
||||
def get_language_code(language):
|
||||
return lang2code.get(language.lower().strip(), '')
|
||||
language = language.lower().strip()
|
||||
if language in code2lang:
|
||||
return language
|
||||
return lang2code.get(language, '')
|
||||
|
|
Loading…
Reference in New Issue