Merge remote-tracking branch 'Gluejar/master' into production

pull/91/head
eric 2018-04-16 20:10:26 -04:00
commit 89b4221dcd
19 changed files with 1067 additions and 54882 deletions


@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
editions = facet.facet_object.filter_model("Edition",editions).distinct() editions = facet.facet_object.filter_model("Edition",editions).distinct()
for edition in editions: for edition in editions:
edition_prod = product(edition, facet.facet_object) edition_prod = product(edition, facet.facet_object)
if edition_prod: if edition_prod is not None:
feed.append(edition_prod) feed.append(edition_prod)
return etree.tostring(feed, pretty_print=True) return etree.tostring(feed, pretty_print=True)
@ -34,7 +34,7 @@ def onix_feed_for_work(work):
feed.append(header(work)) feed.append(header(work))
for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct(): for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
edition_prod = product(edition) edition_prod = product(edition)
if edition_prod: if edition_prod is not None:
feed.append(product(edition)) feed.append(product(edition))
return etree.tostring(feed, pretty_print=True) return etree.tostring(feed, pretty_print=True)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
def add_by_oclc_from_google(oclc): def add_by_oclc_from_google(oclc):
if oclc: if oclc:
logger.info("adding book by oclc %s", oclc) logger.info(u"adding book by oclc %s", oclc)
else: else:
return None return None
try: try:
@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
try: try:
results = _get_json(url, {"q": '"OCLC%s"' % oclc}) results = _get_json(url, {"q": '"OCLC%s"' % oclc})
except LookupFailure, e: except LookupFailure, e:
logger.exception("lookup failure for %s", oclc) logger.exception(u"lookup failure for %s", oclc)
return None return None
if not results.has_key('items') or not results['items']: if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", oclc) logger.warn(u"no google hits for %s", oclc)
return None return None
try: try:
@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save() models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
return e return e
except LookupFailure, e: except LookupFailure, e:
logger.exception("failed to add edition for %s", oclc) logger.exception(u"failed to add edition for %s", oclc)
except IntegrityError, e: except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", oclc) logger.exception(u"google books data for %s didn't fit our db", oclc)
return None return None
def valid_isbn(isbn): def valid_isbn(isbn):
try: try:
return identifier_cleaner('isbn')(isbn) return identifier_cleaner('isbn')(isbn)
except: except:
logger.exception("invalid isbn: %s", isbn) logger.exception(u"invalid isbn: %s", isbn)
return None return None
def add_by_isbn(isbn, work=None, language='xx', title=''): def add_by_isbn(isbn, work=None, language='xx', title=''):
@ -88,13 +88,13 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
try: try:
e = add_by_isbn_from_google(isbn, work=work) e = add_by_isbn_from_google(isbn, work=work)
except LookupFailure: except LookupFailure:
logger.exception("failed google lookup for %s", isbn) logger.exception(u"failed google lookup for %s", isbn)
# try again some other time # try again some other time
return None return None
if e: if e:
return e return e
logger.info("null came back from add_by_isbn_from_google: %s", isbn) logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
# if there's a a title, we want to create stub editions and # if there's a a title, we want to create stub editions and
# works, even if google doesn't know about it # but if it's not valid, # works, even if google doesn't know about it # but if it's not valid,
@ -129,10 +129,10 @@ def get_google_isbn_results(isbn):
try: try:
results = _get_json(url, {"q": "isbn:%s" % isbn}) results = _get_json(url, {"q": "isbn:%s" % isbn})
except LookupFailure: except LookupFailure:
logger.exception("lookup failure for %s", isbn) logger.exception(u"lookup failure for %s", isbn)
return None return None
if not results.has_key('items') or not results['items']: if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", isbn) logger.warn(u"no google hits for %s", isbn)
return None return None
return results return results
@ -201,7 +201,7 @@ def update_edition(edition):
# if the language of the edition no longer matches that of the parent work, # if the language of the edition no longer matches that of the parent work,
# attach edition to the # attach edition to the
if edition.work.language != language: if edition.work.language != language:
logger.info("reconnecting %s since it is %s instead of %s", logger.info(u"reconnecting %s since it is %s instead of %s",
googlebooks_id, language, edition.work.language) googlebooks_id, language, edition.work.language)
old_work = edition.work old_work = edition.work
@ -210,7 +210,7 @@ def update_edition(edition):
edition.work = new_work edition.work = new_work
edition.save() edition.save()
for identifier in edition.identifiers.all(): for identifier in edition.identifiers.all():
logger.info("moving identifier %s", identifier.value) logger.info(u"moving identifier %s", identifier.value)
identifier.work = new_work identifier.work = new_work
identifier.save() identifier.save()
if old_work and old_work.editions.count() == 0: if old_work and old_work.editions.count() == 0:
@ -256,7 +256,7 @@ def add_by_isbn_from_google(isbn, work=None):
edition.new = False edition.new = False
return edition return edition
logger.info("adding new book by isbn %s", isbn) logger.info(u"adding new book by isbn %s", isbn)
results = get_google_isbn_results(isbn) results = get_google_isbn_results(isbn)
if results: if results:
try: try:
@ -267,9 +267,9 @@ def add_by_isbn_from_google(isbn, work=None):
isbn=isbn isbn=isbn
) )
except LookupFailure, e: except LookupFailure, e:
logger.exception("failed to add edition for %s", isbn) logger.exception(u"failed to add edition for %s", isbn)
except IntegrityError, e: except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", isbn) logger.exception(u"google books data for %s didn't fit our db", isbn)
return None return None
return None return None
@ -320,7 +320,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if results: if results:
item = results item = results
else: else:
logger.info("loading metadata from google for %s", googlebooks_id) logger.info(u"loading metadata from google for %s", googlebooks_id)
url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
item = _get_json(url) item = _get_json(url)
d = item['volumeInfo'] d = item['volumeInfo']
@ -343,7 +343,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if len(language) > 5: if len(language) > 5:
language = language[0:5] language = language[0:5]
if work and work.language != language: if work and work.language != language:
logger.info("not connecting %s since it is %s instead of %s", logger.info(u"not connecting %s since it is %s instead of %s",
googlebooks_id, language, work.language) googlebooks_id, language, work.language)
work = None work = None
# isbn = None # isbn = None
@ -371,7 +371,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
try: try:
e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
e.new = False e.new = False
logger.warning(" whoa nellie, somebody else created an edition while we were working.") logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
if work.new: if work.new:
work.delete() work.delete()
return e return e
@ -404,19 +404,19 @@ def relate_isbn(isbn, cluster_size=1):
"""add a book by isbn and then see if there's an existing work to add it to so as to make a """add a book by isbn and then see if there's an existing work to add it to so as to make a
cluster bigger than cluster_size. cluster bigger than cluster_size.
""" """
logger.info("finding a related work for %s", isbn) logger.info(u"finding a related work for %s", isbn)
edition = add_by_isbn(isbn) edition = add_by_isbn(isbn)
if edition is None: if edition is None:
return None return None
if edition.work is None: if edition.work is None:
logger.info("didn't add related to null work") logger.info(u"didn't add related to null work")
return None return None
if edition.work.editions.count() > cluster_size: if edition.work.editions.count() > cluster_size:
return edition.work return edition.work
for other_isbn in thingisbn(isbn): for other_isbn in thingisbn(isbn):
# 979's come back as 13 # 979's come back as 13
logger.debug("other_isbn: %s", other_isbn) logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10: if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn) other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=edition.work) related_edition = add_by_isbn(other_isbn, work=edition.work)
@ -427,7 +427,7 @@ def relate_isbn(isbn, cluster_size=1):
related_edition.work = edition.work related_edition.work = edition.work
related_edition.save() related_edition.save()
elif related_edition.work_id != edition.work_id: elif related_edition.work_id != edition.work_id:
logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id) logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
merge_works(related_edition.work, edition.work) merge_works(related_edition.work, edition.work)
if related_edition.work.editions.count() > cluster_size: if related_edition.work.editions.count() > cluster_size:
return related_edition.work return related_edition.work
@ -438,7 +438,7 @@ def add_related(isbn):
The initial seed ISBN will be added if it's not already there. The initial seed ISBN will be added if it's not already there.
""" """
# make sure the seed edition is there # make sure the seed edition is there
logger.info("adding related editions for %s", isbn) logger.info(u"adding related editions for %s", isbn)
new_editions = [] new_editions = []
@ -446,14 +446,14 @@ def add_related(isbn):
if edition is None: if edition is None:
return new_editions return new_editions
if edition.work is None: if edition.work is None:
logger.warning("didn't add related to null work") logger.warning(u"didn't add related to null work")
return new_editions return new_editions
# this is the work everything will hang off # this is the work everything will hang off
work = edition.work work = edition.work
other_editions = {} other_editions = {}
for other_isbn in thingisbn(isbn): for other_isbn in thingisbn(isbn):
# 979's come back as 13 # 979's come back as 13
logger.debug("other_isbn: %s", other_isbn) logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10: if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn) other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=work) related_edition = add_by_isbn(other_isbn, work=work)
@ -466,7 +466,7 @@ def add_related(isbn):
related_edition.work = work related_edition.work = work
related_edition.save() related_edition.save()
elif related_edition.work_id != work.id: elif related_edition.work_id != work.id:
logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id) logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
work = merge_works(work, related_edition.work) work = merge_works(work, related_edition.work)
else: else:
if other_editions.has_key(related_language): if other_editions.has_key(related_language):
@ -476,14 +476,14 @@ def add_related(isbn):
# group the other language editions together # group the other language editions together
for lang_group in other_editions.itervalues(): for lang_group in other_editions.itervalues():
logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group]) logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
if len(lang_group) > 1: if len(lang_group) > 1:
lang_edition = lang_group[0] lang_edition = lang_group[0]
logger.debug("lang_edition.id: %s", lang_edition.id) logger.debug(u"lang_edition.id: %s", lang_edition.id)
# compute the distinct set of works to merge into lang_edition.work # compute the distinct set of works to merge into lang_edition.work
works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work]) works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
for w in works_to_merge: for w in works_to_merge:
logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id) logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
merged_work = merge_works(lang_edition.work, w) merged_work = merge_works(lang_edition.work, w)
models.WorkRelation.objects.get_or_create( models.WorkRelation.objects.get_or_create(
to_work=lang_group[0].work, to_work=lang_group[0].work,
@ -498,17 +498,21 @@ def thingisbn(isbn):
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
which come back as isbn_13') which come back as isbn_13')
""" """
logger.info("looking up %s at ThingISBN", isbn) logger.info(u"looking up %s at ThingISBN", isbn)
url = "https://www.librarything.com/api/thingISBN/%s" % isbn url = "https://www.librarything.com/api/thingISBN/%s" % isbn
xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
doc = ElementTree.fromstring(xml) try:
return [e.text for e in doc.findall('isbn')] doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
except SyntaxError:
# LibraryThing down
return []
def merge_works(w1, w2, user=None): def merge_works(w1, w2, user=None):
"""will merge the second work (w2) into the first (w1) """will merge the second work (w2) into the first (w1)
""" """
logger.info("merging work %s into %s", w2.id, w1.id) logger.info(u"merging work %s into %s", w2.id, w1.id)
# don't merge if the works are the same or at least one of the works has no id # don't merge if the works are the same or at least one of the works has no id
#(for example, when w2 has already been deleted) #(for example, when w2 has already been deleted)
if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None: if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
@ -583,7 +587,7 @@ def detach_edition(e):
will detach edition from its work, creating a new stub work. if remerge=true, will see if will detach edition from its work, creating a new stub work. if remerge=true, will see if
there's another work to attach to there's another work to attach to
""" """
logger.info("splitting edition %s from %s", e, e.work) logger.info(u"splitting edition %s from %s", e, e.work)
w = models.Work(title=e.title, language=e.work.language) w = models.Work(title=e.title, language=e.work.language)
w.save() w.save()
@ -618,7 +622,7 @@ def add_openlibrary(work, hard_refresh=False):
work.save() work.save()
# find the first ISBN match in OpenLibrary # find the first ISBN match in OpenLibrary
logger.info("looking up openlibrary data for work %s", work.id) logger.info(u"looking up openlibrary data for work %s", work.id)
e = None # openlibrary edition json e = None # openlibrary edition json
w = None # openlibrary work json w = None # openlibrary work json
@ -633,7 +637,7 @@ def add_openlibrary(work, hard_refresh=False):
try: try:
e = _get_json(url, params, type='ol') e = _get_json(url, params, type='ol')
except LookupFailure: except LookupFailure:
logger.exception("OL lookup failed for %s", isbn_key) logger.exception(u"OL lookup failed for %s", isbn_key)
e = {} e = {}
if e.has_key(isbn_key): if e.has_key(isbn_key):
if e[isbn_key].has_key('details'): if e[isbn_key].has_key('details'):
@ -673,7 +677,7 @@ def add_openlibrary(work, hard_refresh=False):
) )
if e[isbn_key]['details'].has_key('works'): if e[isbn_key]['details'].has_key('works'):
work_key = e[isbn_key]['details']['works'].pop(0)['key'] work_key = e[isbn_key]['details']['works'].pop(0)['key']
logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key) logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
models.Identifier.get_or_add(type='olwk', value=work_key, work=work) models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
try: try:
w = _get_json("https://openlibrary.org" + work_key, type='ol') w = _get_json("https://openlibrary.org" + work_key, type='ol')
@ -691,14 +695,14 @@ def add_openlibrary(work, hard_refresh=False):
if w.has_key('subjects') and len(w['subjects']) > len(subjects): if w.has_key('subjects') and len(w['subjects']) > len(subjects):
subjects = w['subjects'] subjects = w['subjects']
except LookupFailure: except LookupFailure:
logger.exception("OL lookup failed for %s", work_key) logger.exception(u"OL lookup failed for %s", work_key)
if not subjects: if not subjects:
logger.warn("unable to find work %s at openlibrary", work.id) logger.warn(u"unable to find work %s at openlibrary", work.id)
return return
# add the subjects to the Work # add the subjects to the Work
for s in subjects: for s in subjects:
logger.info("adding subject %s to work %s", s, work.id) logger.info(u"adding subject %s to work %s", s, work.id)
subject = models.Subject.set_by_name(s, work=work) subject = models.Subject.set_by_name(s, work=work)
work.save() work.save()
@ -716,9 +720,9 @@ def _get_json(url, params={}, type='gb'):
if response.status_code == 200: if response.status_code == 200:
return json.loads(response.content) return json.loads(response.content)
else: else:
logger.error("unexpected HTTP response: %s", response) logger.error(u"unexpected HTTP response: %s", response)
if response.content: if response.content:
logger.error("response content: %s", response.content) logger.error(u"response content: %s", response.content)
raise LookupFailure("GET failed: url=%s and params=%s" % (url, params)) raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
@ -766,7 +770,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
ebook = models.Ebook() ebook = models.Ebook()
if len(ebooks) > 1: if len(ebooks) > 1:
logger.warning("There is more than one Ebook matching url {0}".format(url)) logger.warning(u"There is more than one Ebook matching url {0}".format(url))
ebook.format = format ebook.format = format
@ -826,8 +830,6 @@ def edition_for_etype(etype, metadata, default=None):
for key in metadata.edition_identifiers.keys(): for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key]) return edition_for_ident(key, metadata.identifiers[key])
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def load_ebookfile(url, etype): def load_ebookfile(url, etype):
''' '''
return a ContentFile if a new ebook has been loaded return a ContentFile if a new ebook has been loaded
@ -960,8 +962,7 @@ class BasePandataLoader(object):
if contentfile: if contentfile:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key) contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile) path = default_storage.save(contentfile_name, contentfile)
lic = MATCH_LICENSE.search(metadata.rights_url) license = cc.license_from_cc_url(metadata.rights_url)
license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
ebf = models.EbookFile.objects.create( ebf = models.EbookFile.objects.create(
format=key, format=key,
edition=edition, edition=edition,


@ -1,8 +1,11 @@
# coding=utf-8 # coding=utf-8
# mostly constants related to Creative Commons ''' mostly constants related to Creative Commons
# let's be DRY with these parameters # let's be DRY with these parameters
## need to add versioned CC entries ## need to add versioned CC entries
'''
import re
INFO_CC = ( INFO_CC = (
('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'), ('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
@ -162,3 +165,15 @@ def match_license(license_string):
except ValueError: except ValueError:
pass pass
return RIGHTS_ALIAS.get(license_string, None) return RIGHTS_ALIAS.get(license_string, None)
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def license_from_cc_url(rights_url):
if not rights_url:
return None
lic = MATCH_LICENSE.search(rights_url)
if lic:
return 'CC {}'.format(lic.group(1).upper())
if rights_url.find('openedition.org') >= 0:
return 'OPENEDITION'
return ''
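A minimal sketch of what the new license_from_cc_url helper returns, assuming the regluit package is importable; the CC URL below is the by-nc-nd entry from INFO_CC, the other two URLs are made up:

    from regluit.core import cc

    print cc.license_from_cc_url('https://creativecommons.org/licenses/by-nc-nd/3.0/')  # 'CC BY-NC-ND'
    print cc.license_from_cc_url('http://books.openedition.org/obp/123')                # 'OPENEDITION'
    print cc.license_from_cc_url('http://example.com/terms')                            # '' (unrecognized)
    print cc.license_from_cc_url(None)                                                   # None

Returning None for a missing rights URL (rather than '') presumably lets callers tell "no rights statement" apart from "rights statement we could not map".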


@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
def add_by_sitemap(url, maxnum=None): def add_by_sitemap(url, maxnum=None):
return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum)) return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
def scrape_language(url):
scraper = get_scraper(url)
return scraper.metadata.get('language')


@ -1,42 +1,54 @@
#!/usr/bin/env python #!/usr/bin/env python
# encoding: utf-8 # encoding: utf-8
import logging import datetime
import json import json
import logging
import re import re
from itertools import islice
import requests import requests
from django.db.models import (Q, F) from django.db.models import Q
from django.core.files.storage import default_storage
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
import regluit from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from regluit.core import bookloader, cc
from regluit.core import models, tasks from regluit.core import models, tasks
from regluit.core import bookloader from regluit.core.bookloader import merge_works
from regluit.core.bookloader import add_by_isbn, merge_works
from regluit.core.isbn import ISBN from regluit.core.isbn import ISBN
from regluit.core.loaders.utils import type_for_url
from regluit.core.validation import valid_subject from regluit.core.validation import valid_subject
from . import scrape_language
from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U) def unlist(alist):
if not alist:
return None
return alist[0]
SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
def store_doab_cover(doab_id, redo=False): def store_doab_cover(doab_id, redo=False):
""" """
returns tuple: 1) cover URL, 2) whether newly created (boolean) returns tuple: 1) cover URL, 2) whether newly created (boolean)
""" """
cover_file_name= '/doab/%s/cover' % (doab_id) cover_file_name = '/doab/%s/cover' % (doab_id)
# if we don't want to redo and the cover exists, return the URL of the cover # if we don't want to redo and the cover exists, return the URL of the cover
if not redo and default_storage.exists(cover_file_name): if not redo and default_storage.exists(cover_file_name):
return (default_storage.url(cover_file_name), False) return (default_storage.url(cover_file_name), False)
# download cover image to cover_file # download cover image to cover_file
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id) url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
try: try:
@ -44,16 +56,16 @@ def store_doab_cover(doab_id, redo=False):
if r.status_code == 302: if r.status_code == 302:
redirurl = r.headers['Location'] redirurl = r.headers['Location']
if redirurl.startswith(u'ftp'): if redirurl.startswith(u'ftp'):
springerftp = springercover.match(redirurl) springerftp = SPRINGER_COVER.match(redirurl)
if springerftp: if springerftp:
redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1)) redirurl = SPRINGER_IMAGE.format(springerftp.groups(1))
r = requests.get(redirurl) r = requests.get(redirurl)
else: else:
r = requests.get(url) r = requests.get(url)
cover_file = ContentFile(r.content) cover_file = ContentFile(r.content)
cover_file.content_type = r.headers.get('content-type', '') cover_file.content_type = r.headers.get('content-type', '')
path = default_storage.save(cover_file_name, cover_file) default_storage.save(cover_file_name, cover_file)
return (default_storage.url(cover_file_name), True) return (default_storage.url(cover_file_name), True)
except Exception, e: except Exception, e:
# if there is a problem, return None for cover URL # if there is a problem, return None for cover URL
@ -74,52 +86,51 @@ def update_cover_doab(doab_id, edition, store_cover=True):
edition.cover_image = cover_url edition.cover_image = cover_url
edition.save() edition.save()
return cover_url return cover_url
else: return None
return None
def attach_more_doab_metadata(edition, description, subjects, def attach_more_doab_metadata(edition, description, subjects,
publication_date, publisher_name=None, language=None, authors=u''): publication_date, publisher_name=None, language=None, authors=u''):
""" """
for given edition, attach description, subjects, publication date to for given edition, attach description, subjects, publication date to
corresponding Edition and Work corresponding Edition and Work
""" """
# if edition doesn't have a publication date, update it # if edition doesn't have a publication date, update it
if not edition.publication_date: if not edition.publication_date:
edition.publication_date = publication_date edition.publication_date = publication_date
# if edition.publisher_name is empty, set it # if edition.publisher_name is empty, set it
if not edition.publisher_name: if not edition.publisher_name:
edition.set_publisher(publisher_name) edition.set_publisher(publisher_name)
edition.save() edition.save()
# attach description to work if it's not empty # attach description to work if it's not empty
work = edition.work work = edition.work
if not work.description: if not work.description:
work.description = description work.description = description
# update subjects # update subjects
for s in subjects: for s in subjects:
if valid_subject(s): if valid_subject(s):
models.Subject.set_by_name(s, work=work) models.Subject.set_by_name(s, work=work)
# set reading level of work if it's empty; doab is for adults. # set reading level of work if it's empty; doab is for adults.
if not work.age_level: if not work.age_level:
work.age_level = '18-' work.age_level = '18-'
if language: if language and language != 'xx':
work.language = language work.language = language
work.save() work.save()
if authors and authors == authors: # test for authors != NaN if authors and authors == authors: # test for authors != NaN
authlist = creator_list(authors) authlist = creator_list(authors)
if edition.authors.all().count() < len(authlist): if edition.authors.all().count() < len(authlist):
edition.authors.clear() edition.authors.clear()
if authlist is not None: if authlist is not None:
for [rel,auth] in authlist: for [rel, auth] in authlist:
edition.add_author(auth, rel) edition.add_author(auth, rel)
return edition return edition
def add_all_isbns(isbns, work, language=None, title=None): def add_all_isbns(isbns, work, language=None, title=None):
@ -128,69 +139,73 @@ def add_all_isbns(isbns, work, language=None, title=None):
first_edition = None first_edition = None
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title) edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
if edition: if edition:
first_edition = first_edition if first_edition else edition first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id): if work and (edition.work_id != work.id):
if work.created < edition.work.created: if work.created < edition.work.created:
work = merge_works(work, edition.work) work = merge_works(work, edition.work)
else: else:
work = merge_works(edition.work, work) work = merge_works(edition.work, work)
else: else:
work = edition.work work = edition.work
return first_edition return first_edition
def load_doab_edition(title, doab_id, url, format, rights, def load_doab_edition(title, doab_id, url, format, rights,
language, isbns, language, isbns,
provider, **kwargs): provider, **kwargs):
""" """
load a record from doabooks.org represented by input parameters and return an ebook load a record from doabooks.org represented by input parameters and return an ebook
""" """
logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
if language and isinstance(language, list): if language and isinstance(language, list):
language = language[0] language = language[0]
if language == 'xx' and format == 'online':
language = scrape_language(url)
# check to see whether the Edition hasn't already been loaded first # check to see whether the Edition hasn't already been loaded first
# search by url # search by url
ebooks = models.Ebook.objects.filter(url=url) ebooks = models.Ebook.objects.filter(url=url)
# 1 match # 1 match
# > 1 matches # > 1 matches
# 0 match # 0 match
# simplest case -- if match (1 or more), we could check whether any # simplest case -- if match (1 or more), we could check whether any
# ebook.edition.work has a doab id matching given doab_id # ebook.edition.work has a doab id matching given doab_id
# put a migration to force Ebook.url to be unique id # put a migration to force Ebook.url to be unique id
# if yes, then return one of the Edition(s) whose work is doab_id # if yes, then return one of the Edition(s) whose work is doab_id
# if no, then # if no, then
ebook = None ebook = None
if len(ebooks) > 1: if len(ebooks) > 1:
raise Exception("There is more than one Ebook matching url {0}".format(url)) raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1: elif len(ebooks) == 1:
ebook = ebooks[0] ebook = ebooks[0]
doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id, doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=ebook.edition.work) work=ebook.edition.work)
# update the cover id # update the cover id
cover_url = update_cover_doab(doab_id, ebook.edition) cover_url = update_cover_doab(doab_id, ebook.edition)
# attach more metadata # attach more metadata
attach_more_doab_metadata(ebook.edition, attach_more_doab_metadata(
description=kwargs.get('description'), ebook.edition,
subjects=kwargs.get('subject'), description=unlist(kwargs.get('description')),
publication_date=kwargs.get('date'), subjects=kwargs.get('subject'),
publisher_name=kwargs.get('publisher'), publication_date=unlist(kwargs.get('date')),
language=language, publisher_name=unlist(kwargs.get('publisher')),
authors=kwargs.get('authors'),) language=language,
authors=kwargs.get('creator'),
)
# make sure all isbns are added # make sure all isbns are added
add_all_isbns(isbns, None, language=language, title=title) add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
return ebook return ebook.edition
# remaining case --> no ebook, load record, create ebook if there is one. # remaining case --> no ebook, load record, create ebook if there is one.
assert len(ebooks) == 0 assert not ebooks
# we need to find the right Edition/Work to tie Ebook to... # we need to find the right Edition/Work to tie Ebook to...
# look for the Edition with which to associate ebook. # look for the Edition with which to associate ebook.
# loop through the isbns to see whether we get one that is not None # loop through the isbns to see whether we get one that is not None
work = None work = None
@ -206,16 +221,16 @@ def load_doab_edition(title, doab_id, url, format, rights,
edition = ident.work.preferred_edition edition = ident.work.preferred_edition
work = edition.work work = edition.work
break break
if edition is not None: if edition is not None:
# if this is a new edition, then add related editions asynchronously # if this is a new edition, then add related editions asynchronously
if getattr(edition,'new', False): if getattr(edition, 'new', False):
tasks.populate_edition.delay(edition.isbn_13) tasks.populate_edition.delay(edition.isbn_13)
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id, doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=edition.work) work=edition.work)
# we need to create Edition(s) de novo # we need to create Edition(s) de novo
else: else:
# if there is a Work with doab_id already, attach any new Edition(s) # if there is a Work with doab_id already, attach any new Edition(s)
try: try:
work = models.Identifier.objects.get(type='doab', value=doab_id).work work = models.Identifier.objects.get(type='doab', value=doab_id).work
@ -226,11 +241,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
work = models.Work(language='xx', title=title, age_level='18-') work = models.Work(language='xx', title=title, age_level='18-')
work.save() work.save()
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id, doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=work) work=work)
# if work has any ebooks already, attach the ebook to the corresponding edition # if work has any ebooks already, attach the ebook to the corresponding edition
# otherwise pick the first one # otherwise pick the first one
# pick the first edition as the one to tie ebook to # pick the first edition as the one to tie ebook to
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \ editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
Q(ebooks__isnull=False)).distinct() Q(ebooks__isnull=False)).distinct()
if editions_with_ebooks: if editions_with_ebooks:
@ -240,73 +255,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
else: else:
edition = models.Edition(work=work, title=title) edition = models.Edition(work=work, title=title)
edition.save() edition.save()
# make the edition the selected_edition of the work # make the edition the selected_edition of the work
work.selected_edition = edition work.selected_edition = edition
work.save() work.save()
if format in ('pdf', 'epub', 'mobi'): if format in ('pdf', 'epub', 'mobi', 'html', 'online'):
ebook = models.Ebook() ebook = models.Ebook()
ebook.format = format ebook.format = format
ebook.provider = provider ebook.provider = provider
ebook.url = url ebook.url = url
ebook.rights = rights ebook.rights = rights
# tie the edition to ebook # tie the edition to ebook
ebook.edition = edition ebook.edition = edition
if format == "online":
ebook.active = False
ebook.save() ebook.save()
# update the cover id (could be done separately) # update the cover id (could be done separately)
cover_url = update_cover_doab(doab_id, edition) cover_url = update_cover_doab(doab_id, edition)
# attach more metadata # attach more metadata
attach_more_doab_metadata(edition, attach_more_doab_metadata(
description=kwargs.get('description'), edition,
subjects=kwargs.get('subject'), description=unlist(kwargs.get('description')),
publication_date=kwargs.get('date'), subjects=kwargs.get('subject'),
publisher_name=kwargs.get('publisher'), publication_date=unlist(kwargs.get('date')),
authors=kwargs.get('authors'),) publisher_name=unlist(kwargs.get('publisher')),
return ebook authors=kwargs.get('creator'),
)
return edition
#
def load_doab_records(fname, limit=None):
success_count = 0
ebook_count = 0
records = json.load(open(fname))
for (i, book) in enumerate(islice(records,limit)):
d = dict(book)
d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
try:
ebook = load_doab_edition(**d)
success_count += 1
if ebook:
ebook_count +=1
except Exception, e:
logger.error(e)
logger.error(book)
logger.info("Number of records processed: " + str(success_count))
logger.info("Number of ebooks processed: " + str(ebook_count))
"""
#tools to parse the author lists in doab.csv #tools to parse the author lists in doab.csv
from pandas import DataFrame #
url = "http://www.doabooks.org/doab?func=csv"
df_csv = DataFrame.from_csv(url)
out=[]
for val in df_csv.values:
isbn = split_isbns(val[0])
if isbn:
auths = []
if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
auths = creator_list(val[2])
out.append(( isbn[0], auths))
open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
"""
au = re.compile(r'\(Authors?\)', flags=re.U) au = re.compile(r'\(Authors?\)', flags=re.U)
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U) ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U) tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
@ -326,14 +309,14 @@ def fnf(auth):
if len(parts) == 1: if len(parts) == 1:
return parts[0].strip() return parts[0].strip()
elif len(parts) == 2: elif len(parts) == 2:
return u'{} {}'.format(parts[1].strip(),parts[0].strip()) return u'{} {}'.format(parts[1].strip(), parts[0].strip())
else: else:
if parts[1].strip() in ('der','van', 'von', 'de', 'ter'): if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip()) return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
#print auth #print auth
#print re.search(namelist,auth).group(0) #print re.search(namelist,auth).group(0)
return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip()) return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
def creator(auth, editor=False): def creator(auth, editor=False):
auth = auth.strip() auth = auth.strip()
@ -349,68 +332,88 @@ def creator(auth, editor=False):
return [u'dsr', fnf(ds.sub(u'', auth))] return [u'dsr', fnf(ds.sub(u'', auth))]
if re.search(cm, auth): if re.search(cm, auth):
return [u'com', fnf(cm.sub(u'', auth))] return [u'com', fnf(cm.sub(u'', auth))]
auth = au.sub('', auth) auth = au.sub('', auth)
return ['aut', fnf(auth)] return ['aut', fnf(auth)]
def split_auths(auths):
if ';' in auths or '/' in auths:
return namesep2.split(auths)
else:
nl = namelist.match(auths.strip())
if nl:
if nl.group(3).endswith(' de') \
or ' de ' in nl.group(3) \
or nl.group(3).endswith(' da') \
or nl.group(1).endswith(' Jr.') \
or ' e ' in nl.group(1):
return [auths]
else:
return namesep.split(auths)
else :
return [auths]
def split_isbns(isbns):
result = []
for isbn in isbnsep.split(isbns):
isbn = ISBN(isbn)
if isbn.valid:
result.append(isbn.to_string())
return result
def creator_list(creators): def creator_list(creators):
auths = [] auths = []
if re.search(edlist, creators): for auth in creators:
for auth in split_auths(edlist.sub(u'', creators)): auths.append(creator(auth))
if auth:
auths.append(creator(auth, editor=True))
else:
for auth in split_auths(unicode(creators)):
if auth:
auths.append(creator(auth))
return auths return auths
def load_doab_auths(fname, limit=None): DOAB_OAIURL = 'https://www.doabooks.org/oai'
doab_auths = json.load(open(fname)) DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
recnum = 0 mdregistry = MetadataRegistry()
failed = 0 mdregistry.registerReader('oai_dc', oai_dc_reader)
for [isbnraw, authlist] in doab_auths: doab_client = Client(DOAB_OAIURL, mdregistry)
isbn = ISBN(isbnraw).to_string()
try: def add_by_doab(doab_id, record=None):
work = models.Identifier.objects.get(type='isbn',value=isbn).work try:
except models.Identifier.DoesNotExist: record = record if record else doab_client.getRecord(
print 'isbn = {} not found'.format(isbnraw) metadataPrefix='oai_dc',
failed += 1 identifier='oai:doab-books:{}'.format(doab_id)
if work.preferred_edition.authors.all().count() < len(authlist): )
work.preferred_edition.authors.clear() metadata = record[1].getMap()
if authlist is None: isbns = []
print "null authlist; isbn={}".format(isbn) url = None
for ident in metadata.pop('identifier', []):
if ident.startswith('ISBN: '):
isbn = ISBN(ident[6:])
if isbn.error:
continue
isbn.validate()
isbns.append(isbn.to_string())
elif ident.find('doabooks.org') >= 0:
# should already know the doab_id
continue continue
for [rel,auth] in authlist: else:
work.preferred_edition.add_author(auth, rel) url = ident
recnum +=1 language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
if limit and recnum > limit: urls = online_to_download(url)
break edition = None
logger.info("Number of records processed: " + str(recnum)) for dl_url in urls:
logger.info("Number of missing isbns: " + str(failed)) format = type_for_url(dl_url)
if 'format' in metadata:
del metadata['format']
edition = load_doab_edition(
unlist(metadata.pop('title', None)),
doab_id,
dl_url,
format,
cc.license_from_cc_url(unlist(metadata.pop('rights', None))),
language,
isbns,
url_to_provider(dl_url) if dl_url else None,
**metadata
)
return edition
except IdDoesNotExistError:
return None
def getdoab(url):
id_match = DOAB_PATT.search(url)
if id_match:
return id_match.group(1)
return False
def load_doab_oai(from_year=2000, limit=100000):
'''
use oai feed to get oai updates
'''
from_ = datetime.datetime(year=from_year, month=1, day=1)
doab_ids = []
for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
if not record[1]:
continue
idents = record[1].getMap()['identifier']
if idents:
for ident in idents:
doab = getdoab(ident)
if doab:
doab_ids.append(doab)
e = add_by_doab(doab, record=record)
logger.info(u'updated:\t{}\t{}'.format(doab, e.title))
if len(doab_ids) > limit:
break
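With the JSON-file loader gone, DOAB records now come in over OAI-PMH. A rough sketch of driving the new entry points from a Django shell; the rid 15485 is a made-up example, and the calls need network access to www.doabooks.org:

    from regluit.core.loaders import doab

    edition = doab.add_by_doab('15485')            # fetch and load a single DOAB record
    if edition:
        print edition.title

    print doab.getdoab('http://www.doabooks.org/doab?func=search&query=rid:15485')  # '15485'

    doab.load_doab_oai(from_year=2017, limit=50)   # sweep the feed for recent records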

core/loaders/doab_utils.py (new file)

@ -0,0 +1,126 @@
"""
doab_utils.py
"""
import re
import urlparse
import requests
from regluit.utils.lang import get_language_code
from .utils import get_soup
# utility functions for converting lists of individual items into individual items
# let's do a mapping of the DOAB languages into the language codes used
# mostly, we just handle mispellings
# also null -> xx
EXTRA_LANG_MAP = dict([
(u'chinese', 'de'),
(u'deutsch', 'de'),
(u'eng', 'en'),
(u'englilsh', 'en'),
(u'englilsh', 'en'),
(u'englisch', 'en'),
(u'espanol', 'es'),
(u'ger', 'de'),
(u'fra', 'fr'),
(u'fre', 'fr'),
(u'francese', 'fr'),
(u'ita', 'it'),
(u'italiano', 'it'),
(u'norwegian', 'no'),
(u'por', 'pt'),
(u'portugese', 'pt'),
(u'slovene', 'sl'),
(u'spa', 'es'),
(u'spagnolo', 'es'),
])
sep = re.compile(r'[ \-;^,/]+')
def doab_lang_to_iso_639_1(lang):
if lang is None or not lang:
return "xx"
else:
lang = sep.split(lang)[0]
code = get_language_code(lang)
if code:
return code
else:
return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
DOMAIN_TO_PROVIDER = dict([
[u'www.doabooks.org', u'Directory of Open Access Books'],
[u'www.oapen.org', u'OAPEN Library'],
[u'books.openedition.org', u'OpenEdition Books'],
[u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'],
[u'www.aupress.ca', u'Athabasca University Press'],
[u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'],
[u'www.degruyter.com', u'De Gruyter Online'],
[u'dx.doi.org', u'DOI Resolver'],
[u'www.openbookpublishers.com', u'Open Book Publishers'],
[u'www.adelaide.edu.au', u'University of Adelaide'],
[u'hdl.handle.net', u'Handle Proxy'],
[u'link.springer.com', u'Springer'],
[u'www.bloomsburyacademic.com', u'Bloomsbury Academic'],
[u'www.ledizioni.it', u'Ledizioni'],
[u'ccdigitalpress.org', u'Computers and Composition Digital Press'],
[u'leo.cilea.it', u'LEO '],
[u'www.springerlink.com', u'Springer'],
[u'www.palgraveconnect.com', u'Palgrave Connect'],
[u'www.ubiquitypress.com', u'Ubiquity Press'],
[u'ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'antropologie.zcu.cz', u'AntropoWeb'],
[u'www.unito.it', u"University of Turin"],
[u'leo.cineca.it', u'Letteratura Elettronica Online'],
[u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'],
[u'www.co-action.net', u'Co-Action Publishing'],
[u'www.aliprandi.org', u'Simone Aliprandi'],
[u'www.maestrantonella.it', u'maestrantonella.it'],
[u'www.antilia.to.it', u'antilia.to.it'],
[u'www.scribd.com', u'Scribd'],
[u'ledibooks.com', u'LediBooks'],
[u'press.openedition.org', u'OpenEdition Press'],
[u'oapen.org', u'OAPEN Library'],
[u'www.ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'windsor.scholarsportal.info', u'Scholars Portal'],
[u'www.unimib.it', u'University of Milano-Bicocca'],
[u'books.mdpi.com', u'MDPI Books'],
[u'www.dropbox.com', u'Dropbox'],
[u'dl.dropboxusercontent.com', u'Dropbox'],
])
def url_to_provider(url):
netloc = urlparse.urlparse(url).netloc
return DOMAIN_TO_PROVIDER.get(netloc, netloc)
FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')
def online_to_download(url):
urls = []
if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('object', type='application/pdf')
if obj:
urls.append(obj['data'].split('#')[0])
elif url.find(u'books.scielo.org/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('a', class_='pdf_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
obj = doc.find('a', class_='epub_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
elif FRONTIERSIN.search(url):
booknum = FRONTIERSIN.search(url).group(1)
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
else:
urls.append(url)
return urls
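doab_utils.py collects the language, provider, and download-link helpers the loader leans on. Illustrative calls, with made-up paths on real provider domains:

    from regluit.core.loaders.doab_utils import (
        doab_lang_to_iso_639_1, online_to_download, url_to_provider,
    )

    print doab_lang_to_iso_639_1('English')    # 'en', via django's LANGUAGES table
    print doab_lang_to_iso_639_1('englisch')   # 'en', via EXTRA_LANG_MAP
    print doab_lang_to_iso_639_1(None)         # 'xx'

    print url_to_provider('http://www.oapen.org/record/123456')   # 'OAPEN Library'
    print online_to_download('http://example.com/book.html')      # ['http://example.com/book.html']

online_to_download only rewrites the providers it knows how to scrape (MDPI, SciELO, Frontiers); anything else is passed through unchanged.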

core/loaders/tests.py (new file)

@ -0,0 +1,28 @@
from django.conf import settings
from django.test import TestCase
from regluit.core.models import Ebook, Edition, Work
from .utils import dl_online
class LoaderTests(TestCase):
def setUp(self):
pass
def test_downloads(self):
if not (settings.TEST_INTEGRATION):
return
work = Work(title="online work")
work.save()
edition = Edition(work=work)
edition.save()
dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0'
dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition)
dropbox_ebf = dl_online(dropbox_ebook)
self.assertTrue(dropbox_ebf.ebook.filesize)
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
jbe_ebf = dl_online(jbe_ebook)
self.assertTrue(jbe_ebf.ebook.filesize)


@ -1,15 +1,24 @@
import csv import csv
import re
import requests
import logging import logging
import re
import sys import sys
import time
import unicodedata import unicodedata
import urlparse
from bs4 import BeautifulSoup
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.api.crosswalks import inv_relator_contrib from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -34,6 +43,12 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s) s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper() return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
def get_soup(url):
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
return BeautifulSoup(response.content, 'lxml')
return None
def get_authors(book): def get_authors(book):
authors=[] authors=[]
if book.get('AuthorsList',''): if book.get('AuthorsList',''):
@ -331,14 +346,15 @@ def loaded_book_ok(book, work, edition):
return True return True
ID_URLPATTERNS = { ID_URLPATTERNS = {
'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'), 'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'), 'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'), 'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'), 'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'), 'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'), 'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'), 'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'), 'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
} }
def ids_from_urls(url): def ids_from_urls(url):
@ -346,7 +362,111 @@ def ids_from_urls(url):
for ident in ID_URLPATTERNS.keys(): for ident in ID_URLPATTERNS.keys():
id_match = ID_URLPATTERNS[ident].search(url) id_match = ID_URLPATTERNS[ident].search(url)
if id_match: if id_match:
ids[ident] = id_match.group(1) ids[ident] = id_match.group('id')
return ids return ids
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
if ebook.format != 'online':
return
if ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
obj = doc.select_one('div.fulltexticoncontainer-PDF a')
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
return EbookFile.objects.filter(source=ebook.url)[0]
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
filesize = int(response.headers.get("Content-Length", 0))
filesize = filesize if filesize else None
format = type_for_url(url, content_type=response.headers.get('content-type'))
if format != 'online':
new_ebf = EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
new_ebf.save()
new_ebook = Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return ('online')
ct = content_type if content_type else contenttyper.calc_type(url)
if re.search("pdf", ct):
return "pdf"
elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
return "pdf"
elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
return "epub"
elif re.search("text/plain", ct):
return "text"
elif re.search("text/html", ct):
if url.find('oapen.org/view') >= 0:
return "html"
return "online"
elif re.search("epub", ct):
return "epub"
elif re.search("mobi", ct):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):
self.last_call = dict()
def content_type(self, url):
try:
r = requests.head(url)
return r.headers.get('content-type')
except:
return None
def calc_type(self, url):
delay = 1
# is there a delay associated with the url
netloc = urlparse.urlparse(url).netloc
# wait if necessary
last_call = self.last_call.get(netloc)
if last_call is not None:
now = time.time()
min_time_next_call = last_call + delay
if min_time_next_call > now:
time.sleep(min_time_next_call-now)
self.last_call[netloc] = time.time()
# compute the content-type
return self.content_type(url)
contenttyper = ContentTyper()
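Besides the named-group ID patterns (including the new doab one), utils.py grows the dl_online/make_dl_ebook/ContentTyper machinery that turns 'online' ebooks into hosted files. A small sketch of how the pieces fit together, assuming a Django shell with some 'online' ebooks in the database; the rid is a made-up example:

    from regluit.core.loaders.utils import dl_online, ids_from_urls
    from regluit.core.models import Ebook

    print ids_from_urls('https://www.doabooks.org/doab?func=search&query=rid:15485')
    # {'doab': '15485'}

    for online in Ebook.objects.filter(format='online')[:5]:
        new_ebf = dl_online(online)   # download the file, wrap it in an EbookFile + new Ebook
        if new_ebf:
            print new_ebf.ebook.url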


@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_records(file_path, limit=int(limit))


@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook
class Command(BaseCommand):
help = "harvest downloadable ebooks from 'online' ebooks"
args = "<limit>"
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
onlines = Ebook.objects.filter(format='online')
done = 0
for online in onlines:
new_ebf = dl_online(online)
if new_ebf:
done += 1
if done > limit:
break
print 'harvested {} ebooks'.format(done)


@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books by doab_id via oai"
args = "<doab_id>"
def handle(self, doab_id, **options):
doab.add_by_doab(doab_id)


@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books via oai"
args = "<from_year> <limit>"
def handle(self, from_year= None, limit=None, **options):
from_year = int(from_year) if from_year else None
limit = int(limit) if limit else None
if limit:
doab.load_doab_oai(from_year=from_year, limit=limit)
else:
if from_year:
doab.load_doab_oai(from_year=from_year)
else:
doab.load_doab_oai()


@ -1083,7 +1083,7 @@ class EbookFile(models.Model):
source=self.file.url source=self.file.url
) )
new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf) new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
new_mobi_ebf.save() new_mobi_ebf.save()
if self.ebook: if self.ebook:
new_ebook = Ebook.objects.create( new_ebook = Ebook.objects.create(

File diff suppressed because it is too large


@ -21,6 +21,7 @@ from regluit.core.bookloader import (
from regluit.core.parameters import WORK_IDENTIFIERS from regluit.core.parameters import WORK_IDENTIFIERS
from regluit.core.loaders import add_by_webpage from regluit.core.loaders import add_by_webpage
from regluit.core.loaders.doab import add_by_doab
from regluit.core.loaders.utils import ids_from_urls from regluit.core.loaders.utils import ids_from_urls
from regluit.frontend.forms import EditionForm, IdentifierForm from regluit.frontend.forms import EditionForm, IdentifierForm
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
if edition: if edition:
return user_edition(edition, user) return user_edition(edition, user)
if identifiers.has_key('doab'):
edition = add_by_doab(identifiers['doab'])
if edition:
return user_edition(edition, user)
if identifiers.has_key('oclc'): if identifiers.has_key('oclc'):
edition = add_by_oclc(identifiers['oclc']) edition = add_by_oclc(identifiers['oclc'])
if edition: if edition:


@ -6,16 +6,11 @@ PyJWT==1.4.1
PyPDF2==1.23 PyPDF2==1.23
PyGithub==1.15.0 PyGithub==1.15.0
PyYAML==3.11 PyYAML==3.11
git+git://github.com/urschrei/pyzotero.git@v0.9.51
SPARQLWrapper==1.6.4
WebOb==1.2.3
WebTest==1.4.0
amqp==1.4.9 amqp==1.4.9
anyjson==0.3.3 anyjson==0.3.3
billiard==3.3.0.23 billiard==3.3.0.23
awscli==1.10.26 awscli==1.10.26
boto==2.42.0 boto==2.42.0
#git+ssh://git@github.com/Gluejar/boto.git@2.3.0
celery==3.1.23 celery==3.1.23
certifi==2016.2.28 certifi==2016.2.28
# pip installing pillow seems to delete distribute # pip installing pillow seems to delete distribute
@ -33,7 +28,6 @@ django-jsonfield==1.0.0
#django-kombu==0.9.4 #django-kombu==0.9.4
django-maintenancemode==0.11.2 django-maintenancemode==0.11.2
django-mptt==0.8.5 django-mptt==0.8.5
#django-nose-selenium==0.7.3
#django-notification==0.2 #django-notification==0.2
git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6 git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6
django-registration==2.1.2 django-registration==2.1.2
@ -42,9 +36,7 @@ django-smtp-ssl==1.0
django-storages==1.4.1 django-storages==1.4.1
django-tastypie==0.13.3 django-tastypie==0.13.3
django-transmeta==0.7.3 django-transmeta==0.7.3
feedparser==5.1.2
fef-questionnaire==4.0.1 fef-questionnaire==4.0.1
freebase==1.0.8
#gitenberg.metadata==0.1.6 #gitenberg.metadata==0.1.6
git+https://github.com/gitenberg-dev/gitberg-build git+https://github.com/gitenberg-dev/gitberg-build
#git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11 #git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
@ -53,7 +45,7 @@ html5lib==1.0b3
httplib2==0.7.5 httplib2==0.7.5
isodate==0.5.1 isodate==0.5.1
kombu==3.0.35 kombu==3.0.35
lxml==2.3.5 lxml==4.2.1
defusedxml==0.4.1 defusedxml==0.4.1
mechanize==0.2.5 mechanize==0.2.5
mimeparse==0.1.3 mimeparse==0.1.3
@ -66,6 +58,7 @@ paramiko==1.14.1
postmonkey==1.0b postmonkey==1.0b
pycrypto==2.6 pycrypto==2.6
pymarc==3.0.2 pymarc==3.0.2
pyoai==2.5.0
pyparsing==2.0.3 pyparsing==2.0.3
python-dateutil==2.5.3 python-dateutil==2.5.3
python-mimeparse==0.1.4 python-mimeparse==0.1.4
@ -80,7 +73,7 @@ requests==2.10.0
requests-mock==1.2.0 requests-mock==1.2.0
requests-oauthlib==0.6.2 requests-oauthlib==0.6.2
selenium==2.53.1 selenium==2.53.1
six==1.9.0 six==1.11.0
sorl-thumbnail==12.3 sorl-thumbnail==12.3
ssh==1.7.14 ssh==1.7.14
stevedore==1.12.0 stevedore==1.12.0


@ -1,6 +1,10 @@
from django.conf.global_settings import LANGUAGES from django.conf.global_settings import LANGUAGES
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ]) lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
code2lang = dict(LANGUAGES)
def get_language_code(language): def get_language_code(language):
return lang2code.get(language.lower().strip(), '') language = language.lower().strip()
if language in code2lang:
return language
return lang2code.get(language, '')
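The tweak to get_language_code means an ISO 639-1 code now passes through unchanged instead of falling back to the empty string, so callers can hand it either a code or a full language name. For example:

    from regluit.utils.lang import get_language_code

    print get_language_code('English')   # 'en'
    print get_language_code('en')        # 'en'  (previously '')
    print get_language_code('klingon')   # ''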