# regluit/core/bookloader.py

"""
external library imports
"""
import json
import logging
import re
import requests
2017-08-07 20:17:00 +00:00
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from regluit.core.validation import test_file
from datetime import timedelta
2013-06-03 16:31:39 +00:00
from xml.etree import ElementTree
from urlparse import (urljoin, urlparse)
2017-07-28 16:45:17 +00:00
# django imports
2011-11-06 22:42:09 +00:00
from django.conf import settings
2017-08-07 20:17:00 +00:00
from django.core.files.base import ContentFile
from django_comments.models import Comment
2013-06-03 16:31:39 +00:00
from django.db import IntegrityError
2017-08-07 20:17:00 +00:00
from django.forms import ValidationError
from github3 import (login, GitHub)
from github3.repos.release import Release
2015-08-03 14:04:07 +00:00
from gitenberg.metadata.pandata import Pandata
2015-07-30 03:01:43 +00:00
2017-07-28 16:45:17 +00:00
# regluit imports
import regluit
import regluit.core.isbn
from regluit.marc.models import inverse_marc_rels
2013-06-03 16:31:39 +00:00
from regluit.utils.localdatetime import now
from . import cc
from . import models
from .parameters import WORK_IDENTIFIERS
from .validation import identifier_cleaner
2017-08-23 16:21:56 +00:00
from .loaders.scrape import BaseScraper, scrape_sitemap
logger = logging.getLogger(__name__)
request_log = logging.getLogger("requests")
request_log.setLevel(logging.WARNING)


def add_by_oclc(oclc, work=None):
    # this is indirection in case we have a data source other than google
    return add_by_oclc_from_google(oclc)


def add_by_oclc_from_google(oclc):
    if oclc:
        logger.info("adding book by oclc %s", oclc)
    else:
        return None
    try:
        return models.Identifier.objects.get(type='oclc', value=oclc).edition
    except models.Identifier.DoesNotExist:
        url = "https://www.googleapis.com/books/v1/volumes"
        try:
            results = _get_json(url, {"q": '"OCLC%s"' % oclc})
        except LookupFailure:
            logger.exception("lookup failure for %s", oclc)
            return None
        if 'items' not in results or len(results['items']) == 0:
            logger.warning("no google hits for %s", oclc)
            return None

        try:
            e = add_by_googlebooks_id(results['items'][0]['id'], results=results['items'][0])
            models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
            return e
        except LookupFailure:
            logger.exception("failed to add edition for %s", oclc)
        except IntegrityError:
            logger.exception("google books data for %s didn't fit our db", oclc)
        return None


def valid_isbn(isbn):
    try:
        return identifier_cleaner('isbn')(isbn)
    except Exception:
        logger.exception("invalid isbn: %s", isbn)
        return None


def add_by_isbn(isbn, work=None, language='xx', title=''):
    if not isbn:
        return None
    try:
        e = add_by_isbn_from_google(isbn, work=work)
    except LookupFailure:
        logger.exception("failed google lookup for %s", isbn)
        # try again some other time
        return None
    if e:
        return e

    logger.info("null came back from add_by_isbn_from_google: %s", isbn)

    # if there's a title, we want to create stub editions and works even if
    # google doesn't know about it; but if the isbn is not valid, forget it!
    if work:
        title = work.title if work.title else title
    if not title:
        return None

    isbn = valid_isbn(isbn)
    if not isbn:
        return None

    if not language or language == 'xx':  # don't add unknown language
        # we don't know the language -> 'xx'
        work = models.Work(title=title, language='xx')
        work.save()
    elif not work:
        work = models.Work(title=title, language=language)
        work.save()
    e = models.Edition(title=title, work=work)
    e.save()
    e.new = True
    models.Identifier(type='isbn', value=isbn, work=work, edition=e).save()
    return e
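
# Illustrative use of the add-by-isbn path (hypothetical ISBN and title; real
# results depend on the live Google Books API and the current db contents):
#
#     edition = add_by_isbn('9780441569595', title='Neuromancer')
#     if edition and getattr(edition, 'new', False):
#         logger.info("created edition %s on work %s", edition.id, edition.work_id)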


def get_google_isbn_results(isbn):
    url = "https://www.googleapis.com/books/v1/volumes"
    try:
        results = _get_json(url, {"q": "isbn:%s" % isbn})
    except LookupFailure:
        logger.exception("lookup failure for %s", isbn)
        return None
    if 'items' not in results or len(results['items']) == 0:
        logger.warning("no google hits for %s", isbn)
        return None
    return results


def add_ebooks(item, edition):
    access_info = item.get('accessInfo')
    if access_info:
        epub = access_info.get('epub')
        if epub and epub.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='epub',
                                 url=epub.get('downloadLink'),
                                 provider='Google Books')
            try:
                ebook.save()
            except IntegrityError:
                pass

        pdf = access_info.get('pdf')
        if pdf and pdf.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='pdf',
                                 url=pdf.get('downloadLink'),
                                 provider='Google Books')
            try:
                ebook.save()
            except IntegrityError:
                pass


def update_edition(edition):
    """
    attempt to update data associated with input edition and return that updated edition
    """
    # if there is no ISBN associated with edition, just return the input edition
    try:
        isbn = edition.identifiers.filter(type='isbn')[0].value
    except (models.Identifier.DoesNotExist, IndexError):
        return edition

    # do a Google Books lookup on the isbn associated with the edition (there
    # should be either 0 or 1 isbns associated with an edition because of the
    # integrity constraint in Identifier); if we get some data about this isbn
    # back from Google, update the edition data accordingly
    results = get_google_isbn_results(isbn)
    if not results:
        return edition

    item = results['items'][0]
    googlebooks_id = item['id']
    d = item['volumeInfo']
    title = d.get('title', '')
    if len(title) == 0:
        # need a title to make an edition record; there are some crap records
        # in GB, so use the title from the parent work if available
        title = edition.work.title

    # check for language change
    language = d['language']
    # allow variants in main language (e.g., 'zh-tw')
    if len(language) > 5:
        language = language[0:5]

    # if the language of the edition no longer matches that of the parent work,
    # attach the edition to a new work in the right language
    if edition.work.language != language:
        logger.info("reconnecting %s since it is %s instead of %s",
                    googlebooks_id, language, edition.work.language)
        old_work = edition.work

        new_work = models.Work(title=title, language=language)
        new_work.save()
        edition.work = new_work
        edition.save()
        for identifier in edition.identifiers.all():
            logger.info("moving identifier %s", identifier.value)
            identifier.work = new_work
            identifier.save()
        if old_work and old_work.editions.count() == 0:
            # a dangling work; make sure nothing else is attached!
            merge_works(new_work, old_work)

    # update the edition
    edition.title = title
    edition.publication_date = d.get('publishedDate', '')
    edition.set_publisher(d.get('publisher'))
    edition.save()

    # create identifier if needed
    models.Identifier.get_or_add(type='goog', value=googlebooks_id,
                                 edition=edition, work=edition.work)
    for a in d.get('authors', []):
        edition.add_author(a)
    add_ebooks(item, edition)
    return edition


def add_by_isbn_from_google(isbn, work=None):
    """add a book to the UnglueIt database from google based on ISBN. The work parameter
    is optional, and if not supplied the edition will be associated with
    a stub work.
    """
    if not isbn:
        return None
    if len(isbn) == 10:
        isbn = regluit.core.isbn.convert_10_to_13(isbn)

    # check if we already have this isbn
    edition = get_edition_by_id(type='isbn', value=isbn)
    if edition:
        edition.new = False
        return edition

    logger.info("adding new book by isbn %s", isbn)
    results = get_google_isbn_results(isbn)
    if results:
        try:
            return add_by_googlebooks_id(
                results['items'][0]['id'], work=work,
                results=results['items'][0], isbn=isbn)
        except LookupFailure:
            logger.exception("failed to add edition for %s", isbn)
        except IntegrityError:
            logger.exception("google books data for %s didn't fit our db", isbn)
        return None
    return None


def get_work_by_id(type, value):
    if value:
        try:
            return models.Identifier.objects.get(type=type, value=value).work
        except models.Identifier.DoesNotExist:
            return None


def get_edition_by_id(type, value):
    if value:
        try:
            return models.Identifier.objects.get(type=type, value=value).edition
        except models.Identifier.DoesNotExist:
            return None


def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
    """add a book to the UnglueIt database based on the GoogleBooks ID. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work. an isbn can be passed in because the passed
    results data sometimes won't include it
    """
    isbn = valid_isbn(isbn)

    # don't ping google again if we already know about the edition
    try:
        edition = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        edition.new = False
        if isbn:
            # check that the isbn is in db; if not, then there are two isbns for the edition
            try:
                models.Identifier.objects.get(type='isbn', value=isbn).edition
                # not going to worry about isbn_edition != edition
            except models.Identifier.DoesNotExist:
                models.Identifier.objects.create(type='isbn', value=isbn,
                                                 edition=edition, work=edition.work)
        return edition
    except models.Identifier.DoesNotExist:
        pass

    # if google has been queried by the caller, don't call again
    if results:
        item = results
    else:
        logger.info("loading metadata from google for %s", googlebooks_id)
        url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
        item = _get_json(url)
    d = item['volumeInfo']

    title = d.get('title', '')
    if len(title) == 0:
        # need a title to make an edition record; there are some crap records
        # in GB, so use the title from the parent work if available
        if work:
            title = work.title
        else:
            return None

    # don't add the edition to a work with a different language
    # https://www.pivotaltracker.com/story/show/17234433
    language = d['language']
    if len(language) > 5:
        language = language[0:5]
    if work and work.language != language:
        logger.info("not connecting %s since it is %s instead of %s",
                    googlebooks_id, language, work.language)
        work = None
        # isbn = None

    if not isbn:
        for i in d.get('industryIdentifiers', []):
            if i['type'] == 'ISBN_10' and not isbn:
                isbn = regluit.core.isbn.convert_10_to_13(i['identifier'])
            elif i['type'] == 'ISBN_13':
                isbn = i['identifier']

    # now check to see if there's an existing Work
    if work:
        work.new = False
    if isbn and not work:
        work = get_work_by_id(type='isbn', value=isbn)
        if work:
            work.new = False
    if not work:
        work = models.Work.objects.create(title=title, language=language)
        work.new = True
        work.save()

    # going off to google can take some time, so we want to make sure this edition
    # has not been created in another thread while we were waiting
    try:
        e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        e.new = False
        logger.warning("whoa nellie, somebody else created an edition while we were working.")
        if work.new:
            work.delete()
        return e
    except models.Identifier.DoesNotExist:
        pass

    # because this is a new google id, we have to create a new edition
    e = models.Edition(work=work)
    e.title = title
    e.publication_date = d.get('publishedDate', '')
    e.set_publisher(d.get('publisher'))
    e.save()
    e.new = True

    # create identifiers where needed
    models.Identifier(type='goog', value=googlebooks_id, edition=e, work=work).save()
    if isbn:
        models.Identifier.get_or_add(type='isbn', value=isbn, edition=e, work=work)

    # add_author does its own Author get_or_create, so pass the name through
    for a in d.get('authors', []):
        e.add_author(a)
    add_ebooks(item, e)
    return e


def relate_isbn(isbn, cluster_size=1):
    """add a book by isbn and then see if there's an existing work to add it to,
    so as to make a cluster bigger than cluster_size.
    """
    logger.info("finding a related work for %s", isbn)
    edition = add_by_isbn(isbn)
    if edition is None:
        return None
    if edition.work is None:
        logger.info("didn't add related to null work")
        return None
    if edition.work.editions.count() > cluster_size:
        return edition.work
    for other_isbn in thingisbn(isbn):
        # 979's come back as 13
        logger.debug("other_isbn: %s", other_isbn)
        if len(other_isbn) == 10:
            other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
        related_edition = add_by_isbn(other_isbn, work=edition.work)
        if related_edition:
            related_language = related_edition.work.language
            if edition.work.language == related_language:
                if related_edition.work is None:
                    related_edition.work = edition.work
                    related_edition.save()
                elif related_edition.work.id != edition.work.id:
                    logger.debug("merge_works path 1 %s %s",
                                 edition.work.id, related_edition.work.id)
                    merge_works(related_edition.work, edition.work)
                if related_edition.work.editions.count() > cluster_size:
                    return related_edition.work
    return edition.work
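
# Sketch of a typical call (hypothetical ISBN; LibraryThing and Google results
# vary): pull in one edition, then absorb related ISBNs until the work has a
# cluster of editions bigger than cluster_size:
#
#     work = relate_isbn('9780441569595', cluster_size=2)
#     if work:
#         logger.info("work %s now has %s editions", work.id, work.editions.count())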


def add_related(isbn):
    """add all books related to a particular ISBN to the UnglueIt database.
    The initial seed ISBN will be added if it's not already there.
    """
    # make sure the seed edition is there
    logger.info("adding related editions for %s", isbn)
    new_editions = []
    edition = add_by_isbn(isbn)
    if edition is None:
        return new_editions
    if edition.work is None:
        logger.warning("didn't add related to null work")
        return new_editions

    # this is the work everything will hang off
    work = edition.work
    other_editions = {}
    for other_isbn in thingisbn(isbn):
        # 979's come back as 13
        logger.debug("other_isbn: %s", other_isbn)
        if len(other_isbn) == 10:
            other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
        related_edition = add_by_isbn(other_isbn, work=work)
        if related_edition:
            related_language = related_edition.work.language
            if edition.work.language == related_language:
                new_editions.append(related_edition)
                if related_edition.work is None:
                    related_edition.work = work
                    related_edition.save()
                elif related_edition.work.id != work.id:
                    logger.debug("merge_works path 1 %s %s", work.id, related_edition.work.id)
                    merge_works(work, related_edition.work)
            else:
                if related_language in other_editions:
                    other_editions[related_language].append(related_edition)
                else:
                    other_editions[related_language] = [related_edition]

    # group the other language editions together
    for lang_group in other_editions.itervalues():
        logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work.id) for ed in lang_group])
        if len(lang_group) > 1:
            lang_edition = lang_group[0]
            logger.debug("lang_edition.id: %s", lang_edition.id)
            # compute the distinct set of works to merge into lang_edition.work
            works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
            for w in works_to_merge:
                logger.debug("merge_works path 2 %s %s", lang_edition.work.id, w.id)
                merge_works(lang_edition.work, w)
            models.WorkRelation.objects.get_or_create(
                to_work=lang_edition.work, from_work=work, relation='translation')

    return new_editions


def thingisbn(isbn):
    """given an ISBN return a list of related edition ISBNs, according to
    Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for
    979 isbns, which come back as isbn_13)
    """
    logger.info("looking up %s at ThingISBN", isbn)
    url = "https://www.librarything.com/api/thingISBN/%s" % isbn
    xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
    doc = ElementTree.fromstring(xml)
    return [e.text for e in doc.findall('isbn')]
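
# Example of the shape of a thingISBN response (hypothetical values; the live
# LibraryThing API decides what actually comes back):
#
#     >>> thingisbn('9780441569595')
#     ['0441569595', '0586066632', ...]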


def merge_works(w1, w2, user=None):
    """will merge the second work (w2) into the first (w1)
    """
    logger.info("merging work %s into %s", w2.id, w1.id)
    # don't merge if the works are the same or at least one of the works has no id
    # (for example, when w2 has already been deleted)
    if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
        return
    if w2.selected_edition is not None and w1.selected_edition is None:
        # the merge should be reversed
        (w1, w2) = (w2, w1)

    models.WasWork(was=w2.pk, work=w1, user=user).save()
    for ww in models.WasWork.objects.filter(work=w2):
        ww.work = w1
        ww.save()

    if w2.description and not w1.description:
        w1.description = w2.description
    if w2.featured and not w1.featured:
        w1.featured = w2.featured
    if w2.is_free and not w1.is_free:
        w1.is_free = True
    if w2.age_level and not w1.age_level:
        w1.age_level = w2.age_level
    w1.save()

    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        w2source = wishlist.work_source(w2)
        wishlist.remove_work(w2)
        wishlist.add_work(w1, w2source)
    for identifier in w2.identifiers.all():
        identifier.work = w1
        identifier.save()
    for comment in Comment.objects.for_model(w2):
        comment.object_pk = w1.pk
        comment.save()
    for edition in w2.editions.all():
        edition.work = w1
        edition.save()
    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()
    for claim in w2.claim.all():
        claim.work = w1
        claim.dont_notify = True
        claim.save()
    for offer in w2.offers.all():
        offer.work = w1
        offer.save()
    for acq in w2.acqs.all():
        acq.work = w1
        acq.save()
    for hold in w2.holds.all():
        hold.work = w1
        hold.save()
    for landing in w2.landings.all():
        landing.object_id = w1.id
        landing.save()
    for subject in w2.subjects.all():
        if subject not in w1.subjects.all():
            w1.subjects.add(subject)

    w2.delete()
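
# A minimal sketch of a merge (assumes w1 and w2 are saved Work instances
# describing the same book): everything hanging off w2 -- editions,
# identifiers, wishlists, campaigns, comments -- is repointed at w1, a WasWork
# breadcrumb is left behind, and w2 is deleted:
#
#     merge_works(w1, w2)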


def detach_edition(e):
    """will detach edition from its work, creating a new stub work
    """
    logger.info("splitting edition %s from %s", e, e.work)
    w = models.Work(title=e.title, language=e.work.language)
    w.save()

    for identifier in e.identifiers.all():
        identifier.work = w
        identifier.save()

    e.work = w
    e.save()


def despam_description(description):
    """ a lot of descriptions from openlibrary have free-book promotion text;
    this removes some of it."""
    if description.find("GeneralBooksClub.com") > -1 or description.find("AkashaPublishing.Com") > -1:
        return ""
    pieces = description.split("1stWorldLibrary.ORG -")
    if len(pieces) > 1:
        return pieces[1]
    pieces = description.split("a million books for free.")
    if len(pieces) > 1:
        return pieces[1]
    return description
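
# Expected behavior, derived from the branches above:
#
#     >>> despam_description("GeneralBooksClub.com promo text")
#     ''
#     >>> despam_description("read a million books for free. A real description.")
#     ' A real description.'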


def add_openlibrary(work, hard_refresh=False):
    if (not hard_refresh) and work.openlibrary_lookup is not None:
        # don't hit OL if we've visited in the past month or so
        if now() - work.openlibrary_lookup < timedelta(days=30):
            return
    work.openlibrary_lookup = now()
    work.save()

    # find the first ISBN match in OpenLibrary
    logger.info("looking up openlibrary data for work %s", work.id)

    e = None  # openlibrary edition json
    w = None  # openlibrary work json

    # get the 1st openlibrary match by isbn that has an associated work
    url = "https://openlibrary.org/api/books"
    params = {"format": "json", "jscmd": "details"}
    subjects = []
    for edition in work.editions.all():
        isbn_key = "ISBN:%s" % edition.isbn_13
        params['bibkeys'] = isbn_key
        try:
            e = _get_json(url, params, type='ol')
        except LookupFailure:
            logger.exception("OL lookup failed for %s", isbn_key)
            e = {}
        if isbn_key in e and 'details' in e[isbn_key]:
            details = e[isbn_key]['details']
            if 'oclc_numbers' in details:
                for oclcnum in details['oclc_numbers']:
                    models.Identifier.get_or_add(type='oclc', value=oclcnum,
                                                 work=work, edition=edition)
            if 'identifiers' in details:
                ids = details['identifiers']
                if 'goodreads' in ids:
                    models.Identifier.get_or_add(type='gdrd', value=ids['goodreads'][0],
                                                 work=work, edition=edition)
                if 'librarything' in ids:
                    models.Identifier.get_or_add(type='ltwk', value=ids['librarything'][0], work=work)
                if 'google' in ids:
                    models.Identifier.get_or_add(type='goog', value=ids['google'][0], work=work)
                if 'project_gutenberg' in ids:
                    models.Identifier.get_or_add(type='gute', value=ids['project_gutenberg'][0], work=work)
            if 'works' in details:
                work_key = details['works'].pop(0)['key']
                logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
                models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
                try:
                    w = _get_json("https://openlibrary.org" + work_key, type='ol')
                    if 'description' in w:
                        description = w['description']
                        if isinstance(description, dict) and 'value' in description:
                            description = description['value']
                        description = despam_description(description)
                        if (not work.description or work.description.startswith('{')
                                or len(description) > len(work.description)):
                            work.description = description
                            work.save()
                    if 'subjects' in w and len(w['subjects']) > len(subjects):
                        subjects = w['subjects']
                except LookupFailure:
                    logger.exception("OL lookup failed for %s", work_key)
    if not subjects:
        logger.warning("unable to find work %s at openlibrary", work.id)
        return

    # add the subjects to the Work
    for s in subjects:
        if valid_subject(s):
            logger.info("adding subject %s to work %s", s, work.id)
            subject, created = models.Subject.objects.get_or_create(name=s)
            work.subjects.add(subject)

    work.save()


def valid_xml_char_ordinal(c):
    codepoint = ord(c)
    # conditions ordered by presumed frequency
    return (
        0x20 <= codepoint <= 0xD7FF or
        codepoint in (0x9, 0xA, 0xD) or
        0xE000 <= codepoint <= 0xFFFD or
        0x10000 <= codepoint <= 0x10FFFF
    )


def valid_subject(subject_name):
    num_commas = 0
    for c in subject_name:
        if not valid_xml_char_ordinal(c):
            return False
        if c == ',':
            num_commas += 1
            if num_commas > 2:
                return False
    return True
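
# Examples derived from the rules above (all characters must be valid XML,
# and more than two commas disqualify a heading):
#
#     >>> valid_subject('Science fiction')
#     True
#     >>> valid_subject('Fiction, American, 20th century, History')
#     False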


def _get_json(url, params=None, type='gb'):
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}
    # use a fresh dict per call; a mutable default argument would leak the
    # google api key into subsequent calls
    params = params if params is not None else {}
    if type == 'gb':
        params['key'] = settings.GOOGLE_BOOKS_API_KEY
        params['country'] = 'us'
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)
    logger.error("unexpected HTTP response: %s", response)
    if response.content:
        logger.error("response content: %s", response.content)
    raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
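
# Typical call, as used throughout this module (live request, so output
# varies; type='ol' skips the Google API key parameters):
#
#     results = _get_json("https://www.googleapis.com/books/v1/volumes",
#                         {"q": "isbn:9780441569595"})
#     # raises LookupFailure on any non-200 response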


def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn,
                           url, format, license, lang, publication_date):
    # let's start with instantiating the relevant Work and Edition if they don't already exist
    try:
        work = models.Identifier.objects.get(type='olwk', value=ol_work_id).work
    except models.Identifier.DoesNotExist:
        # try to find an Edition with the seed_isbn and use that work to hang off of
        sister_edition = add_by_isbn(seed_isbn)
        if sister_edition.new:
            # add related editions asynchronously
            regluit.core.tasks.populate_edition.delay(sister_edition.isbn_13)
        work = sister_edition.work
        # attach the olwk identifier to this work if it's not None
        if ol_work_id is not None:
            models.Identifier.get_or_add(type='olwk', value=ol_work_id, work=work)

    # now pull out any existing Gutenberg editions tied to the work with the proper Gutenberg ID
    try:
        edition = models.Identifier.objects.get(type='gtbg', value=gutenberg_etext_id).edition
    except models.Identifier.DoesNotExist:
        edition = models.Edition()
        edition.title = title
        edition.work = work
        edition.save()
        models.Identifier.get_or_add(type='gtbg', value=gutenberg_etext_id,
                                     edition=edition, work=work)

    # check to see whether the Edition hasn't already been loaded first; search by url
    ebooks = models.Ebook.objects.filter(url=url)

    # format: what's the controlled vocab? -- from Google -- alternative would be mimetype
    if len(ebooks) > 1:
        logger.warning("There is more than one Ebook matching url {0}".format(url))
    if len(ebooks):
        ebook = ebooks[0]
    else:  # need to create a new ebook
        ebook = models.Ebook()
    ebook.format = format
    ebook.provider = 'Project Gutenberg'
    ebook.url = url
    ebook.rights = license

    # is an Ebook instantiable without a corresponding Edition? (No, I think)
    ebook.edition = edition
    ebook.save()

    return ebook


class LookupFailure(Exception):
    pass


IDTABLE = [
    ('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olwk'),
    ('gutenberg', 'gtbg'), ('isbn', 'isbn'), ('oclc', 'oclc'),
    ('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'),
]


def unreverse(name):
    if ',' not in name:
        return name
    (last, rest) = name.split(',', 1)
    if ',' not in rest:
        return '%s %s' % (rest.strip(), last.strip())
    (first, rest) = rest.split(',', 1)
    return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())
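
# Expected behavior, derived from the splits above:
#
#     >>> unreverse('Twain, Mark')
#     'Mark Twain'
#     >>> unreverse('Gibbon, Edward, 1737-1794')
#     'Edward Gibbon, 1737-1794'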


def load_from_yaml(yaml_url, test_mode=False):
    """
    This really should be called 'load_from_github_yaml'

    if test_mode is True, don't construct the list of ebooks from a GitHub
    release -- use a stub epub instead
    """
    all_metadata = Pandata(yaml_url)
    loader = GithubLoader(yaml_url)
    edition = None
    for metadata in all_metadata.get_edition_list():
        edition = loader.load_from_pandata(metadata)
        loader.load_ebooks(metadata, edition, test_mode)
    return edition.work.id if edition else None


def edition_for_ident(id_type, id_value):
    #print 'returning edition for {}: {}'.format(id_type, id_value)
    for ident in models.Identifier.objects.filter(type=id_type, value=id_value):
        return ident.edition if ident.edition else ident.work.editions.all()[0]


def edition_for_etype(etype, metadata, default=None):
    '''
    assumes the metadata contains the isbn_etype attributes, and that the
    editions have been created. etype is 'epub', 'pdf', etc.
    '''
    isbn = metadata.identifiers.get('isbn_{}'.format(etype), None)
    if not isbn:
        isbn = metadata.identifiers.get('isbn_electronic', None)
    if isbn:
        return edition_for_ident('isbn', isbn)
    if default:
        return default
    # just return some edition
    for key in metadata.identifiers.keys():
        return edition_for_ident(key, metadata.identifiers[key])
    for key in metadata.edition_identifiers.keys():
        return edition_for_ident(key, metadata.edition_identifiers[key])


MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
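
# MATCH_LICENSE pulls the license code out of a Creative Commons URL; e.g. a
# (hypothetical) rights_url of 'https://creativecommons.org/licenses/by-sa/4.0/'
# yields group(1) == 'by-sa', which load_ebooks below renders as 'CC BY-SA'.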


def load_ebookfile(url, etype):
    '''
    return a ContentFile if a new ebook has been loaded
    '''
    ebfs = models.EbookFile.objects.filter(source=url)
    if ebfs:
        return None
    try:
        r = requests.get(url)
        contentfile = ContentFile(r.content)
        test_file(contentfile, etype)
        return contentfile
    except IOError:
        logger.error(u'could not open {}'.format(url))
    except ValidationError:
        logger.error(u'downloaded {} was not a valid {}'.format(url, etype))


class BasePandataLoader(object):
    def __init__(self, url):
        self.base_url = url

    def load_from_pandata(self, metadata, work=None):
        ''' metadata is a Pandata object'''

        # find a work to associate
        edition = None
        has_ed_id = False
        if metadata.url:
            new_ids = [('http', 'http', metadata.url)]
        else:
            new_ids = []
        for (identifier, id_code) in IDTABLE:
            # note that the work chosen is the last associated
            value = metadata.edition_identifiers.get(identifier, None)
            value = identifier_cleaner(id_code)(value)
            if not value:
                value = metadata.identifiers.get(identifier, None)
            if value:
                if id_code not in WORK_IDENTIFIERS:
                    has_ed_id = True
                value = value[0] if isinstance(value, list) else value
                try:
                    ident = models.Identifier.objects.get(type=id_code, value=value)
                    work = ident.work
                    if ident.edition and not edition:
                        edition = ident.edition
                except models.Identifier.DoesNotExist:
                    if id_code != 'edid' or not has_ed_id:  # 'edid' is last in IDTABLE
                        # only need to create an edid if there is no edition id for the edition
                        new_ids.append((identifier, id_code, value))

        if not work:
            work = models.Work.objects.create(title=metadata.title, language=metadata.language)
        if not edition:
            if metadata.edition_note:
                (note, created) = models.EditionNote.objects.get_or_create(note=metadata.edition_note)
            else:
                note = None
            edition = models.Edition.objects.create(
                title=metadata.title,
                work=work,
                note=note,
            )
        for (identifier, id_code, value) in new_ids:
            models.Identifier.set(
                type=id_code,
                value=value,
                edition=edition if id_code not in WORK_IDENTIFIERS else None,
                work=work,
            )
        if metadata.publisher:  # always believe yaml
            edition.set_publisher(metadata.publisher)
        if metadata.publication_date:  # always believe yaml
            edition.publication_date = metadata.publication_date
        if metadata.description and len(metadata.description) > len(work.description):
            # be careful about overwriting the work description
            work.description = metadata.description
        if metadata.creator and not edition.authors.count():
            edition.authors.clear()
            for key in metadata.creator.keys():
                creators = metadata.creator[key]
                rel_code = inverse_marc_rels.get(key, 'aut')
                creators = creators if isinstance(creators, list) else [creators]
                for creator in creators:
                    edition.add_author(unreverse(creator.get('agent_name', '')), relation=rel_code)
        for yaml_subject in metadata.subjects:  # always add yaml subjects (don't clear)
            if isinstance(yaml_subject, tuple):
                (authority, heading) = yaml_subject
            elif isinstance(yaml_subject, str):
                (authority, heading) = ('', yaml_subject)
            else:
                continue
            (subject, created) = models.Subject.objects.get_or_create(name=heading)
            if not subject.authority and authority:
                subject.authority = authority
                subject.save()
            subject.works.add(work)

        # the default edition uses the first cover in covers
        for cover in metadata.covers:
            if cover.get('image_path', False):
                edition.cover_image = urljoin(self.base_url, cover['image_path'])
                break
            elif cover.get('image_url', False):
                edition.cover_image = cover['image_url']
                break
        work.save()
        edition.save()
        return edition

    def load_ebooks(self, metadata, edition, test_mode=False, user=None):
        default_edition = edition
        for key in ['epub', 'pdf', 'mobi']:
            url = metadata.metadata.get('download_url_{}'.format(key), None)
            if url:
                edition = edition_for_etype(key, metadata, default=default_edition)
                if edition:
                    contentfile = load_ebookfile(url, key)
                    if contentfile:
                        contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
                        path = default_storage.save(contentfile_name, contentfile)
                        # guard against a missing rights_url
                        lic = MATCH_LICENSE.search(metadata.rights_url) if metadata.rights_url else None
                        license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
                        ebf = models.EbookFile.objects.create(
                            format=key,
                            edition=edition,
                            source=url,
                        )
                        ebf.file.save(contentfile_name, contentfile)
                        ebf.file.close()
                        ebook = models.Ebook.objects.create(
                            url=ebf.file.url,
                            provider='Unglue.it',
                            rights=license,
                            format=key,
                            edition=edition,
                            filesize=contentfile.size,
                            active=False,
                            user=user,
                        )
                        ebf.ebook = ebook
                        ebf.save()


class GithubLoader(BasePandataLoader):
    def load_ebooks(self, metadata, edition, test_mode=False):
        # create an Ebook for any ebook in the corresponding GitHub release
        # assuming a yaml_url of this form (from GitHub, though not necessarily GITenberg):
        # https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml
        url_path = urlparse(self.base_url).path.split("/")
        (repo_owner, repo_name) = (url_path[1], url_path[2])
        repo_tag = metadata._version

        # allow for there not to be a token in the settings
        try:
            token = settings.GITHUB_PUBLIC_TOKEN
        except AttributeError:
            token = None

        if metadata._version and not metadata._version.startswith('0.0.'):
            # use the GitHub API to compute the ebooks in the release, unless we're in test mode
            if test_mode:
                # not using ebook_name in this code
                ebooks_in_release = [('epub', 'book.epub')]
            else:
                ebooks_in_release = ebooks_in_github_release(repo_owner, repo_name,
                                                             repo_tag, token=token)
            for (ebook_format, ebook_name) in ebooks_in_release:
                (book_name_prefix, _) = re.search(r'(.*)\.([^\.]*)$', ebook_name).groups()
                (ebook, created) = models.Ebook.objects.get_or_create(
                    url=git_download_from_yaml_url(
                        self.base_url,
                        metadata._version,
                        edition_name=book_name_prefix,
                        format_=ebook_format
                    ),
                    provider='Github',
                    rights=cc.match_license(metadata.rights),
                    format=ebook_format,
                    edition=edition,
                )
                ebook.set_version(metadata._version)


def git_download_from_yaml_url(yaml_url, version, edition_name='book', format_='epub'):
    # go from https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml
    # to https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/releases/download/v0.0.3/Adventures-of-Huckleberry-Finn.epub
    if yaml_url.endswith('raw/master/metadata.yaml'):
        repo_url = yaml_url[0:-24]
        #print (repo_url, version, edition_name)
        ebook_url = repo_url + 'releases/download/' + version + '/' + edition_name + '.' + format_
        return ebook_url


def release_from_tag(repo, tag_name):
    """Get a release by tag name.
    release_from_tag() returns a release with the specified tag,
    while release() returns a release with the specified release id
    :param str tag_name: (required) name of tag
    :returns: :class:`Release <github3.repos.release.Release>`
    """
    # release_from_tag adapted from
    # https://github.com/sigmavirus24/github3.py/blob/38de787e465bffc63da73d23dc51f50d86dc903d/github3/repos/repo.py#L1781-L1793
    url = repo._build_url('releases', 'tags', tag_name, base_url=repo._api)
    json_obj = repo._json(repo._get(url), 200)
    return Release(json_obj, repo) if json_obj else None


def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
    """
    returns a list of (book_type, book_name) for a given GitHub release (specified by
    owner, name, tag). token is a GitHub authorization token -- useful for accessing
    the higher rate limit in the GitHub API
    """
    # map mimetype to file extension
    EBOOK_FORMATS = dict([(v, k) for (k, v) in settings.CONTENT_TYPES.items()])

    if token is not None:
        gh = login(token=token)
    else:
        # anonymous access
        gh = GitHub()
    repo = gh.repository(repo_owner, repo_name)
    release = release_from_tag(repo, tag)

    return [(EBOOK_FORMATS.get(asset.content_type), asset.name)
            for asset in release.iter_assets()
            if EBOOK_FORMATS.get(asset.content_type) is not None]
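
# Illustrative call (repo and tag borrowed from the comments above; this hits
# the live GitHub API):
#
#     for (book_type, book_name) in ebooks_in_github_release(
#             'GITenberg', 'Adventures-of-Huckleberry-Finn_76', 'v0.0.3'):
#         print book_type, book_name  # e.g. 'epub Adventures-of-Huckleberry-Finn.epub'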


def add_by_webpage(url, work=None, user=None):
    edition = None
    scraper = BaseScraper(url)
    loader = BasePandataLoader(url)
    pandata = Pandata()
    pandata.metadata = scraper.metadata
    for metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(metadata, work)
        work = edition.work
    loader.load_ebooks(pandata, edition, user=user)
    return edition if edition else None


def add_by_sitemap(url, maxnum=None):
    editions = []
    for bookdata in scrape_sitemap(url, maxnum=maxnum):
        edition = None
        loader = BasePandataLoader(bookdata.base)
        pandata = Pandata()
        pandata.metadata = bookdata.metadata
        for metadata in pandata.get_edition_list():
            edition = loader.load_from_pandata(metadata, None)
        loader.load_ebooks(pandata, edition)
        if edition:
            editions.append(edition)
    return editions