add ebook loading code

pull/43/head
eric 2017-08-07 16:17:00 -04:00
parent 389e9a1749
commit 489790fa2f
2 changed files with 102 additions and 9 deletions

View File

@ -5,6 +5,10 @@ import json
import logging
import re
import requests
import urllib2
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from regluit.core.validation import test_file
from datetime import timedelta
from xml.etree import ElementTree
@ -14,8 +18,10 @@ from urlparse import (urljoin, urlparse)
# django imports
from django.conf import settings
from django.core.files.base import ContentFile
from django_comments.models import Comment
from django.db import IntegrityError
from django.forms import ValidationError
from github3 import (login, GitHub)
from github3.repos.release import Release
@ -770,6 +776,42 @@ def load_from_yaml(yaml_url, test_mode=False):
loader.load_ebooks(metadata, edition, test_mode)
return edition.work.id if edition else None
def edition_for_ident(id_type, id_value):
for ident in models.Identifier.objects.filter(type=id_type, value=id_value):
return ident.edition if ident.edition else ident.work.editions[0]
def edition_for_etype(etype, metadata, default=None):
'''
assumes the metadata contains the isbn_etype attributes, and that the editions have been created.
etype is 'epub', 'pdf', etc.
'''
isbn = metadata.metadata.get('isbn_{}'.format(etype), None)
if not isbn:
isbn = metadata.metadata.get('isbn_electronic', None)
if isbn:
return edition_for_ident('isbn', isbn)
else:
if default:
return default
# just return some edition
for key in metadata.identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def load_ebookfile(url, etype):
try:
r = requests.get(url)
ebookfile = ContentFile(r.content)
test_file(ebookfile, etype)
return ebookfile
except IOError, e:
logger.error(u'could not open {}'.format(url))
except ValidationError, e:
logger.error(u'downloaded {} was not a valid {}'.format(url, etype))
class BasePandataLoader(object):
def __init__(self, url):
self.base_url = url
@ -810,7 +852,7 @@ class BasePandataLoader(object):
if not edition:
edition = models.Edition.objects.create(title=metadata.title, work=work)
for (identifier, id_code, value) in new_ids:
models.Identifier.set(type=id_code, value=value, edition=edition if id_code in EDITION_IDENTIFIERS else None, work=work)
models.Identifier.set(type=id_code, value=value, edition=edition if id_code not in WORK_IDENTIFIERS else None, work=work)
if metadata.publisher: #always believe yaml
edition.set_publisher(metadata.publisher)
if metadata.publication_date: #always believe yaml
@ -851,7 +893,36 @@ class BasePandataLoader(object):
return edition
def load_ebooks(self, metadata, edition, test_mode=False):
pass
default_edition = edition
for key in ['epub', 'pdf', 'mobi']:
url = metadata.metadata.get('download_url_{}'.format(key), None)
if url:
edition = edition_for_etype(key, metadata, default=default_edition)
if edition:
contentfile = load_ebookfile(url, key)
if contentfile:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile)
lic = MATCH_LICENSE.search(metadata.rights_url)
license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
ebf = models.EbookFile.objects.create(
format=key,
edition=edition,
)
ebf.file.save(contentfile_name, contentfile)
ebf.file.close()
(ebook, created) = models.Ebook.objects.get_or_create(
url=ebf.file.url,
provider='Unglue.it',
rights=license,
format=key,
edition=edition,
filesize=contentfile.size,
active=False,
)
ebf.ebook = ebook
ebf.save()
class GithubLoader(BasePandataLoader):
def load_ebooks(self, metadata, edition, test_mode=False):
@ -877,14 +948,18 @@ class GithubLoader(BasePandataLoader):
ebooks_in_release = ebooks_in_github_release(repo_owner, repo_name, repo_tag, token=token)
for (ebook_format, ebook_name) in ebooks_in_release:
(book_name_prefix, _ ) = re.search(r'(.*)\.([^\.]*)$', ebook_name).groups()
(ebook, created)= models.Ebook.objects.get_or_create(
url=git_download_from_yaml_url(self.base_url,metadata._version,edition_name=book_name_prefix,
format_= ebook_format),
(book_name_prefix, _ ) = re.search(r'(.*)\.([^\.]*)$', ebook_name).groups()
(ebook, created) = models.Ebook.objects.get_or_create(
url=git_download_from_yaml_url(
self.base_url,
metadata._version,
edition_name=book_name_prefix,
format_= ebook_format
),
provider='Github',
rights = cc.match_license(metadata.rights),
format = ebook_format,
edition = edition,
rights=cc.match_license(metadata.rights),
format=ebook_format,
edition=edition,
)
ebook.set_version(metadata._version)
@ -938,6 +1013,7 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
if EBOOK_FORMATS.get(asset.content_type) is not None]
def add_by_webpage(url, work=None):
edition = None
scraper = BaseScraper(url)
loader = BasePandataLoader(url)
pandata = Pandata()
@ -945,6 +1021,7 @@ def add_by_webpage(url, work=None):
for metadata in pandata.get_edition_list():
edition = loader.load_from_pandata(metadata, work)
work = edition.work
loader.load_ebooks(pandata, edition)
return edition if edition else None

View File

@ -12,6 +12,7 @@ from regluit.core.validation import identifier_cleaner
logger = logging.getLogger(__name__)
CONTAINS_COVER = re.compile('cover')
CONTAINS_CC = re.compile('creativecommons.org')
class BaseScraper(object):
def __init__(self, url):
@ -32,6 +33,8 @@ class BaseScraper(object):
self.get_pubdate()
self.get_authors()
self.get_cover()
self.get_downloads()
self.get_license()
if not self.metadata.get('title', None):
self.set('title', '!!! missing title !!!')
if not self.metadata.get('language', None):
@ -164,3 +167,16 @@ class BaseScraper(object):
cover_uri = img[0].get('src', None)
if cover_uri:
self.set('covers', [{'image_url': urljoin(self.base, cover_uri)}])
def get_downloads(self):
for dl_type in ['epub', 'mobi', 'pdf']:
dl_meta = 'citation_{}_url'.format(dl_type)
value = self.check_metas([dl_meta])
if value:
self.set('download_url_{}'.format(dl_type), value)
def get_license(self):
'''only looks for cc licenses'''
links = self.doc.find_all(href=CONTAINS_CC)
for link in links:
self.set('rights_url', link['href'])