refactor ebf(url)

pull/94/head
eric 2020-08-15 20:21:56 -04:00
parent d977e70e94
commit a6b02d387e
13 changed files with 261 additions and 230 deletions

View File

@ -14,7 +14,6 @@ import requests
# django imports
from django.conf import settings
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import IntegrityError
from django.db.models import Sum
@ -31,7 +30,6 @@ from gitenberg.metadata.pandata import Pandata
import regluit
import regluit.core.isbn
from regluit.core.validation import test_file
from regluit.marc.models import inverse_marc_rels
from regluit.utils.lang import lang_to_language_code
@ -39,6 +37,7 @@ from . import cc
from . import models
from .parameters import WORK_IDENTIFIERS
from .validation import identifier_cleaner, unreverse_name
from .models import loader
logger = logging.getLogger(__name__)
request_log = logging.getLogger("requests")
@ -884,22 +883,6 @@ def edition_for_etype(etype, metadata, default=None):
for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
def load_ebookfile(url, etype):
'''
return a ContentFile if a new ebook has been loaded
'''
ebfs = models.EbookFile.objects.filter(source=url)
if ebfs:
return None
try:
r = requests.get(url)
contentfile = ContentFile(r.content)
test_file(contentfile, etype)
return contentfile
except IOError as e:
logger.error(u'could not open {}'.format(url))
except ValidationError as e:
logger.error(u'downloaded {} was not a valid {}'.format(url, etype))
class BasePandataLoader(object):
def __init__(self, url):
@ -1016,8 +999,8 @@ class BasePandataLoader(object):
if url:
edition = edition_for_etype(key, metadata, default=default_edition)
if edition:
contentfile = load_ebookfile(url, key)
if contentfile:
contentfile, fmt = loader.load_ebookfile(url, key)
if contentfile and fmt == key:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile)
ebf = models.EbookFile.objects.create(

View File

@ -18,7 +18,7 @@ from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from regluit.core import bookloader, cc
from regluit.core import models, tasks
from regluit.core.bookloader import merge_works
from regluit.core.loaders.utils import type_for_url
from regluit.core.models.loader import type_for_url
from regluit.core.validation import identifier_cleaner, valid_subject
from . import scrape_language

View File

@ -9,7 +9,7 @@ from urllib.parse import urlparse, urljoin
import requests
from regluit.utils.lang import lang_to_language_code
from .utils import get_soup
from .soup import get_soup
logger = logging.getLogger(__name__)

View File

@ -4,20 +4,19 @@ code for harvesting 'online' ebooks
import logging
import re
import time
from urllib.parse import urlparse, urljoin
from urllib.parse import urljoin
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from regluit.core.models import (
Ebook, EbookFile, path_for_file,
)
from regluit.core import models
from regluit.core.models import loader
from regluit.core.parameters import GOOD_PROVIDERS
from regluit.core.pdf import staple_pdf
from .utils import get_soup, type_for_url
from .soup import get_soup
logger = logging.getLogger(__name__)
@ -40,9 +39,11 @@ class RateLimiter(object):
rl = RateLimiter()
def dl_online(ebook, limiter=rl.delay):
if ebook.format != 'online':
def dl_online(ebook, limiter=rl.delay, format='online'):
if ebook.format != format or ebook.provider in DONT_HARVEST:
return None, 0
if ebook.ebook_files.exists():
return ebook.ebook_files.first(), 0
for do_harvest, harvester in harvesters(ebook):
if do_harvest:
for ebf in ebf_if_harvested(ebook.url):
@ -63,7 +64,13 @@ CMPPROVIDERS = [
'editorial.uniagustiniana.edu.co',
'monographs.uc.pt',
]
DONT_HARVEST = [
'Unglue.it',
'Github',
'Project Gutenberg',
'Google Books',
'OpenEdition Books',
]
def harvesters(ebook):
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
@ -109,10 +116,10 @@ def harvesters(ebook):
def ebf_if_harvested(url):
onlines = EbookFile.objects.filter(source=url)
onlines = models.EbookFile.objects.filter(source=url)
if onlines:
return onlines
return EbookFile.objects.none()
return models.EbookFile.objects.none()
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
@ -122,75 +129,66 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
logger.info('making %s' % url)
# check to see if url already harvested
new_prev = []
for ebf in ebf_if_harvested(url):
new_ebf = EbookFile.objects.create(
if ebf.ebook == ebook:
return ebf, 0
new_ebf = models.EbookFile.objects.create(
edition=ebf.edition,
format=ebf.format,
file=ebf.file,
source=ebook.url,
ebook=ebook,
)
new_prev.append(new_ebf)
if new_prev:
logger.info("Previously harvested")
return new_prev[0], len(new_prev)
return new_ebf, 0
try:
if method == 'POST':
response = requests.post(url, headers={"User-Agent": user_agent})
else:
response = requests.get(url, headers={"User-Agent": user_agent})
except requests.exceptions.SSLError:
logger.error('bad certificate? for %s', url)
return None, 0
if response.status_code == 200:
filesize = int(response.headers.get("Content-Length", 0))
filesize = filesize if filesize else None
logger.debug(response.headers.get('content-type', ''))
format = type_for_url(url,
content_type=response.headers.get('content-type', ''),
disposition=response.headers.get('content-disposition', ''))
if format != 'online':
return make_harvested_ebook(response.content, ebook, format, filesize=filesize)
else:
logger.warning('download format %s for %s is not ebook', format, url)
dl_cf, fmt = loader.load_ebookfile(url, ebook.format, user_agent=user_agent, method=method)
if dl_cf:
return make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size)
else:
logger.warning('couldn\'t get %s', url)
logger.warning('download format %s for %s is not ebook', ebook.format, url)
return None, 0
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False):
pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers)
if not pdffile:
return None, 0
return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf')
def make_harvested_ebook(content, ebook, format, filesize=0):
if not filesize:
filesize = len(content)
new_ebf = EbookFile.objects.create(
new_ebf = models.EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
try:
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(content))
new_ebf.file.save(models.path_for_file(new_ebf, None), content)
new_ebf.save()
except MemoryError: #huge pdf files cause problems here
logger.error("memory error saving ebook file for %s", ebook.url)
new_ebf.delete()
return None, 0
new_ebook = Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = new_ebook
if ebook.format == "online":
harvested_ebook = models.Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize if filesize < 2147483647 else 2147483647, # largest safe integer
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
else:
if not ebook.filesize:
ebook.filesize = filesize if filesize < 2147483647 else 2147483647
ebook.save()
harvested_ebook = ebook
new_ebf.ebook = harvested_ebook
new_ebf.save()
return new_ebf, 1

31
core/loaders/soup.py Normal file
View File

@ -0,0 +1,31 @@
import logging
from bs4 import BeautifulSoup
import requests
from django.conf import settings
# This module imports logging but, as shown, never created the logger that
# get_soup() writes to; without it every error path raised NameError.
logger = logging.getLogger(__name__)


def get_soup(url, user_agent=settings.USER_AGENT):
    """Fetch *url* and return the page parsed with BeautifulSoup ('lxml').

    A url without a scheme is retried once with ``http://`` prepended.
    When the parsed document has no ``<base>`` element, one whose ``href``
    is the final response url is appended to ``<head>``; if there is no
    ``<head>`` either, an error is logged and the soup is returned as is.

    Args:
        url: address of the page to fetch.
        user_agent: value sent in the ``User-Agent`` request header.

    Returns:
        The BeautifulSoup document when the response status is 200;
        ``None`` on a connection error or any other status code (both are
        logged).
    """
    headers = {"User-Agent": user_agent}
    try:
        try:
            response = requests.get(url, headers=headers)
        except requests.exceptions.MissingSchema:
            # The retry stays inside the outer try so that a connection
            # failure on the second attempt is handled as well.
            response = requests.get('http://%s' % url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        logger.error("Connection refused for %s", url)
        logger.error(e)
        return None

    if response.status_code != 200:
        logger.error('%s returned code %s', url, response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'lxml')
    # make sure document has a base
    if not soup.find('base'):
        head = soup.find('head')
        if head:
            head.append(soup.new_tag("base", href=response.url))
        else:
            logger.error('No head for %s', url)
    return soup

View File

@ -23,6 +23,7 @@ class LoaderTests(TestCase):
self.assertTrue(dropbox_ebf.ebook.filesize)
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition,
provider='jbe-platform.com')
jbe_ebf, new_ebf = dl_online(jbe_ebook)
self.assertTrue(jbe_ebf.ebook.filesize)

View File

@ -10,6 +10,7 @@ import requests
from django.conf import settings
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
@ -18,6 +19,8 @@ from regluit.core.models import (
Ebook, Edition, Identifier, Subject, Work,
)
from .soup import get_soup
logger = logging.getLogger(__name__)
def UnicodeDictReader(utf8_data, **kwargs):
@ -41,29 +44,6 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
def get_soup(url, user_agent=settings.USER_AGENT):
try:
response = requests.get(url, headers={"User-Agent": user_agent})
except requests.exceptions.MissingSchema:
response = requests.get('http://%s' % url, headers={"User-Agent": user_agent})
except requests.exceptions.ConnectionError as e:
logger.error("Connection refused for %s", url)
logger.error(e)
return None
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'lxml')
# make sure document has a base
if not soup.find('base'):
obj = soup.find('head')
if obj:
obj.append(soup.new_tag("base", href=response.url))
else:
logger.error('No head for %s', url)
return soup
else:
logger.error('%s returned code %s', url, response.status_code)
return None
def get_authors(book):
authors = []
@ -378,89 +358,3 @@ def ids_from_urls(url):
ids[ident] = id_match.group('id')
return ids
def type_for_url(url, content_type=None, force=False, disposition=''):
url_disp = url + disposition
if not url:
return ''
# check to see if we already know
for ebook in Ebook.objects.filter(url=url):
if ebook.format != 'online':
return ebook.format
if not force:
if url.find('books.openedition.org') >= 0:
return 'online'
if content_type:
ct = content_type
else:
ct, disposition = contenttyper.calc_type(url)
url_disp = url + disposition
binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
if re.search("pdf", ct):
return "pdf"
elif binary_type and re.search("pdf", url_disp, flags=re.I):
return "pdf"
elif binary_type and re.search("epub", url_disp, flags=re.I):
return "epub"
elif binary_type and re.search("mobi", url_disp, flags=re.I):
return "mobi"
elif re.search("text/plain", ct):
return "text"
elif re.search("text/html", ct):
if url.find('oapen.org/view') >= 0:
return "html"
return "online"
elif re.search("epub", ct):
return "epub"
elif re.search("mobi", ct):
return "mobi"
elif ct == '404':
return ct
# no content-type header!
elif ct == '' and re.search("epub", url_disp, flags=re.I):
return "epub"
elif ct == '' and re.search("pdf", url_disp, flags=re.I):
return "pdf"
elif ct == '' and re.search("mobi", url_disp, flags=re.I):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):
self.last_call = dict()
def content_type(self, url):
try:
r = requests.head(url, allow_redirects=True)
if r.status_code == 405:
r = requests.get(url)
elif r.status_code == 404:
logger.error('File not found (404) for %s', url)
return '404', ''
return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
except:
return '', ''
def calc_type(self, url):
logger.info(url)
delay = 1
# is there a delay associated with the url
netloc = urlparse(url).netloc
# wait if necessary
last_call = self.last_call.get(netloc)
if last_call is not None:
now = time.time()
min_time_next_call = last_call + delay
if min_time_next_call > now:
time.sleep(min_time_next_call-now)
self.last_call[netloc] = time.time()
# compute the content-type
return self.content_type(url)
contenttyper = ContentTyper()

View File

@ -61,6 +61,7 @@ from regluit.core.parameters import (
THANKED,
OFFER_CHOICES,
ACQ_CHOICES,
GOOD_PROVIDERS,
)
from regluit.core.epub import personalize, ungluify, ask_epub
from regluit.core.pdf import ask_pdf, pdf_append
@ -79,7 +80,6 @@ from .bibmodels import (
EbookFile,
Edition,
EditionNote,
good_providers,
Identifier,
path_for_file,
Publisher,
@ -893,9 +893,9 @@ class Campaign(models.Model):
def make_mobis(self):
# make archive files for ebooks, make mobi files for epubs
versions = set()
for ebook in self.work.ebooks().filter(provider__in=good_providers, format='mobi'):
for ebook in self.work.ebooks().filter(provider__in=GOOD_PROVIDERS, format='mobi'):
versions.add(ebook.version_label)
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers, format='epub'):
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS, format='epub'):
if not ebook.version_label in versions:
# now make the mobi file
ebf = ebook.get_archive_ebf()
@ -912,7 +912,7 @@ class Campaign(models.Model):
ebf.file.open()
to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook})
format_versions.append(format_version)
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers):
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS):
format_version = '{}_{}'.format(ebook.format, ebook.version_label)
if ebook.format in ('pdf', 'epub') and not format_version in format_versions:
to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook})
@ -1018,6 +1018,7 @@ class Campaign(models.Model):
provider="Unglue.it",
url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]),
version_label='unglued',
filesize=ebf.file.size,
)
old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter(
edition=self.work.preferred_edition,

View File

@ -35,8 +35,8 @@ from regluit.core import mobi
import regluit.core.cc as cc
from regluit.core.epub import test_epub
from regluit.core.links import id_url
from regluit.core.loaders.harvest import dl_online
from regluit.core.validation import valid_subject
from regluit.core.parameters import (
AGE_LEVEL_CHOICES,
BORROWED,
@ -56,7 +56,6 @@ from regluit.core.parameters import (
ImageFile.LOAD_TRUNCATED_IMAGES = True
logger = logging.getLogger(__name__)
good_providers = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')
def id_for(obj, type):
if not obj.pk:
@ -1143,7 +1142,7 @@ class EbookFile(models.Model):
edition=self.edition,
format='mobi',
asking=self.asking,
source=self.file.url
source=self.file.url,
)
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
@ -1157,6 +1156,7 @@ class EbookFile(models.Model):
rights=self.ebook.rights,
version_label=self.ebook.version_label,
version_iter=self.ebook.version_iter,
filesize=mobi_cf.size,
)
new_mobi_ebf.ebook = new_ebook
new_mobi_ebf.save()
@ -1205,40 +1205,15 @@ class Ebook(models.Model):
return ebf.file
def get_archive_ebf(self): # returns an ebf
if not self.ebook_files.filter(asking=False).exists():
if not self.provider in good_providers:
return None
try:
r = requests.get(self.url)
if r.status_code == 200:
self.filesize = len(r.content)
if self.save:
self.filesize = self.filesize if self.filesize < 2147483647 else 2147483647 # largest safe positive integer
self.save()
ebf = EbookFile.objects.create(
edition=self.edition,
ebook=self,
format=self.format,
source=self.url
)
ebf.file.save(path_for_file(ebf, None), ContentFile(r.content))
ebf.file.close()
ebf.save()
return ebf
else:
logging.error('Bad link error: {}'.format(self.url))
except IOError:
logger.error(u'could not open {}'.format(self.url))
if self.ebook_files.filter(asking=False):
ebf = self.ebook_files.filter(asking=False).last()
elif EbookFile.objects.filter(source=self.url, format=self.format):
ebf = self.ebook_files.filter(asking=False).last()
else:
ebf = self.ebook_files.filter(asking=False).order_by('-created')[0]
if not self.filesize:
try:
self.filesize = ebf.file.size
self.save()
except ClientError:
# error thrown when the can't access the S3 bucket
pass
return ebf
ebf, num = dl_online(self, format=self.format)
if not ebf:
return None
return ebf
def set_provider(self):
self.provider = Ebook.infer_provider(self.url)

147
core/models/loader.py Normal file
View File

@ -0,0 +1,147 @@
import logging
import re
import time
from urllib.parse import urlparse

import requests

from django.apps import apps
from django.conf import settings
from django.core.files.base import ContentFile
from django.forms import ValidationError

from regluit.core.validation import test_file
from regluit.core import models
#from . import Ebook, EbookFile
#Ebook = apps.get_model('core', 'Ebook')
#EbookFile = apps.get_model('core', 'EbookFile')
logger = logging.getLogger(__name__)
def type_for_url(url, content_type=None, force=False, disposition=''):
    """Guess the ebook format served at *url*.

    Args:
        url: address to classify.
        content_type: value of the Content-Type header, if already known.
            When falsy, the headers are fetched through the module-level
            ``contenttyper``.
        force: when true, skip the 'books.openedition.org' shortcut that
            otherwise answers 'online' without looking at any headers.
        disposition: value of the Content-Disposition header; it is
            searched together with the url for format hints.

    Returns:
        '' when url is empty or None; the stored format when an Ebook with
        this url and a format other than 'online' already exists; otherwise
        one of 'pdf', 'epub', 'mobi', 'text', 'html', 'online', '404' or
        'other'.
    """
    # Guard before touching url: the concatenation below used to run first
    # and raised TypeError for url=None.
    if not url:
        return ''
    url_disp = url + (disposition or '')

    # check to see if we already know
    for ebook in models.Ebook.objects.filter(url=url):
        if ebook.format != 'online':
            return ebook.format

    if not force and url.find('books.openedition.org') >= 0:
        return 'online'

    if content_type:
        ct = content_type
    else:
        ct, disposition = contenttyper.calc_type(url)
        url_disp = url + disposition

    def hinted(fmt):
        # case-insensitive search of the url plus content-disposition
        return re.search(fmt, url_disp, flags=re.I)

    binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)

    if re.search("pdf", ct):
        return "pdf"
    if binary_type:
        # generic binary content type: rely on the url/disposition hints
        for fmt in ("pdf", "epub", "mobi"):
            if hinted(fmt):
                return fmt
    if re.search("text/plain", ct):
        return "text"
    if re.search("text/html", ct):
        if url.find('oapen.org/view') >= 0:
            return "html"
        return "online"
    if re.search("epub", ct):
        return "epub"
    if re.search("mobi", ct):
        return "mobi"
    if ct == '404':
        return ct
    if ct == '':
        # no content-type header! (note the different hint order here)
        for fmt in ("epub", "pdf", "mobi"):
            if hinted(fmt):
                return fmt
    return "other"
class ContentTyper(object):
    """Look up content-type headers for urls, spacing out calls per host.

    The time of the latest lookup is remembered for each network location;
    ``calc_type`` sleeps as needed so that two lookups for the same
    location are at least ``delay`` seconds apart.
    """

    def __init__(self, delay=1):
        """
        Args:
            delay: minimum number of seconds between two lookups for the
                same network location (defaults to the former fixed 1).
        """
        # netloc -> time.time() of the most recent lookup
        self.last_call = dict()
        self.delay = delay

    def content_type(self, url):
        """Return the (content-type, content-disposition) headers of *url*.

        A HEAD request (following redirects) is tried first; a 405 answer
        triggers a GET instead. A 404 answer yields ``('404', '')``. Any
        error while fetching yields ``('', '')``.
        """
        try:
            r = requests.head(url, allow_redirects=True)
            if r.status_code == 405:
                r = requests.get(url)
            elif r.status_code == 404:
                logger.error('File not found (404) for %s', url)
                return '404', ''
            return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
        except Exception as e:
            # Still best effort, but no longer a bare except, which also
            # swallowed KeyboardInterrupt and SystemExit.
            logger.warning('could not get content type for %s: %s', url, e)
            return '', ''

    def calc_type(self, url):
        """Rate-limited wrapper around ``content_type``; same return value."""
        logger.info(url)
        netloc = urlparse(url).netloc

        # wait if necessary
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            wait = last_call + self.delay - time.time()
            if wait > 0:
                time.sleep(wait)
        self.last_call[netloc] = time.time()

        # compute the content-type
        return self.content_type(url)
contenttyper = ContentTyper()
def load_ebookfile(url, format, user_agent=settings.USER_AGENT, method='GET'):
    '''
    return a ContentFile, format if a new ebook has been loaded

    Args:
        url: address to download from.
        format: the expected format; 'online' accepts whatever
            downloadable format the response turns out to be.
        user_agent: value sent in the User-Agent request header.
        method: 'POST' to post to the url; anything else issues a GET.

    Returns a 2-tuple in every case, so callers can always unpack it:
        (ContentFile, detected format) when the download passed test_file;
        (None, '') when an EbookFile with this source already exists, the
            request failed, or the status code was not 200;
        (None, detected format) when the detected format is 'online', does
            not match the requested one, or the content failed validation.
    '''
    if models.EbookFile.objects.filter(source=url).exists():
        return None, ''

    headers = {"User-Agent": user_agent}
    try:
        if method == 'POST':
            response = requests.post(url, headers=headers)
        else:
            response = requests.get(url, headers=headers)
    except requests.exceptions.SSLError:
        logger.error('bad certificate? for %s', url)
        return None, ''
    except IOError:
        # was 'could not open %': an incomplete placeholder that made the
        # logging call itself fail to format
        logger.error('could not open %s', url)
        return None, ''

    if response.status_code != 200:
        logger.warning('couldn\'t get %s', url)
        return None, ''

    logger.debug(response.headers.get('content-type', ''))
    resp_format = type_for_url(
        url,
        content_type=response.headers.get('content-type', ''),
        disposition=response.headers.get('content-disposition', ''),
    )
    if resp_format == 'online' or (format != 'online' and resp_format != format):
        logger.warning('response format %s for %s is not correct', resp_format, url)
        return None, resp_format

    contentfile = ContentFile(response.content)
    try:
        test_file(contentfile, resp_format)
    except ValidationError:
        # report the format that was actually validated
        logger.error('downloaded %s was not a valid %s', url, resp_format)
        # the original line lacked `return`, so the function fell through
        # and returned a bare None that callers could not unpack
        return None, resp_format
    return contentfile, resp_format

View File

@ -48,8 +48,5 @@ WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab')
ID_CHOICES_MAP = dict(ID_CHOICES)
GOOD_PROVIDERS = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')

View File

@ -4,8 +4,9 @@ from io import StringIO
from regluit.core.facets import BaseFacet
from regluit.core.models import Work, good_providers
from regluit.core.models import Work
from regluit.api.onix import onix_feed
from regluit.core.parameters import GOOD_PROVIDERS
from .models import Target
@ -45,7 +46,7 @@ def get_target_facet(target, start=datetime(1900,1,1), new=False):
editions__ebooks__created__gt = start,
identifiers__type="isbn",
editions__ebooks__format__in = formats,
editions__ebooks__provider__in = good_providers,
editions__ebooks__provider__in = GOOD_PROVIDERS,
).distinct().order_by('-featured')
model_filters = {"Ebook": format_filter, "Edition": edition_format_filter}

View File

@ -494,8 +494,9 @@ def manage_ebooks(request, edition_id, by=None):
ebook_form = EbookForm(data = request.POST, files=request.FILES,)
if ebook_form.is_valid():
if ebook_form.cleaned_data.get('file', None):
file=ebook_form.cleaned_data['file']
new_ebf = models.EbookFile.objects.create(
file=ebook_form.cleaned_data['file'],
file=file,
format=ebook_form.cleaned_data['format'],
edition=edition,
)
@ -504,6 +505,8 @@ def manage_ebooks(request, edition_id, by=None):
ebook_form.instance.save()
new_ebf.ebook = ebook_form.instance
new_ebf.save()
new_ebf.ebook.filesize = new_ebf.file.size
new_ebf.ebook.save()
else:
ebook_form.save()
ebook_form.instance.set_next_iter()