refactor ebf(url)
parent
d977e70e94
commit
a6b02d387e
|
@ -14,7 +14,6 @@ import requests
|
|||
# django imports
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.files.base import ContentFile
|
||||
from django.core.files.storage import default_storage
|
||||
from django.db import IntegrityError
|
||||
from django.db.models import Sum
|
||||
|
@ -31,7 +30,6 @@ from gitenberg.metadata.pandata import Pandata
|
|||
|
||||
import regluit
|
||||
import regluit.core.isbn
|
||||
from regluit.core.validation import test_file
|
||||
from regluit.marc.models import inverse_marc_rels
|
||||
from regluit.utils.lang import lang_to_language_code
|
||||
|
||||
|
@ -39,6 +37,7 @@ from . import cc
|
|||
from . import models
|
||||
from .parameters import WORK_IDENTIFIERS
|
||||
from .validation import identifier_cleaner, unreverse_name
|
||||
from .models import loader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
request_log = logging.getLogger("requests")
|
||||
|
@ -884,22 +883,6 @@ def edition_for_etype(etype, metadata, default=None):
|
|||
for key in metadata.edition_identifiers.keys():
|
||||
return edition_for_ident(key, metadata.identifiers[key])
|
||||
|
||||
def load_ebookfile(url, etype):
|
||||
'''
|
||||
return a ContentFile if a new ebook has been loaded
|
||||
'''
|
||||
ebfs = models.EbookFile.objects.filter(source=url)
|
||||
if ebfs:
|
||||
return None
|
||||
try:
|
||||
r = requests.get(url)
|
||||
contentfile = ContentFile(r.content)
|
||||
test_file(contentfile, etype)
|
||||
return contentfile
|
||||
except IOError as e:
|
||||
logger.error(u'could not open {}'.format(url))
|
||||
except ValidationError as e:
|
||||
logger.error(u'downloaded {} was not a valid {}'.format(url, etype))
|
||||
|
||||
class BasePandataLoader(object):
|
||||
def __init__(self, url):
|
||||
|
@ -1016,8 +999,8 @@ class BasePandataLoader(object):
|
|||
if url:
|
||||
edition = edition_for_etype(key, metadata, default=default_edition)
|
||||
if edition:
|
||||
contentfile = load_ebookfile(url, key)
|
||||
if contentfile:
|
||||
contentfile, fmt = loader.load_ebookfile(url, key)
|
||||
if contentfile and fmt == key:
|
||||
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
|
||||
path = default_storage.save(contentfile_name, contentfile)
|
||||
ebf = models.EbookFile.objects.create(
|
||||
|
|
|
@ -18,7 +18,7 @@ from oaipmh.metadata import MetadataRegistry, oai_dc_reader
|
|||
from regluit.core import bookloader, cc
|
||||
from regluit.core import models, tasks
|
||||
from regluit.core.bookloader import merge_works
|
||||
from regluit.core.loaders.utils import type_for_url
|
||||
from regluit.core.models.loader import type_for_url
|
||||
from regluit.core.validation import identifier_cleaner, valid_subject
|
||||
|
||||
from . import scrape_language
|
||||
|
|
|
@ -9,7 +9,7 @@ from urllib.parse import urlparse, urljoin
|
|||
import requests
|
||||
|
||||
from regluit.utils.lang import lang_to_language_code
|
||||
from .utils import get_soup
|
||||
from .soup import get_soup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -4,20 +4,19 @@ code for harvesting 'online' ebooks
|
|||
import logging
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.files.base import ContentFile
|
||||
|
||||
from regluit.core.models import (
|
||||
Ebook, EbookFile, path_for_file,
|
||||
)
|
||||
from regluit.core import models
|
||||
from regluit.core.models import loader
|
||||
from regluit.core.parameters import GOOD_PROVIDERS
|
||||
from regluit.core.pdf import staple_pdf
|
||||
|
||||
from .utils import get_soup, type_for_url
|
||||
|
||||
from .soup import get_soup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -40,9 +39,11 @@ class RateLimiter(object):
|
|||
|
||||
rl = RateLimiter()
|
||||
|
||||
def dl_online(ebook, limiter=rl.delay):
|
||||
if ebook.format != 'online':
|
||||
def dl_online(ebook, limiter=rl.delay, format='online'):
|
||||
if ebook.format != format or ebook.provider in DONT_HARVEST:
|
||||
return None, 0
|
||||
if ebook.ebook_files.exists():
|
||||
return ebook.ebook_files.first(), 0
|
||||
for do_harvest, harvester in harvesters(ebook):
|
||||
if do_harvest:
|
||||
for ebf in ebf_if_harvested(ebook.url):
|
||||
|
@ -63,7 +64,13 @@ CMPPROVIDERS = [
|
|||
'editorial.uniagustiniana.edu.co',
|
||||
'monographs.uc.pt',
|
||||
]
|
||||
|
||||
DONT_HARVEST = [
|
||||
'Unglue.it',
|
||||
'Github',
|
||||
'Project Gutenberg',
|
||||
'Google Books',
|
||||
'OpenEdition Books',
|
||||
]
|
||||
|
||||
def harvesters(ebook):
|
||||
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
|
||||
|
@ -109,10 +116,10 @@ def harvesters(ebook):
|
|||
|
||||
|
||||
def ebf_if_harvested(url):
|
||||
onlines = EbookFile.objects.filter(source=url)
|
||||
onlines = models.EbookFile.objects.filter(source=url)
|
||||
if onlines:
|
||||
return onlines
|
||||
return EbookFile.objects.none()
|
||||
return models.EbookFile.objects.none()
|
||||
|
||||
|
||||
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
||||
|
@ -122,75 +129,66 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
|||
logger.info('making %s' % url)
|
||||
|
||||
# check to see if url already harvested
|
||||
new_prev = []
|
||||
for ebf in ebf_if_harvested(url):
|
||||
new_ebf = EbookFile.objects.create(
|
||||
if ebf.ebook == ebook:
|
||||
return ebf, 0
|
||||
new_ebf = models.EbookFile.objects.create(
|
||||
edition=ebf.edition,
|
||||
format=ebf.format,
|
||||
file=ebf.file,
|
||||
source=ebook.url,
|
||||
ebook=ebook,
|
||||
)
|
||||
new_prev.append(new_ebf)
|
||||
if new_prev:
|
||||
logger.info("Previously harvested")
|
||||
return new_prev[0], len(new_prev)
|
||||
return new_ebf, 0
|
||||
|
||||
try:
|
||||
if method == 'POST':
|
||||
response = requests.post(url, headers={"User-Agent": user_agent})
|
||||
|
||||
dl_cf, fmt = loader.load_ebookfile(url, ebook.format, user_agent=user_agent, method=method)
|
||||
if dl_cf:
|
||||
return make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size)
|
||||
else:
|
||||
response = requests.get(url, headers={"User-Agent": user_agent})
|
||||
except requests.exceptions.SSLError:
|
||||
logger.error('bad certificate? for %s', url)
|
||||
return None, 0
|
||||
if response.status_code == 200:
|
||||
filesize = int(response.headers.get("Content-Length", 0))
|
||||
filesize = filesize if filesize else None
|
||||
logger.debug(response.headers.get('content-type', ''))
|
||||
format = type_for_url(url,
|
||||
content_type=response.headers.get('content-type', ''),
|
||||
disposition=response.headers.get('content-disposition', ''))
|
||||
if format != 'online':
|
||||
return make_harvested_ebook(response.content, ebook, format, filesize=filesize)
|
||||
else:
|
||||
logger.warning('download format %s for %s is not ebook', format, url)
|
||||
else:
|
||||
logger.warning('couldn\'t get %s', url)
|
||||
logger.warning('download format %s for %s is not ebook', ebook.format, url)
|
||||
return None, 0
|
||||
|
||||
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False):
|
||||
pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers)
|
||||
if not pdffile:
|
||||
return None, 0
|
||||
return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
|
||||
return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf')
|
||||
|
||||
def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||
if not filesize:
|
||||
filesize = len(content)
|
||||
new_ebf = EbookFile.objects.create(
|
||||
new_ebf = models.EbookFile.objects.create(
|
||||
edition=ebook.edition,
|
||||
format=format,
|
||||
source=ebook.url,
|
||||
)
|
||||
try:
|
||||
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(content))
|
||||
new_ebf.file.save(models.path_for_file(new_ebf, None), content)
|
||||
new_ebf.save()
|
||||
except MemoryError: #huge pdf files cause problems here
|
||||
logger.error("memory error saving ebook file for %s", ebook.url)
|
||||
new_ebf.delete()
|
||||
return None, 0
|
||||
|
||||
new_ebook = Ebook.objects.create(
|
||||
if ebook.format == "online":
|
||||
harvested_ebook = models.Ebook.objects.create(
|
||||
edition=ebook.edition,
|
||||
format=format,
|
||||
provider='Unglue.it',
|
||||
url=new_ebf.file.url,
|
||||
rights=ebook.rights,
|
||||
filesize=filesize,
|
||||
filesize=filesize if filesize < 2147483647 else 2147483647, # largest safe integer
|
||||
version_label=ebook.version_label,
|
||||
version_iter=ebook.version_iter,
|
||||
)
|
||||
new_ebf.ebook = new_ebook
|
||||
else:
|
||||
if not ebook.filesize:
|
||||
ebook.filesize = filesize if filesize < 2147483647 else 2147483647
|
||||
ebook.save()
|
||||
harvested_ebook = ebook
|
||||
|
||||
new_ebf.ebook = harvested_ebook
|
||||
new_ebf.save()
|
||||
return new_ebf, 1
|
||||
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
import logging
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
# Module-level logger: get_soup logs on every failure path.
logger = logging.getLogger(__name__)


def get_soup(url, user_agent=settings.USER_AGENT):
    """Fetch ``url`` and return its content parsed with BeautifulSoup.

    The request is sent with ``user_agent`` as the User-Agent header. If
    requests rejects the URL for having no scheme, the fetch is retried
    with ``http://`` prepended.

    When the parsed document has no ``<base>`` element, one pointing at the
    final response URL is appended to ``<head>`` (if there is a head).

    Returns the soup on a 200 response; returns None on a connection error
    or any other status code.
    """
    headers = {"User-Agent": user_agent}
    try:
        try:
            response = requests.get(url, headers=headers)
        except requests.exceptions.MissingSchema:
            # Retry inside the outer try so that a connection failure on the
            # second attempt is reported the same way as on the first.
            response = requests.get('http://%s' % url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        logger.error("Connection refused for %s", url)
        logger.error(e)
        return None

    if response.status_code != 200:
        logger.error('%s returned code %s', url, response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # make sure document has a base
    if not soup.find('base'):
        obj = soup.find('head')
        if obj:
            obj.append(soup.new_tag("base", href=response.url))
        else:
            logger.error('No head for %s', url)
    return soup
|
|
@ -23,6 +23,7 @@ class LoaderTests(TestCase):
|
|||
self.assertTrue(dropbox_ebf.ebook.filesize)
|
||||
|
||||
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
|
||||
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
|
||||
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition,
|
||||
provider='jbe-platform.com')
|
||||
jbe_ebf, new_ebf = dl_online(jbe_ebook)
|
||||
self.assertTrue(jbe_ebf.ebook.filesize)
|
||||
|
|
|
@ -10,6 +10,7 @@ import requests
|
|||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
from regluit.api.crosswalks import inv_relator_contrib
|
||||
from regluit.bisac.models import BisacHeading
|
||||
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
||||
|
@ -18,6 +19,8 @@ from regluit.core.models import (
|
|||
Ebook, Edition, Identifier, Subject, Work,
|
||||
)
|
||||
|
||||
from .soup import get_soup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def UnicodeDictReader(utf8_data, **kwargs):
|
||||
|
@ -41,29 +44,6 @@ def utf8_general_ci_norm(s):
|
|||
s1 = unicodedata.normalize('NFD', s)
|
||||
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
|
||||
|
||||
def get_soup(url, user_agent=settings.USER_AGENT):
|
||||
try:
|
||||
response = requests.get(url, headers={"User-Agent": user_agent})
|
||||
except requests.exceptions.MissingSchema:
|
||||
response = requests.get('http://%s' % url, headers={"User-Agent": user_agent})
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
logger.error("Connection refused for %s", url)
|
||||
logger.error(e)
|
||||
return None
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.content, 'lxml')
|
||||
|
||||
# make sure document has a base
|
||||
if not soup.find('base'):
|
||||
obj = soup.find('head')
|
||||
if obj:
|
||||
obj.append(soup.new_tag("base", href=response.url))
|
||||
else:
|
||||
logger.error('No head for %s', url)
|
||||
return soup
|
||||
else:
|
||||
logger.error('%s returned code %s', url, response.status_code)
|
||||
return None
|
||||
|
||||
def get_authors(book):
|
||||
authors = []
|
||||
|
@ -378,89 +358,3 @@ def ids_from_urls(url):
|
|||
ids[ident] = id_match.group('id')
|
||||
return ids
|
||||
|
||||
def type_for_url(url, content_type=None, force=False, disposition=''):
|
||||
url_disp = url + disposition
|
||||
if not url:
|
||||
return ''
|
||||
|
||||
# check to see if we already know
|
||||
for ebook in Ebook.objects.filter(url=url):
|
||||
if ebook.format != 'online':
|
||||
return ebook.format
|
||||
|
||||
if not force:
|
||||
if url.find('books.openedition.org') >= 0:
|
||||
return 'online'
|
||||
if content_type:
|
||||
ct = content_type
|
||||
else:
|
||||
ct, disposition = contenttyper.calc_type(url)
|
||||
url_disp = url + disposition
|
||||
binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
|
||||
if re.search("pdf", ct):
|
||||
return "pdf"
|
||||
elif binary_type and re.search("pdf", url_disp, flags=re.I):
|
||||
return "pdf"
|
||||
elif binary_type and re.search("epub", url_disp, flags=re.I):
|
||||
return "epub"
|
||||
elif binary_type and re.search("mobi", url_disp, flags=re.I):
|
||||
return "mobi"
|
||||
elif re.search("text/plain", ct):
|
||||
return "text"
|
||||
elif re.search("text/html", ct):
|
||||
if url.find('oapen.org/view') >= 0:
|
||||
return "html"
|
||||
return "online"
|
||||
elif re.search("epub", ct):
|
||||
return "epub"
|
||||
elif re.search("mobi", ct):
|
||||
return "mobi"
|
||||
elif ct == '404':
|
||||
return ct
|
||||
# no content-type header!
|
||||
elif ct == '' and re.search("epub", url_disp, flags=re.I):
|
||||
return "epub"
|
||||
elif ct == '' and re.search("pdf", url_disp, flags=re.I):
|
||||
return "pdf"
|
||||
elif ct == '' and re.search("mobi", url_disp, flags=re.I):
|
||||
return "mobi"
|
||||
|
||||
return "other"
|
||||
|
||||
class ContentTyper(object):
|
||||
""" """
|
||||
def __init__(self):
|
||||
self.last_call = dict()
|
||||
|
||||
def content_type(self, url):
|
||||
try:
|
||||
r = requests.head(url, allow_redirects=True)
|
||||
if r.status_code == 405:
|
||||
r = requests.get(url)
|
||||
elif r.status_code == 404:
|
||||
logger.error('File not found (404) for %s', url)
|
||||
return '404', ''
|
||||
return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
|
||||
except:
|
||||
return '', ''
|
||||
|
||||
def calc_type(self, url):
|
||||
logger.info(url)
|
||||
delay = 1
|
||||
# is there a delay associated with the url
|
||||
netloc = urlparse(url).netloc
|
||||
|
||||
# wait if necessary
|
||||
last_call = self.last_call.get(netloc)
|
||||
if last_call is not None:
|
||||
now = time.time()
|
||||
min_time_next_call = last_call + delay
|
||||
if min_time_next_call > now:
|
||||
time.sleep(min_time_next_call-now)
|
||||
|
||||
self.last_call[netloc] = time.time()
|
||||
|
||||
# compute the content-type
|
||||
return self.content_type(url)
|
||||
|
||||
contenttyper = ContentTyper()
|
||||
|
|
|
@ -61,6 +61,7 @@ from regluit.core.parameters import (
|
|||
THANKED,
|
||||
OFFER_CHOICES,
|
||||
ACQ_CHOICES,
|
||||
GOOD_PROVIDERS,
|
||||
)
|
||||
from regluit.core.epub import personalize, ungluify, ask_epub
|
||||
from regluit.core.pdf import ask_pdf, pdf_append
|
||||
|
@ -79,7 +80,6 @@ from .bibmodels import (
|
|||
EbookFile,
|
||||
Edition,
|
||||
EditionNote,
|
||||
good_providers,
|
||||
Identifier,
|
||||
path_for_file,
|
||||
Publisher,
|
||||
|
@ -893,9 +893,9 @@ class Campaign(models.Model):
|
|||
def make_mobis(self):
|
||||
# make archive files for ebooks, make mobi files for epubs
|
||||
versions = set()
|
||||
for ebook in self.work.ebooks().filter(provider__in=good_providers, format='mobi'):
|
||||
for ebook in self.work.ebooks().filter(provider__in=GOOD_PROVIDERS, format='mobi'):
|
||||
versions.add(ebook.version_label)
|
||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers, format='epub'):
|
||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS, format='epub'):
|
||||
if not ebook.version_label in versions:
|
||||
# now make the mobi file
|
||||
ebf = ebook.get_archive_ebf()
|
||||
|
@ -912,7 +912,7 @@ class Campaign(models.Model):
|
|||
ebf.file.open()
|
||||
to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook})
|
||||
format_versions.append(format_version)
|
||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers):
|
||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS):
|
||||
format_version = '{}_{}'.format(ebook.format, ebook.version_label)
|
||||
if ebook.format in ('pdf', 'epub') and not format_version in format_versions:
|
||||
to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook})
|
||||
|
@ -1018,6 +1018,7 @@ class Campaign(models.Model):
|
|||
provider="Unglue.it",
|
||||
url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]),
|
||||
version_label='unglued',
|
||||
filesize=ebf.file.size,
|
||||
)
|
||||
old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter(
|
||||
edition=self.work.preferred_edition,
|
||||
|
|
|
@ -35,8 +35,8 @@ from regluit.core import mobi
|
|||
import regluit.core.cc as cc
|
||||
from regluit.core.epub import test_epub
|
||||
from regluit.core.links import id_url
|
||||
from regluit.core.loaders.harvest import dl_online
|
||||
from regluit.core.validation import valid_subject
|
||||
|
||||
from regluit.core.parameters import (
|
||||
AGE_LEVEL_CHOICES,
|
||||
BORROWED,
|
||||
|
@ -56,7 +56,6 @@ from regluit.core.parameters import (
|
|||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
good_providers = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')
|
||||
|
||||
def id_for(obj, type):
|
||||
if not obj.pk:
|
||||
|
@ -1143,7 +1142,7 @@ class EbookFile(models.Model):
|
|||
edition=self.edition,
|
||||
format='mobi',
|
||||
asking=self.asking,
|
||||
source=self.file.url
|
||||
source=self.file.url,
|
||||
)
|
||||
|
||||
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
|
||||
|
@ -1157,6 +1156,7 @@ class EbookFile(models.Model):
|
|||
rights=self.ebook.rights,
|
||||
version_label=self.ebook.version_label,
|
||||
version_iter=self.ebook.version_iter,
|
||||
filesize=mobi_cf.size,
|
||||
)
|
||||
new_mobi_ebf.ebook = new_ebook
|
||||
new_mobi_ebf.save()
|
||||
|
@ -1205,39 +1205,14 @@ class Ebook(models.Model):
|
|||
return ebf.file
|
||||
|
||||
def get_archive_ebf(self): # returns an ebf
|
||||
if not self.ebook_files.filter(asking=False).exists():
|
||||
if not self.provider in good_providers:
|
||||
if self.ebook_files.filter(asking=False):
|
||||
ebf = self.ebook_files.filter(asking=False).last()
|
||||
elif EbookFile.objects.filter(source=self.url, format=self.format):
|
||||
ebf = self.ebook_files.filter(asking=False).last()
|
||||
else:
|
||||
ebf, num = dl_online(self, format=self.format)
|
||||
if not ebf:
|
||||
return None
|
||||
try:
|
||||
r = requests.get(self.url)
|
||||
if r.status_code == 200:
|
||||
self.filesize = len(r.content)
|
||||
if self.save:
|
||||
self.filesize = self.filesize if self.filesize < 2147483647 else 2147483647 # largest safe positive integer
|
||||
self.save()
|
||||
ebf = EbookFile.objects.create(
|
||||
edition=self.edition,
|
||||
ebook=self,
|
||||
format=self.format,
|
||||
source=self.url
|
||||
)
|
||||
ebf.file.save(path_for_file(ebf, None), ContentFile(r.content))
|
||||
ebf.file.close()
|
||||
ebf.save()
|
||||
return ebf
|
||||
else:
|
||||
logging.error('Bad link error: {}'.format(self.url))
|
||||
except IOError:
|
||||
logger.error(u'could not open {}'.format(self.url))
|
||||
else:
|
||||
ebf = self.ebook_files.filter(asking=False).order_by('-created')[0]
|
||||
if not self.filesize:
|
||||
try:
|
||||
self.filesize = ebf.file.size
|
||||
self.save()
|
||||
except ClientError:
|
||||
# error thrown when the can't access the S3 bucket
|
||||
pass
|
||||
return ebf
|
||||
|
||||
def set_provider(self):
|
||||
|
|
|
@ -0,0 +1,147 @@
|
|||
import logging
|
||||
import re
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.apps import apps
|
||||
from django.conf import settings
|
||||
from django.core.files.base import ContentFile
|
||||
from django.forms import ValidationError
|
||||
|
||||
from regluit.core.validation import test_file
|
||||
from regluit.core import models
|
||||
#from . import Ebook, EbookFile
|
||||
|
||||
#Ebook = apps.get_model('core', 'Ebook')
|
||||
#EbookFile = apps.get_model('core', 'EbookFile')
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def type_for_url(url, content_type=None, force=False, disposition=''):
    """Guess the ebook format served at ``url``.

    Args:
        url: the URL to classify; a falsy value (``''`` or ``None``) yields ``''``.
        content_type: Content-Type header value if already known. When it is
            falsy, the module-level ``contenttyper`` is asked for the headers.
        force: when false, any URL containing ``books.openedition.org`` is
            reported as ``'online'`` without looking at headers.
        disposition: Content-Disposition header value; it is searched
            together with the URL for a format hint.

    Returns:
        One of ``'pdf'``, ``'epub'``, ``'mobi'``, ``'text'``, ``'html'``,
        ``'online'``, ``'404'``, ``'other'``, ``''`` for an empty url, or the
        format already recorded on an Ebook with this url.
    """
    # Guard first: concatenating below would raise TypeError for url=None.
    if not url:
        return ''
    url_disp = url + (disposition or '')

    # check to see if we already know
    for ebook in models.Ebook.objects.filter(url=url):
        if ebook.format != 'online':
            return ebook.format

    if not force:
        if url.find('books.openedition.org') >= 0:
            return 'online'
    if content_type:
        ct = content_type
    else:
        ct, disposition = contenttyper.calc_type(url)
        url_disp = url + disposition
    # generic binary types say nothing about format; fall back on url/disposition
    binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
    if re.search("pdf", ct):
        return "pdf"
    elif binary_type and re.search("pdf", url_disp, flags=re.I):
        return "pdf"
    elif binary_type and re.search("epub", url_disp, flags=re.I):
        return "epub"
    elif binary_type and re.search("mobi", url_disp, flags=re.I):
        return "mobi"
    elif re.search("text/plain", ct):
        return "text"
    elif re.search("text/html", ct):
        if url.find('oapen.org/view') >= 0:
            return "html"
        return "online"
    elif re.search("epub", ct):
        return "epub"
    elif re.search("mobi", ct):
        return "mobi"
    elif ct == '404':
        return ct
    # no content-type header!
    elif ct == '' and re.search("epub", url_disp, flags=re.I):
        return "epub"
    elif ct == '' and re.search("pdf", url_disp, flags=re.I):
        return "pdf"
    elif ct == '' and re.search("mobi", url_disp, flags=re.I):
        return "mobi"

    return "other"
|
||||
|
||||
class ContentTyper(object):
    """Fetch content-type headers for URLs, spacing out calls to the same host."""

    def __init__(self):
        # netloc -> time.time() recorded at the latest calc_type call for it
        self.last_call = dict()

    def content_type(self, url):
        """Return ``(content_type, content_disposition)`` header values for ``url``.

        A HEAD request (following redirects) is tried first; on a 405
        response a GET is made instead. A 404 yields ``('404', '')``.
        Any failure while requesting yields ``('', '')``.
        """
        try:
            r = requests.head(url, allow_redirects=True)
            if r.status_code == 405:
                r = requests.get(url)
            elif r.status_code == 404:
                logger.error('File not found (404) for %s', url)
                return '404', ''
            return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
        except Exception as e:
            # Best effort: stay lenient about what can go wrong on the network,
            # but no longer swallow KeyboardInterrupt/SystemExit.
            logger.warning('could not get content type for %s: %s', url, e)
            return '', ''

    def calc_type(self, url):
        """Return ``content_type(url)``, first waiting so that calls to the
        same netloc are at least ``delay`` seconds apart.
        """
        # Imported here because this module's import block does not import time.
        import time

        logger.info(url)
        delay = 1
        # is there a delay associated with the url
        netloc = urlparse(url).netloc

        # wait if necessary
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            now = time.time()
            min_time_next_call = last_call + delay
            if min_time_next_call > now:
                time.sleep(min_time_next_call - now)

        self.last_call[netloc] = time.time()

        # compute the content-type
        return self.content_type(url)


contenttyper = ContentTyper()
|
||||
|
||||
def load_ebookfile(url, format, user_agent=settings.USER_AGENT, method='GET'):
    '''
    Download url and return (ContentFile, format) if a new ebook has been loaded.

    url: location to download; nothing is fetched if an EbookFile with this
        source already exists.
    format: expected format. Unless it is 'online', the format detected from
        the response headers must equal it.
    user_agent: value sent as the User-Agent header.
    method: 'POST' to post to the url; anything else does a GET.

    Always returns a 2-tuple, so callers can unpack it:
        (ContentFile, detected_format) on success,
        (None, detected_format) if the detected format is wrong or the
            downloaded content fails validation,
        (None, '') if already loaded, the request failed, or the status
            was not 200.
    '''
    ebfs = models.EbookFile.objects.filter(source=url)
    if ebfs:
        return None, ''

    headers = {"User-Agent": user_agent}
    try:
        if method == 'POST':
            response = requests.post(url, headers=headers)
        else:
            response = requests.get(url, headers=headers)
    except requests.exceptions.SSLError:
        logger.error('bad certificate? for %s', url)
        return None, ''
    except IOError:
        logger.error('could not open %s', url)
        return None, ''

    if response.status_code != 200:
        logger.warning('couldn\'t get %s', url)
        return None, ''

    logger.debug(response.headers.get('content-type', ''))
    resp_format = type_for_url(
        url,
        content_type=response.headers.get('content-type', ''),
        disposition=response.headers.get('content-disposition', ''),
    )
    if resp_format == 'online' or (format != 'online' and resp_format != format):
        logger.warning('response format %s for %s is not correct', resp_format, url)
        return None, resp_format

    contentfile = ContentFile(response.content)
    try:
        test_file(contentfile, resp_format)
    except ValidationError:
        # resp_format is what was validated, so that is what the message names
        logger.error('downloaded %s was not a valid %s', url, resp_format)
        return None, resp_format
    return contentfile, resp_format
|
||||
|
||||
|
|
@ -48,8 +48,5 @@ WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab')
|
|||
|
||||
ID_CHOICES_MAP = dict(ID_CHOICES)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
GOOD_PROVIDERS = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')
|
||||
|
||||
|
|
|
@ -4,8 +4,9 @@ from io import StringIO
|
|||
|
||||
|
||||
from regluit.core.facets import BaseFacet
|
||||
from regluit.core.models import Work, good_providers
|
||||
from regluit.core.models import Work
|
||||
from regluit.api.onix import onix_feed
|
||||
from regluit.core.parameters import GOOD_PROVIDERS
|
||||
|
||||
from .models import Target
|
||||
|
||||
|
@ -45,7 +46,7 @@ def get_target_facet(target, start=datetime(1900,1,1), new=False):
|
|||
editions__ebooks__created__gt = start,
|
||||
identifiers__type="isbn",
|
||||
editions__ebooks__format__in = formats,
|
||||
editions__ebooks__provider__in = good_providers,
|
||||
editions__ebooks__provider__in = GOOD_PROVIDERS,
|
||||
).distinct().order_by('-featured')
|
||||
|
||||
model_filters = {"Ebook": format_filter, "Edition": edition_format_filter}
|
||||
|
|
|
@ -494,8 +494,9 @@ def manage_ebooks(request, edition_id, by=None):
|
|||
ebook_form = EbookForm(data = request.POST, files=request.FILES,)
|
||||
if ebook_form.is_valid():
|
||||
if ebook_form.cleaned_data.get('file', None):
|
||||
file=ebook_form.cleaned_data['file']
|
||||
new_ebf = models.EbookFile.objects.create(
|
||||
file=ebook_form.cleaned_data['file'],
|
||||
file=file,
|
||||
format=ebook_form.cleaned_data['format'],
|
||||
edition=edition,
|
||||
)
|
||||
|
@ -504,6 +505,8 @@ def manage_ebooks(request, edition_id, by=None):
|
|||
ebook_form.instance.save()
|
||||
new_ebf.ebook = ebook_form.instance
|
||||
new_ebf.save()
|
||||
new_ebf.ebook.filesize = new_ebf.file.size
|
||||
new_ebf.ebook.save()
|
||||
else:
|
||||
ebook_form.save()
|
||||
ebook_form.instance.set_next_iter()
|
||||
|
|
Loading…
Reference in New Issue