refactor ebf(url)
parent
d977e70e94
commit
a6b02d387e
|
@ -14,7 +14,6 @@ import requests
|
||||||
# django imports
|
# django imports
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.files.base import ContentFile
|
|
||||||
from django.core.files.storage import default_storage
|
from django.core.files.storage import default_storage
|
||||||
from django.db import IntegrityError
|
from django.db import IntegrityError
|
||||||
from django.db.models import Sum
|
from django.db.models import Sum
|
||||||
|
@ -31,7 +30,6 @@ from gitenberg.metadata.pandata import Pandata
|
||||||
|
|
||||||
import regluit
|
import regluit
|
||||||
import regluit.core.isbn
|
import regluit.core.isbn
|
||||||
from regluit.core.validation import test_file
|
|
||||||
from regluit.marc.models import inverse_marc_rels
|
from regluit.marc.models import inverse_marc_rels
|
||||||
from regluit.utils.lang import lang_to_language_code
|
from regluit.utils.lang import lang_to_language_code
|
||||||
|
|
||||||
|
@ -39,6 +37,7 @@ from . import cc
|
||||||
from . import models
|
from . import models
|
||||||
from .parameters import WORK_IDENTIFIERS
|
from .parameters import WORK_IDENTIFIERS
|
||||||
from .validation import identifier_cleaner, unreverse_name
|
from .validation import identifier_cleaner, unreverse_name
|
||||||
|
from .models import loader
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
request_log = logging.getLogger("requests")
|
request_log = logging.getLogger("requests")
|
||||||
|
@ -884,22 +883,6 @@ def edition_for_etype(etype, metadata, default=None):
|
||||||
for key in metadata.edition_identifiers.keys():
|
for key in metadata.edition_identifiers.keys():
|
||||||
return edition_for_ident(key, metadata.identifiers[key])
|
return edition_for_ident(key, metadata.identifiers[key])
|
||||||
|
|
||||||
def load_ebookfile(url, etype):
|
|
||||||
'''
|
|
||||||
return a ContentFile if a new ebook has been loaded
|
|
||||||
'''
|
|
||||||
ebfs = models.EbookFile.objects.filter(source=url)
|
|
||||||
if ebfs:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
r = requests.get(url)
|
|
||||||
contentfile = ContentFile(r.content)
|
|
||||||
test_file(contentfile, etype)
|
|
||||||
return contentfile
|
|
||||||
except IOError as e:
|
|
||||||
logger.error(u'could not open {}'.format(url))
|
|
||||||
except ValidationError as e:
|
|
||||||
logger.error(u'downloaded {} was not a valid {}'.format(url, etype))
|
|
||||||
|
|
||||||
class BasePandataLoader(object):
|
class BasePandataLoader(object):
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
|
@ -1016,8 +999,8 @@ class BasePandataLoader(object):
|
||||||
if url:
|
if url:
|
||||||
edition = edition_for_etype(key, metadata, default=default_edition)
|
edition = edition_for_etype(key, metadata, default=default_edition)
|
||||||
if edition:
|
if edition:
|
||||||
contentfile = load_ebookfile(url, key)
|
contentfile, fmt = loader.load_ebookfile(url, key)
|
||||||
if contentfile:
|
if contentfile and fmt == key:
|
||||||
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
|
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
|
||||||
path = default_storage.save(contentfile_name, contentfile)
|
path = default_storage.save(contentfile_name, contentfile)
|
||||||
ebf = models.EbookFile.objects.create(
|
ebf = models.EbookFile.objects.create(
|
||||||
|
|
|
@ -18,7 +18,7 @@ from oaipmh.metadata import MetadataRegistry, oai_dc_reader
|
||||||
from regluit.core import bookloader, cc
|
from regluit.core import bookloader, cc
|
||||||
from regluit.core import models, tasks
|
from regluit.core import models, tasks
|
||||||
from regluit.core.bookloader import merge_works
|
from regluit.core.bookloader import merge_works
|
||||||
from regluit.core.loaders.utils import type_for_url
|
from regluit.core.models.loader import type_for_url
|
||||||
from regluit.core.validation import identifier_cleaner, valid_subject
|
from regluit.core.validation import identifier_cleaner, valid_subject
|
||||||
|
|
||||||
from . import scrape_language
|
from . import scrape_language
|
||||||
|
|
|
@ -9,7 +9,7 @@ from urllib.parse import urlparse, urljoin
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from regluit.utils.lang import lang_to_language_code
|
from regluit.utils.lang import lang_to_language_code
|
||||||
from .utils import get_soup
|
from .soup import get_soup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -4,20 +4,19 @@ code for harvesting 'online' ebooks
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from urllib.parse import urlparse, urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.files.base import ContentFile
|
from django.core.files.base import ContentFile
|
||||||
|
|
||||||
from regluit.core.models import (
|
from regluit.core import models
|
||||||
Ebook, EbookFile, path_for_file,
|
from regluit.core.models import loader
|
||||||
)
|
from regluit.core.parameters import GOOD_PROVIDERS
|
||||||
from regluit.core.pdf import staple_pdf
|
from regluit.core.pdf import staple_pdf
|
||||||
|
|
||||||
from .utils import get_soup, type_for_url
|
from .soup import get_soup
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -40,9 +39,11 @@ class RateLimiter(object):
|
||||||
|
|
||||||
rl = RateLimiter()
|
rl = RateLimiter()
|
||||||
|
|
||||||
def dl_online(ebook, limiter=rl.delay):
|
def dl_online(ebook, limiter=rl.delay, format='online'):
|
||||||
if ebook.format != 'online':
|
if ebook.format != format or ebook.provider in DONT_HARVEST:
|
||||||
return None, 0
|
return None, 0
|
||||||
|
if ebook.ebook_files.exists():
|
||||||
|
return ebook.ebook_files.first(), 0
|
||||||
for do_harvest, harvester in harvesters(ebook):
|
for do_harvest, harvester in harvesters(ebook):
|
||||||
if do_harvest:
|
if do_harvest:
|
||||||
for ebf in ebf_if_harvested(ebook.url):
|
for ebf in ebf_if_harvested(ebook.url):
|
||||||
|
@ -63,7 +64,13 @@ CMPPROVIDERS = [
|
||||||
'editorial.uniagustiniana.edu.co',
|
'editorial.uniagustiniana.edu.co',
|
||||||
'monographs.uc.pt',
|
'monographs.uc.pt',
|
||||||
]
|
]
|
||||||
|
DONT_HARVEST = [
|
||||||
|
'Unglue.it',
|
||||||
|
'Github',
|
||||||
|
'Project Gutenberg',
|
||||||
|
'Google Books',
|
||||||
|
'OpenEdition Books',
|
||||||
|
]
|
||||||
|
|
||||||
def harvesters(ebook):
|
def harvesters(ebook):
|
||||||
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
|
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
|
||||||
|
@ -109,10 +116,10 @@ def harvesters(ebook):
|
||||||
|
|
||||||
|
|
||||||
def ebf_if_harvested(url):
|
def ebf_if_harvested(url):
|
||||||
onlines = EbookFile.objects.filter(source=url)
|
onlines = models.EbookFile.objects.filter(source=url)
|
||||||
if onlines:
|
if onlines:
|
||||||
return onlines
|
return onlines
|
||||||
return EbookFile.objects.none()
|
return models.EbookFile.objects.none()
|
||||||
|
|
||||||
|
|
||||||
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
||||||
|
@ -122,75 +129,66 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
||||||
logger.info('making %s' % url)
|
logger.info('making %s' % url)
|
||||||
|
|
||||||
# check to see if url already harvested
|
# check to see if url already harvested
|
||||||
new_prev = []
|
|
||||||
for ebf in ebf_if_harvested(url):
|
for ebf in ebf_if_harvested(url):
|
||||||
new_ebf = EbookFile.objects.create(
|
if ebf.ebook == ebook:
|
||||||
|
return ebf, 0
|
||||||
|
new_ebf = models.EbookFile.objects.create(
|
||||||
edition=ebf.edition,
|
edition=ebf.edition,
|
||||||
format=ebf.format,
|
format=ebf.format,
|
||||||
file=ebf.file,
|
file=ebf.file,
|
||||||
source=ebook.url,
|
source=ebook.url,
|
||||||
|
ebook=ebook,
|
||||||
)
|
)
|
||||||
new_prev.append(new_ebf)
|
|
||||||
if new_prev:
|
|
||||||
logger.info("Previously harvested")
|
logger.info("Previously harvested")
|
||||||
return new_prev[0], len(new_prev)
|
return new_ebf, 0
|
||||||
|
|
||||||
try:
|
|
||||||
if method == 'POST':
|
dl_cf, fmt = loader.load_ebookfile(url, ebook.format, user_agent=user_agent, method=method)
|
||||||
response = requests.post(url, headers={"User-Agent": user_agent})
|
if dl_cf:
|
||||||
|
return make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size)
|
||||||
else:
|
else:
|
||||||
response = requests.get(url, headers={"User-Agent": user_agent})
|
logger.warning('download format %s for %s is not ebook', ebook.format, url)
|
||||||
except requests.exceptions.SSLError:
|
|
||||||
logger.error('bad certificate? for %s', url)
|
|
||||||
return None, 0
|
|
||||||
if response.status_code == 200:
|
|
||||||
filesize = int(response.headers.get("Content-Length", 0))
|
|
||||||
filesize = filesize if filesize else None
|
|
||||||
logger.debug(response.headers.get('content-type', ''))
|
|
||||||
format = type_for_url(url,
|
|
||||||
content_type=response.headers.get('content-type', ''),
|
|
||||||
disposition=response.headers.get('content-disposition', ''))
|
|
||||||
if format != 'online':
|
|
||||||
return make_harvested_ebook(response.content, ebook, format, filesize=filesize)
|
|
||||||
else:
|
|
||||||
logger.warning('download format %s for %s is not ebook', format, url)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get %s', url)
|
|
||||||
return None, 0
|
return None, 0
|
||||||
|
|
||||||
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False):
|
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False):
|
||||||
pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers)
|
pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers)
|
||||||
if not pdffile:
|
if not pdffile:
|
||||||
return None, 0
|
return None, 0
|
||||||
return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
|
return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf')
|
||||||
|
|
||||||
def make_harvested_ebook(content, ebook, format, filesize=0):
|
def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||||
if not filesize:
|
if not filesize:
|
||||||
filesize = len(content)
|
filesize = len(content)
|
||||||
new_ebf = EbookFile.objects.create(
|
new_ebf = models.EbookFile.objects.create(
|
||||||
edition=ebook.edition,
|
edition=ebook.edition,
|
||||||
format=format,
|
format=format,
|
||||||
source=ebook.url,
|
source=ebook.url,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(content))
|
new_ebf.file.save(models.path_for_file(new_ebf, None), content)
|
||||||
new_ebf.save()
|
new_ebf.save()
|
||||||
except MemoryError: #huge pdf files cause problems here
|
except MemoryError: #huge pdf files cause problems here
|
||||||
logger.error("memory error saving ebook file for %s", ebook.url)
|
logger.error("memory error saving ebook file for %s", ebook.url)
|
||||||
new_ebf.delete()
|
new_ebf.delete()
|
||||||
return None, 0
|
return None, 0
|
||||||
|
if ebook.format == "online":
|
||||||
new_ebook = Ebook.objects.create(
|
harvested_ebook = models.Ebook.objects.create(
|
||||||
edition=ebook.edition,
|
edition=ebook.edition,
|
||||||
format=format,
|
format=format,
|
||||||
provider='Unglue.it',
|
provider='Unglue.it',
|
||||||
url=new_ebf.file.url,
|
url=new_ebf.file.url,
|
||||||
rights=ebook.rights,
|
rights=ebook.rights,
|
||||||
filesize=filesize,
|
filesize=filesize if filesize < 2147483647 else 2147483647, # largest safe integer
|
||||||
version_label=ebook.version_label,
|
version_label=ebook.version_label,
|
||||||
version_iter=ebook.version_iter,
|
version_iter=ebook.version_iter,
|
||||||
)
|
)
|
||||||
new_ebf.ebook = new_ebook
|
else:
|
||||||
|
if not ebook.filesize:
|
||||||
|
ebook.filesize = filesize if filesize < 2147483647 else 2147483647
|
||||||
|
ebook.save()
|
||||||
|
harvested_ebook = ebook
|
||||||
|
|
||||||
|
new_ebf.ebook = harvested_ebook
|
||||||
new_ebf.save()
|
new_ebf.save()
|
||||||
return new_ebf, 1
|
return new_ebf, 1
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
# get_soup reports failures through ``logger``; it has to be defined in this
# module, otherwise every error path below raises NameError.
logger = logging.getLogger(__name__)


def get_soup(url, user_agent=settings.USER_AGENT):
    '''
    Fetch url and return the response parsed as a BeautifulSoup document.

    url: address to fetch; if it has no scheme, it is retried with
        "http://" prepended.
    user_agent: value sent in the User-Agent request header.

    Returns the soup when the response status is 200. Returns None (after
    logging an error) when the connection fails or any other status code
    comes back.
    '''
    headers = {"User-Agent": user_agent}
    try:
        try:
            response = requests.get(url, headers=headers)
        except requests.exceptions.MissingSchema:
            # The retry sits inside the outer try so that a connection
            # failure on the second attempt is handled like one on the first.
            response = requests.get('http://%s' % url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        logger.error("Connection refused for %s", url)
        logger.error(e)
        return None

    if response.status_code != 200:
        logger.error('%s returned code %s', url, response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # make sure document has a base; the final response url is used as href
    # (presumably so that relative links can be resolved by callers)
    if not soup.find('base'):
        head = soup.find('head')
        if head:
            head.append(soup.new_tag("base", href=response.url))
        else:
            logger.error('No head for %s', url)
    return soup
|
|
@ -23,6 +23,7 @@ class LoaderTests(TestCase):
|
||||||
self.assertTrue(dropbox_ebf.ebook.filesize)
|
self.assertTrue(dropbox_ebf.ebook.filesize)
|
||||||
|
|
||||||
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
|
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
|
||||||
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
|
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition,
|
||||||
|
provider='jbe-platform.com')
|
||||||
jbe_ebf, new_ebf = dl_online(jbe_ebook)
|
jbe_ebf, new_ebf = dl_online(jbe_ebook)
|
||||||
self.assertTrue(jbe_ebf.ebook.filesize)
|
self.assertTrue(jbe_ebf.ebook.filesize)
|
||||||
|
|
|
@ -10,6 +10,7 @@ import requests
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
from regluit.api.crosswalks import inv_relator_contrib
|
from regluit.api.crosswalks import inv_relator_contrib
|
||||||
from regluit.bisac.models import BisacHeading
|
from regluit.bisac.models import BisacHeading
|
||||||
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
|
||||||
|
@ -18,6 +19,8 @@ from regluit.core.models import (
|
||||||
Ebook, Edition, Identifier, Subject, Work,
|
Ebook, Edition, Identifier, Subject, Work,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .soup import get_soup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def UnicodeDictReader(utf8_data, **kwargs):
|
def UnicodeDictReader(utf8_data, **kwargs):
|
||||||
|
@ -41,29 +44,6 @@ def utf8_general_ci_norm(s):
|
||||||
s1 = unicodedata.normalize('NFD', s)
|
s1 = unicodedata.normalize('NFD', s)
|
||||||
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
|
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
|
||||||
|
|
||||||
def get_soup(url, user_agent=settings.USER_AGENT):
|
|
||||||
try:
|
|
||||||
response = requests.get(url, headers={"User-Agent": user_agent})
|
|
||||||
except requests.exceptions.MissingSchema:
|
|
||||||
response = requests.get('http://%s' % url, headers={"User-Agent": user_agent})
|
|
||||||
except requests.exceptions.ConnectionError as e:
|
|
||||||
logger.error("Connection refused for %s", url)
|
|
||||||
logger.error(e)
|
|
||||||
return None
|
|
||||||
if response.status_code == 200:
|
|
||||||
soup = BeautifulSoup(response.content, 'lxml')
|
|
||||||
|
|
||||||
# make sure document has a base
|
|
||||||
if not soup.find('base'):
|
|
||||||
obj = soup.find('head')
|
|
||||||
if obj:
|
|
||||||
obj.append(soup.new_tag("base", href=response.url))
|
|
||||||
else:
|
|
||||||
logger.error('No head for %s', url)
|
|
||||||
return soup
|
|
||||||
else:
|
|
||||||
logger.error('%s returned code %s', url, response.status_code)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_authors(book):
|
def get_authors(book):
|
||||||
authors = []
|
authors = []
|
||||||
|
@ -378,89 +358,3 @@ def ids_from_urls(url):
|
||||||
ids[ident] = id_match.group('id')
|
ids[ident] = id_match.group('id')
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
def type_for_url(url, content_type=None, force=False, disposition=''):
|
|
||||||
url_disp = url + disposition
|
|
||||||
if not url:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
# check to see if we already know
|
|
||||||
for ebook in Ebook.objects.filter(url=url):
|
|
||||||
if ebook.format != 'online':
|
|
||||||
return ebook.format
|
|
||||||
|
|
||||||
if not force:
|
|
||||||
if url.find('books.openedition.org') >= 0:
|
|
||||||
return 'online'
|
|
||||||
if content_type:
|
|
||||||
ct = content_type
|
|
||||||
else:
|
|
||||||
ct, disposition = contenttyper.calc_type(url)
|
|
||||||
url_disp = url + disposition
|
|
||||||
binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
|
|
||||||
if re.search("pdf", ct):
|
|
||||||
return "pdf"
|
|
||||||
elif binary_type and re.search("pdf", url_disp, flags=re.I):
|
|
||||||
return "pdf"
|
|
||||||
elif binary_type and re.search("epub", url_disp, flags=re.I):
|
|
||||||
return "epub"
|
|
||||||
elif binary_type and re.search("mobi", url_disp, flags=re.I):
|
|
||||||
return "mobi"
|
|
||||||
elif re.search("text/plain", ct):
|
|
||||||
return "text"
|
|
||||||
elif re.search("text/html", ct):
|
|
||||||
if url.find('oapen.org/view') >= 0:
|
|
||||||
return "html"
|
|
||||||
return "online"
|
|
||||||
elif re.search("epub", ct):
|
|
||||||
return "epub"
|
|
||||||
elif re.search("mobi", ct):
|
|
||||||
return "mobi"
|
|
||||||
elif ct == '404':
|
|
||||||
return ct
|
|
||||||
# no content-type header!
|
|
||||||
elif ct == '' and re.search("epub", url_disp, flags=re.I):
|
|
||||||
return "epub"
|
|
||||||
elif ct == '' and re.search("pdf", url_disp, flags=re.I):
|
|
||||||
return "pdf"
|
|
||||||
elif ct == '' and re.search("mobi", url_disp, flags=re.I):
|
|
||||||
return "mobi"
|
|
||||||
|
|
||||||
return "other"
|
|
||||||
|
|
||||||
class ContentTyper(object):
|
|
||||||
""" """
|
|
||||||
def __init__(self):
|
|
||||||
self.last_call = dict()
|
|
||||||
|
|
||||||
def content_type(self, url):
|
|
||||||
try:
|
|
||||||
r = requests.head(url, allow_redirects=True)
|
|
||||||
if r.status_code == 405:
|
|
||||||
r = requests.get(url)
|
|
||||||
elif r.status_code == 404:
|
|
||||||
logger.error('File not found (404) for %s', url)
|
|
||||||
return '404', ''
|
|
||||||
return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
|
|
||||||
except:
|
|
||||||
return '', ''
|
|
||||||
|
|
||||||
def calc_type(self, url):
|
|
||||||
logger.info(url)
|
|
||||||
delay = 1
|
|
||||||
# is there a delay associated with the url
|
|
||||||
netloc = urlparse(url).netloc
|
|
||||||
|
|
||||||
# wait if necessary
|
|
||||||
last_call = self.last_call.get(netloc)
|
|
||||||
if last_call is not None:
|
|
||||||
now = time.time()
|
|
||||||
min_time_next_call = last_call + delay
|
|
||||||
if min_time_next_call > now:
|
|
||||||
time.sleep(min_time_next_call-now)
|
|
||||||
|
|
||||||
self.last_call[netloc] = time.time()
|
|
||||||
|
|
||||||
# compute the content-type
|
|
||||||
return self.content_type(url)
|
|
||||||
|
|
||||||
contenttyper = ContentTyper()
|
|
||||||
|
|
|
@ -61,6 +61,7 @@ from regluit.core.parameters import (
|
||||||
THANKED,
|
THANKED,
|
||||||
OFFER_CHOICES,
|
OFFER_CHOICES,
|
||||||
ACQ_CHOICES,
|
ACQ_CHOICES,
|
||||||
|
GOOD_PROVIDERS,
|
||||||
)
|
)
|
||||||
from regluit.core.epub import personalize, ungluify, ask_epub
|
from regluit.core.epub import personalize, ungluify, ask_epub
|
||||||
from regluit.core.pdf import ask_pdf, pdf_append
|
from regluit.core.pdf import ask_pdf, pdf_append
|
||||||
|
@ -79,7 +80,6 @@ from .bibmodels import (
|
||||||
EbookFile,
|
EbookFile,
|
||||||
Edition,
|
Edition,
|
||||||
EditionNote,
|
EditionNote,
|
||||||
good_providers,
|
|
||||||
Identifier,
|
Identifier,
|
||||||
path_for_file,
|
path_for_file,
|
||||||
Publisher,
|
Publisher,
|
||||||
|
@ -893,9 +893,9 @@ class Campaign(models.Model):
|
||||||
def make_mobis(self):
|
def make_mobis(self):
|
||||||
# make archive files for ebooks, make mobi files for epubs
|
# make archive files for ebooks, make mobi files for epubs
|
||||||
versions = set()
|
versions = set()
|
||||||
for ebook in self.work.ebooks().filter(provider__in=good_providers, format='mobi'):
|
for ebook in self.work.ebooks().filter(provider__in=GOOD_PROVIDERS, format='mobi'):
|
||||||
versions.add(ebook.version_label)
|
versions.add(ebook.version_label)
|
||||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers, format='epub'):
|
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS, format='epub'):
|
||||||
if not ebook.version_label in versions:
|
if not ebook.version_label in versions:
|
||||||
# now make the mobi file
|
# now make the mobi file
|
||||||
ebf = ebook.get_archive_ebf()
|
ebf = ebook.get_archive_ebf()
|
||||||
|
@ -912,7 +912,7 @@ class Campaign(models.Model):
|
||||||
ebf.file.open()
|
ebf.file.open()
|
||||||
to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook})
|
to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook})
|
||||||
format_versions.append(format_version)
|
format_versions.append(format_version)
|
||||||
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers):
|
for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS):
|
||||||
format_version = '{}_{}'.format(ebook.format, ebook.version_label)
|
format_version = '{}_{}'.format(ebook.format, ebook.version_label)
|
||||||
if ebook.format in ('pdf', 'epub') and not format_version in format_versions:
|
if ebook.format in ('pdf', 'epub') and not format_version in format_versions:
|
||||||
to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook})
|
to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook})
|
||||||
|
@ -1018,6 +1018,7 @@ class Campaign(models.Model):
|
||||||
provider="Unglue.it",
|
provider="Unglue.it",
|
||||||
url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]),
|
url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]),
|
||||||
version_label='unglued',
|
version_label='unglued',
|
||||||
|
filesize=ebf.file.size,
|
||||||
)
|
)
|
||||||
old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter(
|
old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter(
|
||||||
edition=self.work.preferred_edition,
|
edition=self.work.preferred_edition,
|
||||||
|
|
|
@ -35,8 +35,8 @@ from regluit.core import mobi
|
||||||
import regluit.core.cc as cc
|
import regluit.core.cc as cc
|
||||||
from regluit.core.epub import test_epub
|
from regluit.core.epub import test_epub
|
||||||
from regluit.core.links import id_url
|
from regluit.core.links import id_url
|
||||||
|
from regluit.core.loaders.harvest import dl_online
|
||||||
from regluit.core.validation import valid_subject
|
from regluit.core.validation import valid_subject
|
||||||
|
|
||||||
from regluit.core.parameters import (
|
from regluit.core.parameters import (
|
||||||
AGE_LEVEL_CHOICES,
|
AGE_LEVEL_CHOICES,
|
||||||
BORROWED,
|
BORROWED,
|
||||||
|
@ -56,7 +56,6 @@ from regluit.core.parameters import (
|
||||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
good_providers = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')
|
|
||||||
|
|
||||||
def id_for(obj, type):
|
def id_for(obj, type):
|
||||||
if not obj.pk:
|
if not obj.pk:
|
||||||
|
@ -1143,7 +1142,7 @@ class EbookFile(models.Model):
|
||||||
edition=self.edition,
|
edition=self.edition,
|
||||||
format='mobi',
|
format='mobi',
|
||||||
asking=self.asking,
|
asking=self.asking,
|
||||||
source=self.file.url
|
source=self.file.url,
|
||||||
)
|
)
|
||||||
|
|
||||||
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
|
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
|
||||||
|
@ -1157,6 +1156,7 @@ class EbookFile(models.Model):
|
||||||
rights=self.ebook.rights,
|
rights=self.ebook.rights,
|
||||||
version_label=self.ebook.version_label,
|
version_label=self.ebook.version_label,
|
||||||
version_iter=self.ebook.version_iter,
|
version_iter=self.ebook.version_iter,
|
||||||
|
filesize=mobi_cf.size,
|
||||||
)
|
)
|
||||||
new_mobi_ebf.ebook = new_ebook
|
new_mobi_ebf.ebook = new_ebook
|
||||||
new_mobi_ebf.save()
|
new_mobi_ebf.save()
|
||||||
|
@ -1205,39 +1205,14 @@ class Ebook(models.Model):
|
||||||
return ebf.file
|
return ebf.file
|
||||||
|
|
||||||
def get_archive_ebf(self): # returns an ebf
|
def get_archive_ebf(self): # returns an ebf
|
||||||
if not self.ebook_files.filter(asking=False).exists():
|
if self.ebook_files.filter(asking=False):
|
||||||
if not self.provider in good_providers:
|
ebf = self.ebook_files.filter(asking=False).last()
|
||||||
|
elif EbookFile.objects.filter(source=self.url, format=self.format):
|
||||||
|
ebf = self.ebook_files.filter(asking=False).last()
|
||||||
|
else:
|
||||||
|
ebf, num = dl_online(self, format=self.format)
|
||||||
|
if not ebf:
|
||||||
return None
|
return None
|
||||||
try:
|
|
||||||
r = requests.get(self.url)
|
|
||||||
if r.status_code == 200:
|
|
||||||
self.filesize = len(r.content)
|
|
||||||
if self.save:
|
|
||||||
self.filesize = self.filesize if self.filesize < 2147483647 else 2147483647 # largest safe positive integer
|
|
||||||
self.save()
|
|
||||||
ebf = EbookFile.objects.create(
|
|
||||||
edition=self.edition,
|
|
||||||
ebook=self,
|
|
||||||
format=self.format,
|
|
||||||
source=self.url
|
|
||||||
)
|
|
||||||
ebf.file.save(path_for_file(ebf, None), ContentFile(r.content))
|
|
||||||
ebf.file.close()
|
|
||||||
ebf.save()
|
|
||||||
return ebf
|
|
||||||
else:
|
|
||||||
logging.error('Bad link error: {}'.format(self.url))
|
|
||||||
except IOError:
|
|
||||||
logger.error(u'could not open {}'.format(self.url))
|
|
||||||
else:
|
|
||||||
ebf = self.ebook_files.filter(asking=False).order_by('-created')[0]
|
|
||||||
if not self.filesize:
|
|
||||||
try:
|
|
||||||
self.filesize = ebf.file.size
|
|
||||||
self.save()
|
|
||||||
except ClientError:
|
|
||||||
# error thrown when the can't access the S3 bucket
|
|
||||||
pass
|
|
||||||
return ebf
|
return ebf
|
||||||
|
|
||||||
def set_provider(self):
|
def set_provider(self):
|
||||||
|
|
|
@ -0,0 +1,147 @@
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
from django.conf import settings
|
||||||
|
from django.core.files.base import ContentFile
|
||||||
|
from django.forms import ValidationError
|
||||||
|
|
||||||
|
from regluit.core.validation import test_file
|
||||||
|
from regluit.core import models
|
||||||
|
#from . import Ebook, EbookFile
|
||||||
|
|
||||||
|
#Ebook = apps.get_model('core', 'Ebook')
|
||||||
|
#EbookFile = apps.get_model('core', 'EbookFile')
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def type_for_url(url, content_type=None, force=False, disposition=''):
    '''
    Return the ebook format implied by a url and its response headers.

    url: the address of the resource; a falsy url (None or '') yields ''.
    content_type: value of the content-type header, if already known. When
        it is falsy, the headers are looked up with contenttyper.calc_type.
    force: when true, skip the shortcut that reports every
        books.openedition.org url as 'online'.
    disposition: value of the content-disposition header; it is searched
        together with the url for a format hint.

    Returns one of 'pdf', 'epub', 'mobi', 'text', 'html', 'online', '404',
    'other', the stored format of an existing non-'online' Ebook with the
    same url, or '' when no url is given.
    '''
    # This guard must come before any use of url: concatenating first would
    # raise TypeError for url=None instead of returning ''.
    if not url:
        return ''
    url_disp = url + disposition

    # check to see if we already know
    for ebook in models.Ebook.objects.filter(url=url):
        if ebook.format != 'online':
            return ebook.format

    if not force and 'books.openedition.org' in url:
        return 'online'

    if content_type:
        ct = content_type
    else:
        ct, disposition = contenttyper.calc_type(url)
        url_disp = url + disposition

    binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
    if re.search("pdf", ct):
        return "pdf"
    if binary_type:
        # generic binary content-type: fall back on the url/disposition text
        for fmt in ("pdf", "epub", "mobi"):
            if re.search(fmt, url_disp, flags=re.I):
                return fmt
    if re.search("text/plain", ct):
        return "text"
    if re.search("text/html", ct):
        if 'oapen.org/view' in url:
            return "html"
        return "online"
    if re.search("epub", ct):
        return "epub"
    if re.search("mobi", ct):
        return "mobi"
    if ct == '404':
        return ct
    if ct == '':
        # no content-type header!
        for fmt in ("epub", "pdf", "mobi"):
            if re.search(fmt, url_disp, flags=re.I):
                return fmt

    return "other"
|
||||||
|
|
||||||
|
class ContentTyper(object):
    """
    Look up content-type headers for urls, spacing out calls per host.

    last_call maps a netloc to the time.time() value recorded when the most
    recent lookup for that netloc was started.
    """
    def __init__(self):
        self.last_call = dict()

    def content_type(self, url):
        """
        Return (content-type, content-disposition) header values for url.

        A HEAD request is tried first; a 405 response triggers a GET instead.
        Returns ('404', '') when the HEAD request answers 404, and ('', '')
        when the lookup fails with an exception. Missing headers are
        reported as ''.
        """
        try:
            r = requests.head(url, allow_redirects=True)
            if r.status_code == 405:
                # server refuses HEAD; ask again with GET
                r = requests.get(url)
            elif r.status_code == 404:
                logger.error('File not found (404) for %s', url)
                return '404', ''
            return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
        except Exception as e:
            # Best effort: any failure means "type unknown". Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            logger.warning('could not get content type for %s: %s', url, e)
            return '', ''

    def calc_type(self, url):
        """
        Return content_type(url), first sleeping if the same netloc was
        queried less than one second ago.
        """
        # Imported here because the module's import block does not bring
        # ``time`` into scope.
        import time

        logger.info(url)
        delay = 1
        # is there a delay associated with the url
        netloc = urlparse(url).netloc

        # wait if necessary
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            now = time.time()
            min_time_next_call = last_call + delay
            if min_time_next_call > now:
                time.sleep(min_time_next_call - now)

        self.last_call[netloc] = time.time()

        # compute the content-type
        return self.content_type(url)
|
||||||
|
|
||||||
|
contenttyper = ContentTyper()
|
||||||
|
|
||||||
|
def load_ebookfile(url, format, user_agent=settings.USER_AGENT, method='GET'):
    '''
    Download url and return (ContentFile, format) if a new ebook was loaded.

    url: address of the file to download.
    format: the format the caller expects; 'online' accepts whatever
        non-'online' format the response turns out to be.
    user_agent: value sent in the User-Agent request header.
    method: 'POST' to fetch with a POST request; anything else uses GET.

    Always returns a 2-tuple. On success it is (contentfile, resp_format).
    On failure the first item is None and the second is the detected
    response format when one was determined, otherwise ''. Failure cases:
    an EbookFile with this source already exists, the request fails, the
    status is not 200, the response format is unacceptable, or the
    downloaded file does not validate.
    '''
    if models.EbookFile.objects.filter(source=url).exists():
        return None, ''

    headers = {"User-Agent": user_agent}
    try:
        if method == 'POST':
            response = requests.post(url, headers=headers)
        else:
            response = requests.get(url, headers=headers)
    except requests.exceptions.SSLError:
        logger.error('bad certificate? for %s', url)
        return None, ''
    except IOError:
        logger.error('could not open %s', url)
        return None, ''

    if response.status_code != 200:
        logger.warning('couldn\'t get %s', url)
        return None, ''

    logger.debug(response.headers.get('content-type', ''))
    resp_format = type_for_url(
        url,
        content_type=response.headers.get('content-type', ''),
        disposition=response.headers.get('content-disposition', ''),
    )
    if resp_format == 'online' or (format != 'online' and resp_format != format):
        logger.warning('response format %s for %s is not correct', resp_format, url)
        return None, resp_format

    contentfile = ContentFile(response.content)
    try:
        test_file(contentfile, resp_format)
    except ValidationError:
        # report the format that was actually validated, and return a tuple
        # so callers that unpack two values keep working
        logger.error('downloaded %s was not a valid %s', url, resp_format)
        return None, resp_format
    return contentfile, resp_format
|
||||||
|
|
||||||
|
|
|
@ -48,8 +48,5 @@ WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab')
|
||||||
|
|
||||||
ID_CHOICES_MAP = dict(ID_CHOICES)
|
ID_CHOICES_MAP = dict(ID_CHOICES)
|
||||||
|
|
||||||
|
GOOD_PROVIDERS = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,9 @@ from io import StringIO
|
||||||
|
|
||||||
|
|
||||||
from regluit.core.facets import BaseFacet
|
from regluit.core.facets import BaseFacet
|
||||||
from regluit.core.models import Work, good_providers
|
from regluit.core.models import Work
|
||||||
from regluit.api.onix import onix_feed
|
from regluit.api.onix import onix_feed
|
||||||
|
from regluit.core.parameters import GOOD_PROVIDERS
|
||||||
|
|
||||||
from .models import Target
|
from .models import Target
|
||||||
|
|
||||||
|
@ -45,7 +46,7 @@ def get_target_facet(target, start=datetime(1900,1,1), new=False):
|
||||||
editions__ebooks__created__gt = start,
|
editions__ebooks__created__gt = start,
|
||||||
identifiers__type="isbn",
|
identifiers__type="isbn",
|
||||||
editions__ebooks__format__in = formats,
|
editions__ebooks__format__in = formats,
|
||||||
editions__ebooks__provider__in = good_providers,
|
editions__ebooks__provider__in = GOOD_PROVIDERS,
|
||||||
).distinct().order_by('-featured')
|
).distinct().order_by('-featured')
|
||||||
|
|
||||||
model_filters = {"Ebook": format_filter, "Edition": edition_format_filter}
|
model_filters = {"Ebook": format_filter, "Edition": edition_format_filter}
|
||||||
|
|
|
@ -494,8 +494,9 @@ def manage_ebooks(request, edition_id, by=None):
|
||||||
ebook_form = EbookForm(data = request.POST, files=request.FILES,)
|
ebook_form = EbookForm(data = request.POST, files=request.FILES,)
|
||||||
if ebook_form.is_valid():
|
if ebook_form.is_valid():
|
||||||
if ebook_form.cleaned_data.get('file', None):
|
if ebook_form.cleaned_data.get('file', None):
|
||||||
|
file=ebook_form.cleaned_data['file']
|
||||||
new_ebf = models.EbookFile.objects.create(
|
new_ebf = models.EbookFile.objects.create(
|
||||||
file=ebook_form.cleaned_data['file'],
|
file=file,
|
||||||
format=ebook_form.cleaned_data['format'],
|
format=ebook_form.cleaned_data['format'],
|
||||||
edition=edition,
|
edition=edition,
|
||||||
)
|
)
|
||||||
|
@ -504,6 +505,8 @@ def manage_ebooks(request, edition_id, by=None):
|
||||||
ebook_form.instance.save()
|
ebook_form.instance.save()
|
||||||
new_ebf.ebook = ebook_form.instance
|
new_ebf.ebook = ebook_form.instance
|
||||||
new_ebf.save()
|
new_ebf.save()
|
||||||
|
new_ebf.ebook.filesize = new_ebf.file.size
|
||||||
|
new_ebf.ebook.save()
|
||||||
else:
|
else:
|
||||||
ebook_form.save()
|
ebook_form.save()
|
||||||
ebook_form.instance.set_next_iter()
|
ebook_form.instance.set_next_iter()
|
||||||
|
|
Loading…
Reference in New Issue