From a6b02d387ecc14e467494589bedd49d7a2f54779 Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 15 Aug 2020 20:21:56 -0400 Subject: [PATCH] refactor ebf(url) --- core/bookloader.py | 23 +----- core/loaders/doab.py | 2 +- core/loaders/doab_utils.py | 2 +- core/loaders/harvest.py | 100 +++++++++++++------------ core/loaders/soup.py | 31 ++++++++ core/loaders/tests.py | 3 +- core/loaders/utils.py | 112 +--------------------------- core/models/__init__.py | 9 ++- core/models/bibmodels.py | 47 +++--------- core/models/loader.py | 147 +++++++++++++++++++++++++++++++++++++ core/parameters.py | 5 +- distro/push.py | 5 +- frontend/views/__init__.py | 5 +- 13 files changed, 261 insertions(+), 230 deletions(-) create mode 100644 core/loaders/soup.py create mode 100644 core/models/loader.py diff --git a/core/bookloader.py b/core/bookloader.py index 05123280..f00d9ffb 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -14,7 +14,6 @@ import requests # django imports from django.conf import settings -from django.core.files.base import ContentFile from django.core.files.storage import default_storage from django.db import IntegrityError from django.db.models import Sum @@ -31,7 +30,6 @@ from gitenberg.metadata.pandata import Pandata import regluit import regluit.core.isbn -from regluit.core.validation import test_file from regluit.marc.models import inverse_marc_rels from regluit.utils.lang import lang_to_language_code @@ -39,6 +37,7 @@ from . import cc from . 
import models from .parameters import WORK_IDENTIFIERS from .validation import identifier_cleaner, unreverse_name +from .models import loader logger = logging.getLogger(__name__) request_log = logging.getLogger("requests") @@ -884,22 +883,6 @@ def edition_for_etype(etype, metadata, default=None): for key in metadata.edition_identifiers.keys(): return edition_for_ident(key, metadata.identifiers[key]) -def load_ebookfile(url, etype): - ''' - return a ContentFile if a new ebook has been loaded - ''' - ebfs = models.EbookFile.objects.filter(source=url) - if ebfs: - return None - try: - r = requests.get(url) - contentfile = ContentFile(r.content) - test_file(contentfile, etype) - return contentfile - except IOError as e: - logger.error(u'could not open {}'.format(url)) - except ValidationError as e: - logger.error(u'downloaded {} was not a valid {}'.format(url, etype)) class BasePandataLoader(object): def __init__(self, url): @@ -1016,8 +999,8 @@ class BasePandataLoader(object): if url: edition = edition_for_etype(key, metadata, default=default_edition) if edition: - contentfile = load_ebookfile(url, key) - if contentfile: + contentfile, fmt = loader.load_ebookfile(url, key) + if contentfile and fmt == key: contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key) path = default_storage.save(contentfile_name, contentfile) ebf = models.EbookFile.objects.create( diff --git a/core/loaders/doab.py b/core/loaders/doab.py index 95ab5c77..62d1e5b2 100644 --- a/core/loaders/doab.py +++ b/core/loaders/doab.py @@ -18,7 +18,7 @@ from oaipmh.metadata import MetadataRegistry, oai_dc_reader from regluit.core import bookloader, cc from regluit.core import models, tasks from regluit.core.bookloader import merge_works -from regluit.core.loaders.utils import type_for_url +from regluit.core.models.loader import type_for_url from regluit.core.validation import identifier_cleaner, valid_subject from . 
import scrape_language diff --git a/core/loaders/doab_utils.py b/core/loaders/doab_utils.py index dec0060e..a5355d83 100644 --- a/core/loaders/doab_utils.py +++ b/core/loaders/doab_utils.py @@ -9,7 +9,7 @@ from urllib.parse import urlparse, urljoin import requests from regluit.utils.lang import lang_to_language_code -from .utils import get_soup +from .soup import get_soup logger = logging.getLogger(__name__) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 9188d9c8..3d40a5b7 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -4,20 +4,19 @@ code for harvesting 'online' ebooks import logging import re import time -from urllib.parse import urlparse, urljoin +from urllib.parse import urljoin import requests from django.conf import settings from django.core.files.base import ContentFile -from regluit.core.models import ( - Ebook, EbookFile, path_for_file, -) +from regluit.core import models +from regluit.core.models import loader +from regluit.core.parameters import GOOD_PROVIDERS from regluit.core.pdf import staple_pdf -from .utils import get_soup, type_for_url - +from .soup import get_soup logger = logging.getLogger(__name__) @@ -40,9 +39,11 @@ class RateLimiter(object): rl = RateLimiter() -def dl_online(ebook, limiter=rl.delay): - if ebook.format != 'online': +def dl_online(ebook, limiter=rl.delay, format='online'): + if ebook.format != format or ebook.provider in DONT_HARVEST: return None, 0 + if ebook.ebook_files.exists(): + return ebook.ebook_files.first(), 0 for do_harvest, harvester in harvesters(ebook): if do_harvest: for ebf in ebf_if_harvested(ebook.url): @@ -63,7 +64,13 @@ CMPPROVIDERS = [ 'editorial.uniagustiniana.edu.co', 'monographs.uc.pt', ] - +DONT_HARVEST = [ + 'Unglue.it', + 'Github', + 'Project Gutenberg', + 'Google Books', + 'OpenEdition Books', +] def harvesters(ebook): yield ebook.provider in GOOD_PROVIDERS, harvest_generic @@ -109,10 +116,10 @@ def harvesters(ebook): def ebf_if_harvested(url): - onlines = 
EbookFile.objects.filter(source=url) + onlines = models.EbookFile.objects.filter(source=url) if onlines: return onlines - return EbookFile.objects.none() + return models.EbookFile.objects.none() def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'): @@ -122,75 +129,66 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'): logger.info('making %s' % url) # check to see if url already harvested - new_prev = [] for ebf in ebf_if_harvested(url): - new_ebf = EbookFile.objects.create( + if ebf.ebook == ebook: + return ebf, 0 + new_ebf = models.EbookFile.objects.create( edition=ebf.edition, format=ebf.format, file=ebf.file, source=ebook.url, + ebook=ebook, ) - new_prev.append(new_ebf) - if new_prev: logger.info("Previously harvested") - return new_prev[0], len(new_prev) + return new_ebf, 0 - try: - if method == 'POST': - response = requests.post(url, headers={"User-Agent": user_agent}) - else: - response = requests.get(url, headers={"User-Agent": user_agent}) - except requests.exceptions.SSLError: - logger.error('bad certificate? 
for %s', url) - return None, 0 - if response.status_code == 200: - filesize = int(response.headers.get("Content-Length", 0)) - filesize = filesize if filesize else None - logger.debug(response.headers.get('content-type', '')) - format = type_for_url(url, - content_type=response.headers.get('content-type', ''), - disposition=response.headers.get('content-disposition', '')) - if format != 'online': - return make_harvested_ebook(response.content, ebook, format, filesize=filesize) - else: - logger.warning('download format %s for %s is not ebook', format, url) + + dl_cf, fmt = loader.load_ebookfile(url, ebook.format, user_agent=user_agent, method=method) + if dl_cf: + return make_harvested_ebook(dl_cf, ebook, fmt, filesize=dl_cf.size) else: - logger.warning('couldn\'t get %s', url) + logger.warning('download format %s for %s is not ebook', ebook.format, url) return None, 0 def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False): pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers) if not pdffile: return None, 0 - return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf') + return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf') def make_harvested_ebook(content, ebook, format, filesize=0): if not filesize: filesize = len(content) - new_ebf = EbookFile.objects.create( + new_ebf = models.EbookFile.objects.create( edition=ebook.edition, format=format, source=ebook.url, ) try: - new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(content)) + new_ebf.file.save(models.path_for_file(new_ebf, None), content) new_ebf.save() except MemoryError: #huge pdf files cause problems here logger.error("memory error saving ebook file for %s", ebook.url) new_ebf.delete() return None, 0 - - new_ebook = Ebook.objects.create( - edition=ebook.edition, - format=format, - provider='Unglue.it', - url=new_ebf.file.url, - rights=ebook.rights, - filesize=filesize, - version_label=ebook.version_label, - 
version_iter=ebook.version_iter, - ) - new_ebf.ebook = new_ebook + if ebook.format == "online": + harvested_ebook = models.Ebook.objects.create( + edition=ebook.edition, + format=format, + provider='Unglue.it', + url=new_ebf.file.url, + rights=ebook.rights, + filesize=filesize if filesize < 2147483647 else 2147483647, # largest safe integer + version_label=ebook.version_label, + version_iter=ebook.version_iter, + ) + else: + if not ebook.filesize: + ebook.filesize = filesize if filesize < 2147483647 else 2147483647 + ebook.save() + harvested_ebook = ebook + + new_ebf.ebook = harvested_ebook new_ebf.save() return new_ebf, 1 diff --git a/core/loaders/soup.py b/core/loaders/soup.py new file mode 100644 index 00000000..f30f9aa5 --- /dev/null +++ b/core/loaders/soup.py @@ -0,0 +1,31 @@ +import logging + +from bs4 import BeautifulSoup +import requests + +from django.conf import settings + + +def get_soup(url, user_agent=settings.USER_AGENT): + try: + response = requests.get(url, headers={"User-Agent": user_agent}) + except requests.exceptions.MissingSchema: + response = requests.get('http://%s' % url, headers={"User-Agent": user_agent}) + except requests.exceptions.ConnectionError as e: + logger.error("Connection refused for %s", url) + logger.error(e) + return None + if response.status_code == 200: + soup = BeautifulSoup(response.content, 'lxml') + + # make sure document has a base + if not soup.find('base'): + obj = soup.find('head') + if obj: + obj.append(soup.new_tag("base", href=response.url)) + else: + logger.error('No head for %s', url) + return soup + else: + logger.error('%s returned code %s', url, response.status_code) + return None diff --git a/core/loaders/tests.py b/core/loaders/tests.py index 50a36911..17a34a59 100644 --- a/core/loaders/tests.py +++ b/core/loaders/tests.py @@ -23,6 +23,7 @@ class LoaderTests(TestCase): self.assertTrue(dropbox_ebf.ebook.filesize) jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958' - jbe_ebook = 
Ebook.objects.create(format='online', url=jbe_url, edition=edition) + jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition, + provider='jbe-platform.com') jbe_ebf, new_ebf = dl_online(jbe_ebook) self.assertTrue(jbe_ebf.ebook.filesize) diff --git a/core/loaders/utils.py b/core/loaders/utils.py index cbc45168..35f24f43 100644 --- a/core/loaders/utils.py +++ b/core/loaders/utils.py @@ -10,6 +10,7 @@ import requests from django.conf import settings + from regluit.api.crosswalks import inv_relator_contrib from regluit.bisac.models import BisacHeading from regluit.core.bookloader import add_by_isbn_from_google, merge_works @@ -18,6 +19,8 @@ from regluit.core.models import ( Ebook, Edition, Identifier, Subject, Work, ) +from .soup import get_soup + logger = logging.getLogger(__name__) def UnicodeDictReader(utf8_data, **kwargs): @@ -41,29 +44,6 @@ def utf8_general_ci_norm(s): s1 = unicodedata.normalize('NFD', s) return ''.join(c for c in s1 if not unicodedata.combining(c)).upper() -def get_soup(url, user_agent=settings.USER_AGENT): - try: - response = requests.get(url, headers={"User-Agent": user_agent}) - except requests.exceptions.MissingSchema: - response = requests.get('http://%s' % url, headers={"User-Agent": user_agent}) - except requests.exceptions.ConnectionError as e: - logger.error("Connection refused for %s", url) - logger.error(e) - return None - if response.status_code == 200: - soup = BeautifulSoup(response.content, 'lxml') - - # make sure document has a base - if not soup.find('base'): - obj = soup.find('head') - if obj: - obj.append(soup.new_tag("base", href=response.url)) - else: - logger.error('No head for %s', url) - return soup - else: - logger.error('%s returned code %s', url, response.status_code) - return None def get_authors(book): authors = [] @@ -378,89 +358,3 @@ def ids_from_urls(url): ids[ident] = id_match.group('id') return ids -def type_for_url(url, content_type=None, force=False, disposition=''): - url_disp = url + 
disposition - if not url: - return '' - - # check to see if we already know - for ebook in Ebook.objects.filter(url=url): - if ebook.format != 'online': - return ebook.format - - if not force: - if url.find('books.openedition.org') >= 0: - return 'online' - if content_type: - ct = content_type - else: - ct, disposition = contenttyper.calc_type(url) - url_disp = url + disposition - binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct) - if re.search("pdf", ct): - return "pdf" - elif binary_type and re.search("pdf", url_disp, flags=re.I): - return "pdf" - elif binary_type and re.search("epub", url_disp, flags=re.I): - return "epub" - elif binary_type and re.search("mobi", url_disp, flags=re.I): - return "mobi" - elif re.search("text/plain", ct): - return "text" - elif re.search("text/html", ct): - if url.find('oapen.org/view') >= 0: - return "html" - return "online" - elif re.search("epub", ct): - return "epub" - elif re.search("mobi", ct): - return "mobi" - elif ct == '404': - return ct - # no content-type header! 
- elif ct == '' and re.search("epub", url_disp, flags=re.I): - return "epub" - elif ct == '' and re.search("pdf", url_disp, flags=re.I): - return "pdf" - elif ct == '' and re.search("mobi", url_disp, flags=re.I): - return "mobi" - - return "other" - -class ContentTyper(object): - """ """ - def __init__(self): - self.last_call = dict() - - def content_type(self, url): - try: - r = requests.head(url, allow_redirects=True) - if r.status_code == 405: - r = requests.get(url) - elif r.status_code == 404: - logger.error('File not found (404) for %s', url) - return '404', '' - return r.headers.get('content-type', ''), r.headers.get('content-disposition', '') - except: - return '', '' - - def calc_type(self, url): - logger.info(url) - delay = 1 - # is there a delay associated with the url - netloc = urlparse(url).netloc - - # wait if necessary - last_call = self.last_call.get(netloc) - if last_call is not None: - now = time.time() - min_time_next_call = last_call + delay - if min_time_next_call > now: - time.sleep(min_time_next_call-now) - - self.last_call[netloc] = time.time() - - # compute the content-type - return self.content_type(url) - -contenttyper = ContentTyper() diff --git a/core/models/__init__.py b/core/models/__init__.py index 656838b1..13e1703e 100755 --- a/core/models/__init__.py +++ b/core/models/__init__.py @@ -61,6 +61,7 @@ from regluit.core.parameters import ( THANKED, OFFER_CHOICES, ACQ_CHOICES, + GOOD_PROVIDERS, ) from regluit.core.epub import personalize, ungluify, ask_epub from regluit.core.pdf import ask_pdf, pdf_append @@ -79,7 +80,6 @@ from .bibmodels import ( EbookFile, Edition, EditionNote, - good_providers, Identifier, path_for_file, Publisher, @@ -893,9 +893,9 @@ class Campaign(models.Model): def make_mobis(self): # make archive files for ebooks, make mobi files for epubs versions = set() - for ebook in self.work.ebooks().filter(provider__in=good_providers, format='mobi'): + for ebook in self.work.ebooks().filter(provider__in=GOOD_PROVIDERS, 
format='mobi'): versions.add(ebook.version_label) - for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers, format='epub'): + for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS, format='epub'): if not ebook.version_label in versions: # now make the mobi file ebf = ebook.get_archive_ebf() @@ -912,7 +912,7 @@ class Campaign(models.Model): ebf.file.open() to_dos.append({'content': ebf.file.read(), 'ebook': ebf.ebook}) format_versions.append(format_version) - for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=good_providers): + for ebook in self.work.ebooks_all().exclude(provider='Unglue.it').filter(provider__in=GOOD_PROVIDERS): format_version = '{}_{}'.format(ebook.format, ebook.version_label) if ebook.format in ('pdf', 'epub') and not format_version in format_versions: to_dos.append({'content': ebook.get_archive().read(), 'ebook': ebook}) @@ -1018,6 +1018,7 @@ class Campaign(models.Model): provider="Unglue.it", url=settings.BASE_URL_SECURE + reverse('download_campaign', args=[self.work_id, format]), version_label='unglued', + filesize=ebf.file.size, ) old_ebooks = Ebook.objects.exclude(pk=ebook.pk).filter( edition=self.work.preferred_edition, diff --git a/core/models/bibmodels.py b/core/models/bibmodels.py index 1021663e..f2d4ea45 100644 --- a/core/models/bibmodels.py +++ b/core/models/bibmodels.py @@ -35,8 +35,8 @@ from regluit.core import mobi import regluit.core.cc as cc from regluit.core.epub import test_epub from regluit.core.links import id_url +from regluit.core.loaders.harvest import dl_online from regluit.core.validation import valid_subject - from regluit.core.parameters import ( AGE_LEVEL_CHOICES, BORROWED, @@ -56,7 +56,6 @@ from regluit.core.parameters import ( ImageFile.LOAD_TRUNCATED_IMAGES = True logger = logging.getLogger(__name__) -good_providers = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO') def 
id_for(obj, type): if not obj.pk: @@ -1143,7 +1142,7 @@ class EbookFile(models.Model): edition=self.edition, format='mobi', asking=self.asking, - source=self.file.url + source=self.file.url, ) new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf) @@ -1157,6 +1156,7 @@ class EbookFile(models.Model): rights=self.ebook.rights, version_label=self.ebook.version_label, version_iter=self.ebook.version_iter, + filesize=mobi_cf.size, ) new_mobi_ebf.ebook = new_ebook new_mobi_ebf.save() @@ -1205,40 +1205,15 @@ class Ebook(models.Model): return ebf.file def get_archive_ebf(self): # returns an ebf - if not self.ebook_files.filter(asking=False).exists(): - if not self.provider in good_providers: - return None - try: - r = requests.get(self.url) - if r.status_code == 200: - self.filesize = len(r.content) - if self.save: - self.filesize = self.filesize if self.filesize < 2147483647 else 2147483647 # largest safe positive integer - self.save() - ebf = EbookFile.objects.create( - edition=self.edition, - ebook=self, - format=self.format, - source=self.url - ) - ebf.file.save(path_for_file(ebf, None), ContentFile(r.content)) - ebf.file.close() - ebf.save() - return ebf - else: - logging.error('Bad link error: {}'.format(self.url)) - except IOError: - logger.error(u'could not open {}'.format(self.url)) + if self.ebook_files.filter(asking=False): + ebf = self.ebook_files.filter(asking=False).last() + elif EbookFile.objects.filter(source=self.url, format=self.format): + ebf = self.ebook_files.filter(asking=False).last() else: - ebf = self.ebook_files.filter(asking=False).order_by('-created')[0] - if not self.filesize: - try: - self.filesize = ebf.file.size - self.save() - except ClientError: - # error thrown when the can't access the S3 bucket - pass - return ebf + ebf, num = dl_online(self, format=self.format) + if not ebf: + return None + return ebf def set_provider(self): self.provider = Ebook.infer_provider(self.url) diff --git a/core/models/loader.py 
b/core/models/loader.py new file mode 100644 index 00000000..338599fa --- /dev/null +++ b/core/models/loader.py @@ -0,0 +1,147 @@ +import logging +import re +import requests +from urllib.parse import urlparse + +from django.apps import apps +from django.conf import settings +from django.core.files.base import ContentFile +from django.forms import ValidationError + +from regluit.core.validation import test_file +from regluit.core import models +#from . import Ebook, EbookFile + +#Ebook = apps.get_model('core', 'Ebook') +#EbookFile = apps.get_model('core', 'EbookFile') + +logger = logging.getLogger(__name__) + +def type_for_url(url, content_type=None, force=False, disposition=''): + url_disp = url + disposition + if not url: + return '' + + # check to see if we already know + for ebook in models.Ebook.objects.filter(url=url): + if ebook.format != 'online': + return ebook.format + + if not force: + if url.find('books.openedition.org') >= 0: + return 'online' + if content_type: + ct = content_type + else: + ct, disposition = contenttyper.calc_type(url) + url_disp = url + disposition + binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct) + if re.search("pdf", ct): + return "pdf" + elif binary_type and re.search("pdf", url_disp, flags=re.I): + return "pdf" + elif binary_type and re.search("epub", url_disp, flags=re.I): + return "epub" + elif binary_type and re.search("mobi", url_disp, flags=re.I): + return "mobi" + elif re.search("text/plain", ct): + return "text" + elif re.search("text/html", ct): + if url.find('oapen.org/view') >= 0: + return "html" + return "online" + elif re.search("epub", ct): + return "epub" + elif re.search("mobi", ct): + return "mobi" + elif ct == '404': + return ct + # no content-type header! 
+ elif ct == '' and re.search("epub", url_disp, flags=re.I): + return "epub" + elif ct == '' and re.search("pdf", url_disp, flags=re.I): + return "pdf" + elif ct == '' and re.search("mobi", url_disp, flags=re.I): + return "mobi" + + return "other" + +class ContentTyper(object): + """ """ + def __init__(self): + self.last_call = dict() + + def content_type(self, url): + try: + r = requests.head(url, allow_redirects=True) + if r.status_code == 405: + r = requests.get(url) + elif r.status_code == 404: + logger.error('File not found (404) for %s', url) + return '404', '' + return r.headers.get('content-type', ''), r.headers.get('content-disposition', '') + except: + return '', '' + + def calc_type(self, url): + logger.info(url) + delay = 1 + # is there a delay associated with the url + netloc = urlparse(url).netloc + + # wait if necessary + last_call = self.last_call.get(netloc) + if last_call is not None: + now = time.time() + min_time_next_call = last_call + delay + if min_time_next_call > now: + time.sleep(min_time_next_call-now) + + self.last_call[netloc] = time.time() + + # compute the content-type + return self.content_type(url) + +contenttyper = ContentTyper() + +def load_ebookfile(url, format, user_agent=settings.USER_AGENT, method='GET'): + ''' + return a ContentFile, format if a new ebook has been loaded + ''' + ebfs = models.EbookFile.objects.filter(source=url) + if ebfs: + return None, '' + try: + if method == 'POST': + response = requests.post(url, headers={"User-Agent": user_agent}) + else: + response = requests.get(url, headers={"User-Agent": user_agent}) + + except requests.exceptions.SSLError: + logger.error('bad certificate? 
for %s', url) + return None, '' + except IOError as e: + logger.error('could not open %s', url) + return None, '' + + if response.status_code == 200: + logger.debug(response.headers.get('content-type', '')) + resp_format = type_for_url(url, + content_type=response.headers.get('content-type', ''), + disposition=response.headers.get('content-disposition', '')) + if resp_format == 'online' or (format != 'online' and resp_format != format): + logger.warning('response format %s for %s is not correct', resp_format, url) + return None, resp_format + else: + logger.warning('couldn\'t get %s', url) + return None, '' + + contentfile = ContentFile(response.content) + try: + test_file(contentfile, resp_format) + return contentfile, resp_format + except ValidationError as e: + logger.error('downloaded %s was not a valid %s', url, format) + return None, resp_format + + diff --git a/core/parameters.py b/core/parameters.py index 1b39eb20..cbb03160 100644 --- a/core/parameters.py +++ b/core/parameters.py @@ -48,8 +48,5 @@ WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab') ID_CHOICES_MAP = dict(ID_CHOICES) - - - - +GOOD_PROVIDERS = ('Internet Archive', 'Unglue.it', 'Github', 'OAPEN Library', 'SciELO') diff --git a/distro/push.py b/distro/push.py index d29015ed..4a485ff9 100644 --- a/distro/push.py +++ b/distro/push.py @@ -4,8 +4,9 @@ from io import StringIO from regluit.core.facets import BaseFacet -from regluit.core.models import Work, good_providers +from regluit.core.models import Work from regluit.api.onix import onix_feed +from regluit.core.parameters import GOOD_PROVIDERS from .models import Target @@ -45,7 +46,7 @@ def get_target_facet(target, start=datetime(1900,1,1), new=False): editions__ebooks__created__gt = start, identifiers__type="isbn", editions__ebooks__format__in = formats, - editions__ebooks__provider__in = good_providers, + editions__ebooks__provider__in = GOOD_PROVIDERS, ).distinct().order_by('-featured') model_filters = {"Ebook": format_filter, "Edition": 
edition_format_filter} diff --git a/frontend/views/__init__.py b/frontend/views/__init__.py index 536d2488..01ebdd41 100755 --- a/frontend/views/__init__.py +++ b/frontend/views/__init__.py @@ -494,8 +494,9 @@ def manage_ebooks(request, edition_id, by=None): ebook_form = EbookForm(data = request.POST, files=request.FILES,) if ebook_form.is_valid(): if ebook_form.cleaned_data.get('file', None): + file=ebook_form.cleaned_data['file'] new_ebf = models.EbookFile.objects.create( - file=ebook_form.cleaned_data['file'], + file=file, format=ebook_form.cleaned_data['format'], edition=edition, ) @@ -504,6 +505,8 @@ def manage_ebooks(request, edition_id, by=None): ebook_form.instance.save() new_ebf.ebook = ebook_form.instance new_ebf.save() + new_ebf.ebook.filesize = new_ebf.file.size + new_ebf.ebook.save() else: ebook_form.save() ebook_form.instance.set_next_iter()