From 78892547d0930e9aecd671cc4bb899f05c10909c Mon Sep 17 00:00:00 2001 From: eric Date: Mon, 31 Jul 2023 17:57:43 -0400 Subject: [PATCH 01/32] fix title cleaner --- utils/text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/text.py b/utils/text.py index 0bee9a23..068768a9 100644 --- a/utils/text.py +++ b/utils/text.py @@ -26,7 +26,7 @@ _ends_in_num = re.compile(r'\W*\d+$') def remove_badxml(s): return _illegal_xml_chars_RE.sub('', s) -_ws_runs_RE = re.compile(r'[\r\n\t]+') +_ws_runs_RE = re.compile(r'([\r\n\t]| \$b)+') def sanitize_ws(s): return _ws_runs_RE.sub(u' ', s) @@ -39,3 +39,4 @@ def remove_author_junk(authname): if 'ORCID:' in authname: authname = authname.split('ORCID:')[0].strip() return _ends_in_num.sub('', authname) + From 98f40890f2946fcab28324e4bbd09b8cdbbfc005 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 3 Aug 2023 17:16:19 -0400 Subject: [PATCH 02/32] Update harvest.py istanbul, editorialbonaventuriana, fahce, elgar, fupress --- core/loaders/harvest.py | 58 +++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 706f80bd..afe46af7 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -195,11 +195,12 @@ def harvesters(ebook): yield ebook.provider == 'usmcu.edu', harvest_usmcu yield ebook.provider == 'lalibreria.upv.es', harvest_upv yield ebook.provider == 'cambridge.org', harvest_cambridge - yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_iupress yield ebook.provider == 'exonpublications.com', harvest_exon yield ebook.provider == 'ressources.una-editions.fr', harvest_una yield ebook.provider == 'wbg-wissenverbindet.de', harvest_wbg yield ebook.provider == 'urn.kb.se', harvest_kb + yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_istanbul + yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana def ebf_if_harvested(url): onlines = 
models.EbookFile.objects.filter(source=url) @@ -685,7 +686,7 @@ def harvest_usu(ebook): def harvest_fahce(ebook): def selector(doc): - return doc.select_one('div.publicationFormatLink a[href]') + return doc.select_one('div.pub_format_single a[href]') return harvest_one_generic(ebook, selector) BAD_CERTS = {'libri.unimi.it', 'editorial.ucatolicaluisamigo.edu.co', 'openpress.mtsu.edu'} @@ -787,9 +788,15 @@ def harvest_ios(ebook): def harvest_elgar(ebook): - def chap_selector(doc): - return doc.select('#toc li.pdfLink a[href]') - return harvest_stapled_generic(ebook, None, chap_selector) + if 'display' in ebook.url: + url = ebook.url.replace('display', 'downloadpdf')[:-3] + 'pdf' + elif 'monobook-oa' in ebook.url: + url = ebook.url.replace('monobook-oa', 'downloadpdf')[:-3] + 'pdf' + elif 'edcollbook-oa' in ebook.url: + url = ebook.url.replace('edcollbook-oa', 'downloadpdf')[:-3] + 'pdf' + else: + return None, 0 + return make_dl_ebook(url, ebook, user_agent=settings.GOOGLEBOT_UA) def harvest_wsp(ebook): @@ -966,12 +973,15 @@ def harvest_ipsflab(ebook): def harvest_fupress(ebook): def selector(doc): return doc.select_one('#ctl00_contenuto_pdf a.btn-open[href]') + if 'isbn' in ebook.url: + set_bookshop(ebook) + return None, 0 return harvest_one_generic(ebook, selector) def harvest_dunckerhumblot(ebook): def selector(doc): - return doc.select_one('section.index-card a[href$="download"]') + return doc.select_one('div.section__buttons a[href$="download"]') return harvest_one_generic(ebook, selector) @@ -980,6 +990,11 @@ def harvest_cornellopen(ebook): return doc.select('div.sp-product__buy-btn-container li a[href]') return harvest_multiple_generic(ebook, selector) +def harvest_editorialbonaventuriana(ebook): + def selector(doc): + return doc.select_one('div.djc_fulltext p a[href$=".pdf"]') + return harvest_one_generic(ebook, selector) + def harvest_esv(ebook): doc = get_soup(ebook.url.replace('details', 'download')) @@ -1085,11 +1100,6 @@ def harvest_cambridge(ebook): 
logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 -def harvest_iupress(ebook): - def selector(doc): - return doc.find_all('a', string=re.compile(r'(Full Text \(PDF\)|e-PUB)')) - return harvest_multiple_generic(ebook, selector) - def harvest_exon(ebook): doc = get_soup(ebook.url) if doc: @@ -1129,3 +1139,29 @@ def harvest_kb(ebook): def selector(doc): return doc.select_one('a[title=fulltext][href]') return harvest_one_generic(ebook, selector) + +def harvest_istanbul(ebook): + def cdn_url(soup): + objs = soup.find_all('a', href=re.compile(r'cdn\.istanbul')) + for obj in objs: + yield obj['href'] + def pdf_urls(ebook): + doc = get_soup(ebook.url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True) + if doc: + for content_url in cdn_url(doc): + yield content_url + for obj in doc.select('div.post-content h5 a.from-journal[href]'): + chap_url = urljoin(ebook.url, obj['href']) + chap_doc = get_soup(chap_url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True) + if chap_doc: + for content_url in cdn_url(chap_doc): + yield content_url + + # staple the chapters + stapled = make_stapled_ebook(pdf_urls(ebook), ebook, user_agent=settings.GOOGLEBOT_UA) + if stapled: + return stapled + else: + logger.warning('couldn\'t make ebook file for %s', ebook.url) + return None, 0 + From f16e55f128c43347cbb8ae92a061e4575687fefe Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 4 Aug 2023 12:26:08 -0400 Subject: [PATCH 03/32] oapen fixes --- core/loaders/harvest.py | 8 +++++++ core/management/commands/fix_online_ebooks.py | 23 ++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index afe46af7..7d662622 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -134,6 +134,7 @@ DONT_HARVEST = [ ] def harvesters(ebook): + yield ebook.provider == 'OAPEN Library', harvest_oapen yield ebook.provider in GOOD_PROVIDERS, harvest_generic yield 'dropbox.com/s/' in ebook.url, 
harvest_dropbox yield ebook.provider == 'jbe-platform.com', harvest_jbe @@ -326,6 +327,13 @@ def harvest_generic(ebook): return set_bookshop(ebook) return make_dl_ebook(ebook.url, ebook) +def harvest_oapen(ebook): + if is_bookshop_url(ebook.url): + return set_bookshop(ebook) + if '/bitstream/' in ebook.url: + return make_dl_ebook(ebook.url, ebook, user_agent=settings.GOOGLEBOT_UA) + return None, 0 + def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT): doc = get_soup(ebook.url, user_agent=user_agent, follow_redirects=True) diff --git a/core/management/commands/fix_online_ebooks.py b/core/management/commands/fix_online_ebooks.py index 30fda49a..2f8a2103 100644 --- a/core/management/commands/fix_online_ebooks.py +++ b/core/management/commands/fix_online_ebooks.py @@ -2,29 +2,26 @@ from django.core.management.base import BaseCommand from regluit.core.loaders.doab_utils import online_to_download from regluit.core.models import Ebook -from regluit.core.models.loader import type_for_url class Command(BaseCommand): - help = "fix 'online' ebooks" + help = "deactivate dead oapen ebooks" args = "" def add_arguments(self, parser): - parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest") + parser.add_argument('limit', nargs='?', type=int, default=0, help="max to fix") def handle(self, limit=0, **options): limit = int(limit) if limit else 0 - onlines = Ebook.objects.filter(format='online', provider='SciELO') + onlines = Ebook.objects.filter(active=1, provider='OAPEN Library', + url__contains='/download/') done = 0 for online in onlines: - urls = online_to_download(online.url) - for url in urls: - online.format = type_for_url(url, force=True) - online.active = True - online.save() - done += 1 - self.stdout.write(online.edition.work.title) + online.active = False + online.save() + done += 1 + #self.stdout.write(online.edition.work.title) if done > limit: break self.stdout.write('fixed {} ebooks'.format(done)) - if done == 100: - 
self.stdout.write('100 is the maximum; repeat to do more') + if done >= 1000: + self.stdout.write('1000 is the maximum; repeat to do more') From 26edcaec14d9621f0c368b1bfcb306b22cd70435 Mon Sep 17 00:00:00 2001 From: eric Date: Wed, 13 Sep 2023 13:29:37 -0400 Subject: [PATCH 04/32] switch default donation to general --- frontend/templates/home.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/templates/home.html b/frontend/templates/home.html index 23e99fda..240ffb6f 100755 --- a/frontend/templates/home.html +++ b/frontend/templates/home.html @@ -203,9 +203,9 @@ function put_un_in_cookie2(){

Donate!

-
Please help support Unglue.it by making a tax-deductible donation to the Free Ebook Foundation. Donations are currently directed to our Monographs Matching Fund.
+
Please help support Unglue.it by making a tax-deductible donation to the Free Ebook Foundation.
- +
From be7bb61a5598836432dccbfdc7e1073c7446a967 Mon Sep 17 00:00:00 2001 From: eric Date: Mon, 27 Nov 2023 10:54:33 -0500 Subject: [PATCH 05/32] autoactivate springer --- core/management/commands/load_books_springer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/management/commands/load_books_springer.py b/core/management/commands/load_books_springer.py index 34bd3cbd..93980e9a 100644 --- a/core/management/commands/load_books_springer.py +++ b/core/management/commands/load_books_springer.py @@ -18,3 +18,15 @@ class Command(BaseCommand): else: books = load_springer(int(startpage), int(endpage)) self.stdout.write("loaded {} books".format(len(books))) + + for edition in books: + done_fmt = set() + for ebook in edition.work.ebooks_all(): + for fmt in ['pdf', 'epub', 'mobi']: + if ebook.format == fmt: + if fmt not in done_fmt: + ebook.activate() + done_fmt.add(fmt) + else: + ebook.deactivate() + From fe0a2b3a2ddc2499bffd77db2ea049dbb2daceee Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 8 Dec 2023 12:42:01 -0500 Subject: [PATCH 06/32] get journal.frontiersin.org to work --- core/loaders/harvest.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 7d662622..c5f63299 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -595,13 +595,9 @@ def harvest_frontiersin(ebook): rl.last.pop(ebook.provider, 0) return None, 0 - if ebook.provider == 'journal.frontiersin.org': - ebook, status = redirect_ebook(ebook) - if status < 1: - return None, -1 if status < 0 else 0 num = 0 harvested = None - doc = get_soup(ebook.url) + doc = get_soup(ebook.url, follow_redirects=True) if doc: for obj in doc.select('button[data-href]'): dl_url = obj['data-href'] From 78b63773a7cf545e4a089403ea0b4a5802b14fa7 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 8 Dec 2023 12:42:25 -0500 Subject: [PATCH 07/32] fix brill.com --- core/loaders/harvest.py | 16 +++++++++++----- 1 file changed, 11 
insertions(+), 5 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index c5f63299..dedeef0f 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -176,7 +176,7 @@ def harvesters(ebook): yield ebook.provider == 'content.sciendo.com', harvest_sciendo yield ebook.provider == 'edition-topoi.org', harvest_topoi yield ebook.provider == 'meson.press', harvest_meson - yield 'brillonline' in ebook.provider, harvest_brill + yield 'brill' in ebook.provider, harvest_brill yield ebook.provider == 'DOI Resolver', harvest_doi yield ebook.provider == 'apps.crossref.org', harvest_doi_coaccess yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab @@ -892,10 +892,16 @@ def harvest_meson(ebook): def harvest_brill(ebook): r = requests.get(ebook.url, headers={'User-Agent': settings.GOOGLEBOT_UA}) - if not r.url.startswith('https://brill.com/view/title/'): - return None, 0 - dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:] - return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + if r.url.startswith('https://brill.com/view/title/'): + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + elif r.url.startswith('https://brill.com/display/title/'): + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[32:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + elif r.url.startswith('https://brill.com/edcollbook-oa/title/'): + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[38:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) + return None, 0 def harvest_doi(ebook): # usually a 404. 
From 8dd4713bac861ec3e33b9ff014f12f6825294e90 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 8 Dec 2023 12:42:46 -0500 Subject: [PATCH 08/32] catch an exception in libroschile --- core/loaders/harvest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index dedeef0f..959662ca 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -964,7 +964,10 @@ def harvest_libroschile(ebook): if not guid: return None, 0 jsonurl = LIBROSJSON % (booknum, guid) - json = requests.get(jsonurl).json() + try: + json = requests.get(jsonurl).json() + except: + return None, 0 if not json: return None, 0 filename = json.get('downloads',{}).get('url', None) From f2e4fe6b292bcc73f6087204a481bd71243b718c Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 8 Dec 2023 12:43:05 -0500 Subject: [PATCH 09/32] add urldefense to STOREPROVIDERS --- core/loaders/doab_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/core/loaders/doab_utils.py b/core/loaders/doab_utils.py index ec1c2878..076b0a6c 100644 --- a/core/loaders/doab_utils.py +++ b/core/loaders/doab_utils.py @@ -93,6 +93,7 @@ STOREPROVIDERS = [ "una-editions.fr", "universitetsforlaget.no", 'usu.edu', + 'urldefense.com', "wbg-wissenverbindet.de", "zalozba.zrc-sazu.si", ] From a1385b3818bff5cefea63ff9e62f61fcc613aac5 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 14 Dec 2023 10:29:36 -0500 Subject: [PATCH 10/32] fix error with bad google id --- frontend/views/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/views/__init__.py b/frontend/views/__init__.py index 45062927..7b539bf1 100755 --- a/frontend/views/__init__.py +++ b/frontend/views/__init__.py @@ -559,7 +559,7 @@ def googlebooks(request, googlebooks_id): return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id) try: edition = bookloader.add_by_googlebooks_id(googlebooks_id) - if edition.new: + if edition and edition.new: # add related 
editions asynchronously tasks.populate_edition.delay(edition.isbn_13) if request.user.is_authenticated: From 031e305458cd7ef73f6b5b05d0f2bbc0d8c69864 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 14 Dec 2023 10:44:03 -0500 Subject: [PATCH 11/32] catch an integrity error --- frontend/views/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frontend/views/__init__.py b/frontend/views/__init__.py index 7b539bf1..34869ff1 100755 --- a/frontend/views/__init__.py +++ b/frontend/views/__init__.py @@ -33,6 +33,7 @@ from django.core.mail import EmailMessage from django.urls import reverse, reverse_lazy from django.core.validators import validate_email from django.db.models import Q, Count, Sum +from django.db.utils import IntegrityError from django.forms import Select from django.forms.models import inlineformset_factory from django.http import ( @@ -567,6 +568,10 @@ def googlebooks(request, googlebooks_id): except bookloader.LookupFailure: logger.warning("failed to load googlebooks_id %s" % googlebooks_id) return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id) + except IntegrityError: + logger.warning("duplicate (maybe) googlebooks_id %s" % googlebooks_id) + return HttpResponseNotFound("failed adding googlebooks id %s" % googlebooks_id) + if not edition: return HttpResponseNotFound("invalid googlebooks id") work_url = reverse('work', kwargs={'work_id': edition.work_id}) From 991357a6aec330baf8c92db9df924b001da5c0c6 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 15 Dec 2023 14:08:29 -0500 Subject: [PATCH 12/32] americana --- core/loaders/harvest.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 959662ca..762dcef9 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -142,8 +142,10 @@ def harvesters(ebook): yield OPENBOOKPUB.search(ebook.url), harvest_obp yield ebook.provider == 'Transcript-Verlag', 
harvest_transcript yield ebook.provider == 'ksp.kit.edu', harvest_ksp - yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis + yield ebook.provider in ['digitalis.uc.pt', 'repositorio.americana.edu.co'], harvest_dspace2 + yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2 yield ebook.provider == 'nomos-elibrary.de', harvest_nomos + yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis yield 'frontiersin.org' in ebook.provider, harvest_frontiersin yield ebook.provider in ['Palgrave Connect', 'Springer', 'springer.com'], harvest_springerlink yield ebook.provider == 'pulp.up.ac.za', harvest_pulp @@ -537,19 +539,12 @@ def harvest_ksp(ebook): return doc.select_one('p.linkForPDF a') return harvest_one_generic(ebook, selector) + def harvest_digitalis(ebook): - doc = get_soup(ebook.url) - if doc: - obj = doc.find('meta', attrs={"name": "citation_pdf_url"}) - if obj: - dl_url = urljoin(ebook.url, obj.get('content', None)) - if dl_url: - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for %s', ebook.url) - else: - logger.warning('couldn\'t get soup for %s', ebook.url) - return None, 0 + def selector(doc): + return doc.select_one('a.item-download-button') + return harvest_one_generic(ebook, selector) + NOMOSPDF = re.compile('download_full_pdf') def harvest_nomos(ebook): @@ -715,6 +710,22 @@ def harvest_dspace(ebook): return harvest_one_generic(ebook, selector) +def harvest_dspace2(ebook): + doc = get_soup(ebook.url) + if doc: + obj = doc.find('meta', attrs={"name": "citation_pdf_url"}) + if obj: + dl_url = urljoin(ebook.url, obj.get('content', None)) + if dl_url: + dl_url = dl_url.replace('http://', 'https://') + return make_dl_ebook(dl_url, ebook) + else: + logger.warning('couldn\'t get dl_url for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + + # won't harvest page-image books def harvest_unt(ebook): def selector(doc): From 
c303e26be799b5cf3ef199cf99ddc64f80b508f6 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 19 Dec 2023 16:34:41 -0500 Subject: [PATCH 13/32] OBP, cambridge, revert digitalis, add stores --- core/loaders/doab_utils.py | 2 ++ core/loaders/harvest.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/core/loaders/doab_utils.py b/core/loaders/doab_utils.py index 076b0a6c..ddfbb018 100644 --- a/core/loaders/doab_utils.py +++ b/core/loaders/doab_utils.py @@ -75,6 +75,7 @@ STOREPROVIDERS = [ 'e-elgar.com', "edicions.ub.edu", "epubli.de", + 'eurekaselect.com', "iospress.nl", "karolinum.cz", "librumstore.com", @@ -90,6 +91,7 @@ STOREPROVIDERS = [ "publicacions.ub.edu", "publicacions.urv.cat", 'sci.fo', + 'store.printservice.nl', "una-editions.fr", "universitetsforlaget.no", 'usu.edu', diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 762dcef9..b852d731 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -23,7 +23,6 @@ logger = logging.getLogger(__name__) DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"') DELAY = 1.0 -OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/book)/(\d+)') class RateLimiter(object): def __init__(self): @@ -139,10 +138,9 @@ def harvesters(ebook): yield 'dropbox.com/s/' in ebook.url, harvest_dropbox yield ebook.provider == 'jbe-platform.com', harvest_jbe yield ebook.provider == u'De Gruyter Online', harvest_degruyter - yield OPENBOOKPUB.search(ebook.url), harvest_obp + yield ebook.provider == 'Open Book Publishers', harvest_obp yield ebook.provider == 'Transcript-Verlag', harvest_transcript yield ebook.provider == 'ksp.kit.edu', harvest_ksp - yield ebook.provider in ['digitalis.uc.pt', 'repositorio.americana.edu.co'], harvest_dspace2 yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2 yield ebook.provider == 'nomos-elibrary.de', harvest_nomos yield ebook.provider == 'digitalis.uc.pt', 
harvest_digitalis @@ -415,6 +413,7 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 +OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/book|books)/(10\.11647/OBP\.\d+|\d+)') def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) @@ -431,6 +430,9 @@ def harvest_obp(ebook): booknum = OPENBOOKPUB.search(booknum).group(2) else: logger.warning('couldn\'t get soup for %s', prod_url) + elif match and match.group(2).startswith('10.'): + dl_url = 'https://books.openbookpublishers.com/' + match.group(2).lower() + '.pdf' + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) else: booknum = match.group(2) if not booknum: @@ -747,7 +749,10 @@ def harvest_mdpi(ebook): return harvest_one_generic(ebook, selector) -def harvest_idunn(ebook): +def harvest_idunn(ebook): + # if '/doi/book/' in ebook.url: + # url = ebook.url.replace('/book/', '/pdf/') + '?download=true' + # return make_dl_ebook(url, ebook)''' doc = get_soup(ebook.url) if doc: obj = doc.select_one('#accessinfo[data-product-id]') @@ -1108,6 +1113,14 @@ def harvest_cambridge(ebook): ebook, status = redirect_ebook(ebook) doc = get_soup(ebook.url) if doc: + obj = doc.find('a', string=re.compile('Full book PDF')) + if obj and obj['href']: + dl_url = urljoin(ebook.url, obj['href']) + return make_dl_ebook(dl_url, ebook) + obj = doc.find('meta', attrs={"name": re.compile("citation_pdf_url")}) + if obj and obj['content']: + dl_url = obj['content'] + return make_dl_ebook(dl_url, ebook) pdflinks = [] for obj in doc.select('a[data-pdf-content-id]'): if obj and obj['href']: From 2b8381aeb299dff6a461db67f414f24789054ec0 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 19 Dec 2023 17:07:13 -0500 Subject: [PATCH 14/32] Update harvest.py --- core/loaders/harvest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 
b852d731..9dce0fef 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -418,7 +418,9 @@ OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/b def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) booknum = None - if match and match.group(1) in ('product', 'reader'): + if not match: + return None, 0 + if match: and match.group(1) in ('product', 'reader'): prodnum = match.group(2) prod_url = 'https://www.openbookpublishers.com/product/{}'.format(prodnum) doc = get_soup(prod_url, settings.GOOGLEBOT_UA) From 9bea94098a2ea0b30a42c813c54f08a203a2664a Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 19 Dec 2023 17:09:20 -0500 Subject: [PATCH 15/32] fix after test --- core/loaders/harvest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 9dce0fef..2d912404 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -420,7 +420,7 @@ def harvest_obp(ebook): booknum = None if not match: return None, 0 - if match: and match.group(1) in ('product', 'reader'): + if match and match.group(1) in ('product', 'reader'): prodnum = match.group(2) prod_url = 'https://www.openbookpublishers.com/product/{}'.format(prodnum) doc = get_soup(prod_url, settings.GOOGLEBOT_UA) From e5b78a9db09ac577b4d77c2a94ef0840c3db2742 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 19 Dec 2023 20:39:40 -0500 Subject: [PATCH 16/32] tudelft, kit --- core/loaders/harvest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 2d912404..95cf68ff 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -123,6 +123,7 @@ CMPPROVIDERS = [ 'omp.zrc-sazu.si', 'openpress.mtsu.edu', 'teiresias-supplements.mcgill.ca', + 'textbooks.open.tudelft.nl', ] DONT_HARVEST = [ 'Unglue.it', @@ -200,6 +201,7 @@ def harvesters(ebook): yield ebook.provider == 'ressources.una-editions.fr', harvest_una yield ebook.provider == 
'wbg-wissenverbindet.de', harvest_wbg yield ebook.provider == 'urn.kb.se', harvest_kb + yield ebook.provider == 'publikationen.bibliothek.kit.edu', harvest_kit yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_istanbul yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana @@ -550,6 +552,12 @@ def harvest_digitalis(ebook): return harvest_one_generic(ebook, selector) +def harvest_kit(ebook): + def selector(doc): + return doc.select_one('a.downloadTextLink') + return harvest_one_generic(ebook, selector) + + NOMOSPDF = re.compile('download_full_pdf') def harvest_nomos(ebook): doc = get_soup(ebook.url, follow_redirects=True) From b0c5dcb8779a529fee75ef3bb98f6450d662bcd3 Mon Sep 17 00:00:00 2001 From: eric Date: Wed, 20 Dec 2023 07:44:30 -0500 Subject: [PATCH 17/32] funlam --- core/loaders/harvest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 95cf68ff..4c963372 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -183,6 +183,7 @@ def harvesters(ebook): yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab yield ebook.provider == 'libros.uchile.cl', harvest_libroschile yield ebook.provider == 'fupress.com', harvest_fupress + yield ebook.provider == 'funlam.edu.co', harvest_funlam yield ebook.provider == 'elibrary.duncker-humblot.com', harvest_dunckerhumblot yield ebook.provider == 'cornellopen.org', harvest_cornellopen yield ebook.provider == 'esv.info', harvest_esv @@ -1017,6 +1018,12 @@ def harvest_fupress(ebook): return None, 0 return harvest_one_generic(ebook, selector) +def harvest_funlam(ebook): + if '/modules/' in ebook.url: + set_bookshop(ebook) + return None, 0 + return make_dl_ebook(ebook.url, ebook) + def harvest_dunckerhumblot(ebook): def selector(doc): From 500d8ee7c4278a57cf16d340356282371e0d119b Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 21 Dec 2023 10:21:37 -0500 Subject: [PATCH 18/32] add facility for manual 
harvest --- core/loaders/harvest.py | 48 +++++++++++++++++++++++++++++++++++++--- core/models/bibmodels.py | 4 +++- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 4c963372..29b4bcdd 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -10,6 +10,7 @@ import requests from django.conf import settings from django.core.files.base import ContentFile +from django.core.files.storage import default_storage from regluit.core import models from regluit.core.models import loader @@ -109,6 +110,7 @@ def clean_archive(ebf): CMPPROVIDERS = [ 'books.open.tudelft.nl', 'ebooks.epublishing.ekt.gr', + 'ebooks.uminho.pt', 'editorial.inudi.edu.pe', 'editorial.ucatolicaluisamigo.edu.co', 'editorial.uniagustiniana.edu.co', @@ -132,10 +134,17 @@ DONT_HARVEST = [ 'Google Books', 'OpenEdition Books', ] +MANUAL_HARVEST = [ + 'cabidigitallibrary.org', + 'books.google.be', + 'books.google.ch', + 'books.google.nl', +] def harvesters(ebook): yield ebook.provider == 'OAPEN Library', harvest_oapen yield ebook.provider in GOOD_PROVIDERS, harvest_generic + yield ebook.provider in MANUAL_HARVEST, harvest_manual yield 'dropbox.com/s/' in ebook.url, harvest_dropbox yield ebook.provider == 'jbe-platform.com', harvest_jbe yield ebook.provider == u'De Gruyter Online', harvest_degruyter @@ -330,6 +339,39 @@ def harvest_generic(ebook): return set_bookshop(ebook) return make_dl_ebook(ebook.url, ebook) + +def harvest_manual(ebook): + def make_manual_ebf(format): + fname = f'mebf/{ebook.id}.{format}' + if default_storage.exists(fname): + filesize = default_storage.size(fname) + new_ebf = models.EbookFile.objects.create( + edition=ebook.edition, + format=format, + source=ebook.url, + ) + new_ebf.file.name = fname + harvested_ebook = models.Ebook.objects.create( + edition=ebook.edition, + format=format, + provider='Unglue.it', + url=new_ebf.file.url, + rights=ebook.rights, + filesize=filesize, + 
version_label=ebook.version_label, + version_iter=ebook.version_iter, + ) + new_ebf.ebook = harvested_ebook + new_ebf.save() + return new_ebf + else: + return None + pdf_ebf = make_manual_ebf('pdf') + epub_ebf = make_manual_ebf('epub') + + return pdf_ebf or epub_ebf, (1 if pdf_ebf else 0) + (1 if epub_ebf else 0) + + def harvest_oapen(ebook): if is_bookshop_url(ebook.url): return set_bookshop(ebook) @@ -761,9 +803,9 @@ def harvest_mdpi(ebook): def harvest_idunn(ebook): - # if '/doi/book/' in ebook.url: - # url = ebook.url.replace('/book/', '/pdf/') + '?download=true' - # return make_dl_ebook(url, ebook)''' + if '/doi/book/' in ebook.url: + return harvest_manual(ebook) + doc = get_soup(ebook.url) if doc: obj = doc.select_one('#accessinfo[data-product-id]') diff --git a/core/models/bibmodels.py b/core/models/bibmodels.py index 301cbb38..a53c3b38 100644 --- a/core/models/bibmodels.py +++ b/core/models/bibmodels.py @@ -1080,7 +1080,9 @@ def safe_get_work(work_id): return work def path_for_file(instance, filename): - return "ebf/{}.{}".format(uuid.uuid4().hex, instance.format) + if filename: + return f"mebf/{filename}" + return f"ebf/{uuid.uuid4().hex}.{instance.format}" class EbookFile(models.Model): file = models.FileField(upload_to=path_for_file) From 3307fe944171b04d27139b2061f103608d5b08ad Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 21 Dec 2023 11:02:13 -0500 Subject: [PATCH 19/32] iminho (improve cmp) --- core/loaders/harvest.py | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 29b4bcdd..269af957 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -404,7 +404,7 @@ def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT): def harvest_multiple_generic(ebook, selector, dl=lambda x:x): num = 0 harvested = None - doc = get_soup(ebook.url) + doc = get_soup(ebook.url, follow_redirects=True) if doc: found = [] try: @@ 
-743,18 +743,37 @@ def harvest_fahce(ebook): return doc.select_one('div.pub_format_single a[href]') return harvest_one_generic(ebook, selector) +def get_meta(doc, term): + obj = doc.find('meta', attrs={"name": term}) + if obj: + return obj.get('content', None) + else: + logger.warning(f'no meta for {term}') + + BAD_CERTS = {'libri.unimi.it', 'editorial.ucatolicaluisamigo.edu.co', 'openpress.mtsu.edu'} def harvest_cmp(ebook): def selector(doc): - objs = doc.select('.chapters a.cmp_download_link[href]') - if (len({obj['href'] for obj in objs})) > 1: - return [] - return doc.select('a.cmp_download_link[href]') + citation_pdf_url = get_meta(doc, "citation_pdf_url") + citation_epub_url = get_meta(doc, "citation_epub_url") + if citation_pdf_url or citation_epub_url: + if citation_pdf_url: + yield {'href': citation_pdf_url} + if citation_epub_url: + yield {'href': citation_epub_url} + else: + objs = doc.select('.chapters a.cmp_download_link[href]') + if (len({obj['href'] for obj in objs})) > 1: + return [] + return doc.select('a.cmp_download_link[href]') + def dl(url): return url.replace('view', 'download') + '?inline=1' + verify = ebook.provider not in BAD_CERTS if '/view/' in ebook.url: return make_dl_ebook(dl(ebook.url), ebook, verify=verify) + return harvest_multiple_generic(ebook, selector, dl=dl) @@ -764,16 +783,14 @@ def harvest_dspace(ebook): return doc.find(href=DSPACEPDF) return harvest_one_generic(ebook, selector) - def harvest_dspace2(ebook): doc = get_soup(ebook.url) if doc: - obj = doc.find('meta', attrs={"name": "citation_pdf_url"}) - if obj: - dl_url = urljoin(ebook.url, obj.get('content', None)) - if dl_url: - dl_url = dl_url.replace('http://', 'https://') - return make_dl_ebook(dl_url, ebook) + citation_pdf_url = get_meta(doc, "citation_pdf_url") + if citation_pdf_url: + dl_url = urljoin(ebook.url, citation_pdf_url) + dl_url = dl_url.replace('http://', 'https://') + return make_dl_ebook(dl_url, ebook) else: logger.warning('couldn\'t get dl_url for %s', 
ebook.url) else: From b229d3e0b336a6cbb76703a9599b7c609335b397 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 21 Dec 2023 12:11:11 -0500 Subject: [PATCH 20/32] add mr to doi coaccess --- core/loaders/harvest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 269af957..3efc73d9 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -188,7 +188,7 @@ def harvesters(ebook): yield ebook.provider == 'meson.press', harvest_meson yield 'brill' in ebook.provider, harvest_brill yield ebook.provider == 'DOI Resolver', harvest_doi - yield ebook.provider == 'apps.crossref.org', harvest_doi_coaccess + yield ebook.provider in ['apps.crossref.org', 'mr.crossref.org'], harvest_doi_coaccess yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab yield ebook.provider == 'libros.uchile.cl', harvest_libroschile yield ebook.provider == 'fupress.com', harvest_fupress From 5ca1c3c369b3067a40716edd381704fbdc060784 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:25:43 -0500 Subject: [PATCH 21/32] sciendo --- core/loaders/harvest.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 3efc73d9..18826f26 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -1,6 +1,7 @@ """ code for harvesting 'online' ebooks """ +import json import logging import re import time @@ -183,7 +184,7 @@ def harvesters(ebook): yield ebook.provider == 'laboutique.edpsciences.fr', harvest_edpsciences yield ebook.provider == 'waxmann.com', harvest_waxmann yield ebook.provider == 'pbsociety.org.pl', harvest_ojs - yield ebook.provider == 'content.sciendo.com', harvest_sciendo + yield 'sciendo.com' in ebook.provider, harvest_sciendo yield ebook.provider == 'edition-topoi.org', harvest_topoi yield ebook.provider == 'meson.press', harvest_meson yield 'brill' in ebook.provider, harvest_brill @@ -1278,3 +1279,22 @@ 
def harvest_istanbul(ebook): logger.warning('couldn\'t make ebook file for %s', ebook.url) return None, 0 + +def harvest_sciendo(ebook): + def selector(doc): + json_obj = doc.find('script', id='__NEXT_DATA__') + if json_obj: + try: + json_data = json.loads(json_obj.string) + pdf_url = json_data['props']['pageProps']['product']['pdfLink'] + epub_url = json_data['props']['pageProps']['product']['epubLink'] + if pdf_url or epub_url: + if pdf_url: + yield {'href': pdf_url} + if epub_url: + yield {'href': epub_url} + except json.JSONDecodeError as je: + logger.error(f'Bad json {je.msg}') + except KeyError as ke: + logger.error('No links in json for {ebook.url}') + return harvest_multiple_generic(ebook, selector) From d3e33da60e84bb9af620becc3c5114d0d91a6477 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:26:41 -0500 Subject: [PATCH 22/32] manu --- core/loaders/harvest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 18826f26..953bddd4 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -215,6 +215,7 @@ def harvesters(ebook): yield ebook.provider == 'publikationen.bibliothek.kit.edu', harvest_kit yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_istanbul yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana + yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu def ebf_if_harvested(url): onlines = models.EbookFile.objects.filter(source=url) @@ -1280,6 +1281,21 @@ def harvest_istanbul(ebook): return None, 0 +def harvest_manu(ebook): + def chap_selector(doc): + return doc.select('div.content-box-body div.book-toc a.c-Button--link[href*="/display/"]') + def dl(url): + return url.replace('/display/', '/downloadpdf/').replace('.xml', '.pdf') + doc = get_soup(ebook.url, follow_redirects=True, user_agent=settings.CHROME_UA) + if doc: + obj = doc.find('a', string=re.compile(r"Open Access")) + if not obj 
or 'href' not in obj.attrs: + return None, 0 + ebook.url = urljoin(ebook.url, obj['href']) + return harvest_stapled_generic(ebook, lambda x: None, chap_selector, + user_agent=settings.CHROME_UA, dl=dl) + return None, 0 + def harvest_sciendo(ebook): def selector(doc): json_obj = doc.find('script', id='__NEXT_DATA__') From 2ac96406b35a076b1a2a972d09cddcbc6b757e06 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:27:31 -0500 Subject: [PATCH 23/32] reversion --- core/models/bibmodels.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/models/bibmodels.py b/core/models/bibmodels.py index a53c3b38..60917686 100644 --- a/core/models/bibmodels.py +++ b/core/models/bibmodels.py @@ -1080,8 +1080,6 @@ def safe_get_work(work_id): return work def path_for_file(instance, filename): - if filename: - return f"mebf/{filename}" return f"ebf/{uuid.uuid4().hex}.{instance.format}" class EbookFile(models.Model): From b3abb6952749b2c6af822c81ecdf589b55f1b33c Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:28:05 -0500 Subject: [PATCH 24/32] store providers --- core/loaders/doab_utils.py | 62 ++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/core/loaders/doab_utils.py b/core/loaders/doab_utils.py index ddfbb018..bc521ab7 100644 --- a/core/loaders/doab_utils.py +++ b/core/loaders/doab_utils.py @@ -59,45 +59,47 @@ doab_reader = MetadataReader( STOREPROVIDERS = [ '7switch.com', - "amazon.ca", - "amazon.co.uk", - "amazon.com", - "amazon.de", - "amzn.to", - "apress.com", + 'amazon.ca', + 'amazon.co.uk', + 'amazon.com', + 'amazon.de', + 'amzn.to', + 'apress.com', 'bloomsbury.com', - "bod.de", - "cabi.org", - "cdcshoppingcart.uchicago.edu", - "checkout.sas.ac.uk", + 'bod.de', + 'cabi.org', + 'cdcshoppingcart.uchicago.edu', + 'checkout.sas.ac.uk', 'duncker-humblot.de', - "dykinson.com", + 'dykinson.com', 'e-elgar.com', - "edicions.ub.edu", - "epubli.de", + 'edicions.ub.edu', + 'epubli.de', 'eurekaselect.com', - 
"iospress.nl", - "karolinum.cz", - "librumstore.com", - "logos-verlag.de", - "mitpress.mit.edu", - "munishop.muni.cz", - "nomos-shop.de", - "palgrave.com", + 'global.oup.com', + 'iospress.nl', + 'karolinum.cz', + 'librumstore.com', + 'logos-verlag.de', + 'mitpress.mit.edu', + 'munishop.muni.cz', + 'nomos-shop.de', + 'palgrave.com', 'placedeslibraires.fr', - "play.google.com", - "press.umich.edu", - "pressesuniversitairesdeliege.be", - "publicacions.ub.edu", - "publicacions.urv.cat", + 'play.google.com', + 'press.umich.edu', + 'pressesuniversitairesdeliege.be', + 'publicacions.ub.edu', + 'publicacions.urv.cat', 'sci.fo', + 'schueren-verlag.de', 'store.printservice.nl', - "una-editions.fr", - "universitetsforlaget.no", + 'una-editions.fr', + 'universitetsforlaget.no', 'usu.edu', 'urldefense.com', - "wbg-wissenverbindet.de", - "zalozba.zrc-sazu.si", + 'wbg-wissenverbindet.de', + 'zalozba.zrc-sazu.si', ] def online_to_download(url): From 516d2b189688295eb4aaef8193b78b2a830d7f10 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:28:44 -0500 Subject: [PATCH 25/32] budrich --- core/loaders/harvest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 953bddd4..6ec94d20 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -151,6 +151,7 @@ def harvesters(ebook): yield ebook.provider == u'De Gruyter Online', harvest_degruyter yield ebook.provider == 'Open Book Publishers', harvest_obp yield ebook.provider == 'Transcript-Verlag', harvest_transcript + yield ebook.provider == 'shop.budrich.de', harvest_budrich yield ebook.provider == 'ksp.kit.edu', harvest_ksp yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2 yield ebook.provider == 'nomos-elibrary.de', harvest_nomos @@ -603,6 +604,12 @@ def harvest_kit(ebook): return harvest_one_generic(ebook, selector) +def harvest_budrich(ebook): + def selector(doc): + return doc.select_one('a.download_pdf') + return 
harvest_one_generic(ebook, selector) + + NOMOSPDF = re.compile('download_full_pdf') def harvest_nomos(ebook): doc = get_soup(ebook.url, follow_redirects=True) From 7fdbe6eee606894ca39667ee45c1800abbac3bdb Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:28:56 -0500 Subject: [PATCH 26/32] remove old sciendo --- core/loaders/harvest.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 6ec94d20..5c45c64d 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -966,12 +966,6 @@ def harvest_ojs(ebook): return harvest_multiple_generic(ebook, selector, dl=dl) -def harvest_sciendo(ebook): - def selector(doc): - return doc.select_one('a[title=PDF]') - return harvest_one_generic(ebook, selector, user_agent=settings.GOOGLEBOT_UA) - - def harvest_topoi(ebook): def selector(doc): return doc.select_one('li.pdf a[href]') From 43f31332c208e331e93242bbea47aa6c245508a7 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:29:17 -0500 Subject: [PATCH 27/32] figshare --- core/loaders/harvest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 5c45c64d..0a176ffa 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -193,6 +193,7 @@ def harvesters(ebook): yield ebook.provider in ['apps.crossref.org', 'mr.crossref.org'], harvest_doi_coaccess yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab yield ebook.provider == 'libros.uchile.cl', harvest_libroschile + yield ebook.provider == 'smithsonian.figshare.com', harvest_figshare yield ebook.provider == 'fupress.com', harvest_fupress yield ebook.provider == 'funlam.edu.co', harvest_funlam yield ebook.provider == 'elibrary.duncker-humblot.com', harvest_dunckerhumblot @@ -1072,6 +1073,12 @@ def harvest_ipsflab(ebook): return harvest_multiple_generic(ebook, selector) +def harvest_figshare(ebook): + def selector(doc): + return doc.find('a', 
href=re.compile(r'/ndownloader/')) + return harvest_one_generic(ebook, selector) + + def harvest_fupress(ebook): def selector(doc): return doc.select_one('#ctl00_contenuto_pdf a.btn-open[href]') From 376141b8f6fcb2c2d186a4cbdd8165711e22e22d Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:29:35 -0500 Subject: [PATCH 28/32] rti --- core/loaders/harvest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 0a176ffa..b42876d4 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -913,7 +913,9 @@ def harvest_mprl(ebook): def harvest_rti(ebook): - return make_dl_ebook(ebook.url + "/fulltext.pdf", ebook) + def selector(doc): + return doc.find('a', href=re.compile('fulltext.pdf')) + return harvest_one_generic(ebook, selector) def harvest_unibas(ebook): From b7aac2710ca9757caa5dacd5a31615dd60d20b0b Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:29:49 -0500 Subject: [PATCH 29/32] edp --- core/loaders/harvest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index b42876d4..ec7b8a79 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -946,6 +946,8 @@ def harvest_pensoft(ebook): def harvest_edp(ebook): def selector(doc): return doc.select_one('a.fulldl[href]') + if ebook.url.endswith('.pdf'): + return harvest_generic(ebook) return harvest_one_generic(ebook, selector) From 64752eb72d838b520cad8a149284c4685e54c0ba Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:30:17 -0500 Subject: [PATCH 30/32] gta --- core/loaders/harvest.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index ec7b8a79..4df0852d 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -217,6 +217,7 @@ def harvesters(ebook): yield ebook.provider == 'publikationen.bibliothek.kit.edu', harvest_kit yield ebook.provider == 
'iupress.istanbul.edu.tr', harvest_istanbul yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana + yield ebook.provider == 'verlag.gta.arch.ethz.ch', harvest_gta yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu def ebf_if_harvested(url): @@ -1292,6 +1293,28 @@ def harvest_istanbul(ebook): logger.warning('couldn\'t make ebook file for %s', ebook.url) return None, 0 +def harvest_gta(ebook): + # https://verlag.gta.arch.ethz.ch/en/gta:book_978-3-85676-393-0 + pos = ebook.url.find('_') + if pos < 1: + return None, 0 + isbn = ebook.url[pos + 1:] + api_host = 'https://api.verlag.gta.arch.ethz.ch' + json_url = f'{api_host}/api/v1/graphs/gta/data/gtaapi:PublicRetrieveBook/gta:book_{isbn}/' + r = requests.get(json_url) + if r.status_code == 200: + try: + file_url = None + graph = r.json()['@graph'] + for obj in graph: + if "gtaapi:file_url" in obj: + file_url = obj["gtaapi:file_url"] + break + if file_url: + return make_dl_ebook(file_url, ebook) + except IndexError: + logger.error('no item_file for %s', ebook.url) + return None, 0 def harvest_manu(ebook): def chap_selector(doc): From bc20dca5774cdc187573e2a0cbab7a2a327b3e55 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:30:32 -0500 Subject: [PATCH 31/32] unap --- core/loaders/harvest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 4df0852d..1b90da6a 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -115,6 +115,7 @@ CMPPROVIDERS = [ 'editorial.inudi.edu.pe', 'editorial.ucatolicaluisamigo.edu.co', 'editorial.uniagustiniana.edu.co', + 'fcjp.derecho.unap.edu.pe', 'fedoabooks.unina.it', 'humanities-digital-library.org', 'idicap.com', From c561d915f0c896dcb362c2b5bc9b1e7768fc2aa4 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 22 Dec 2023 11:36:19 -0500 Subject: [PATCH 32/32] delint --- core/loaders/harvest.py | 143 +++++++++++++++++++++++++++------------- 1 file changed, 96 
insertions(+), 47 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 1b90da6a..8100fb1a 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -41,6 +41,7 @@ class RateLimiter(object): rl = RateLimiter() + def set_bookshop(ebook): ebook.format = 'bookshop' ebook.save() @@ -63,6 +64,7 @@ def dl_online(ebook, limiter=rl.delay, format='online', force=False): return harvester(ebook) return None, 0 + def archive_dl(ebook, limiter=rl.delay, force=False): """ status codes 0 : archive exists @@ -88,6 +90,7 @@ def archive_dl(ebook, limiter=rl.delay, force=False): status = -1 return status + def clean_archive(ebf): fsize = ebf.ebook.filesize ebook = ebf.ebook @@ -143,6 +146,7 @@ MANUAL_HARVEST = [ 'books.google.nl', ] + def harvesters(ebook): yield ebook.provider == 'OAPEN Library', harvest_oapen yield ebook.provider in GOOD_PROVIDERS, harvest_generic @@ -188,11 +192,11 @@ def harvesters(ebook): yield ebook.provider == 'pbsociety.org.pl', harvest_ojs yield 'sciendo.com' in ebook.provider, harvest_sciendo yield ebook.provider == 'edition-topoi.org', harvest_topoi - yield ebook.provider == 'meson.press', harvest_meson + yield ebook.provider == 'meson.press', harvest_meson yield 'brill' in ebook.provider, harvest_brill yield ebook.provider == 'DOI Resolver', harvest_doi yield ebook.provider in ['apps.crossref.org', 'mr.crossref.org'], harvest_doi_coaccess - yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab + yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab yield ebook.provider == 'libros.uchile.cl', harvest_libroschile yield ebook.provider == 'smithsonian.figshare.com', harvest_figshare yield ebook.provider == 'fupress.com', harvest_fupress @@ -221,6 +225,7 @@ def harvesters(ebook): yield ebook.provider == 'verlag.gta.arch.ethz.ch', harvest_gta yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu + def ebf_if_harvested(url): onlines = models.EbookFile.objects.filter(source=url) if 
onlines.exists(): @@ -249,7 +254,6 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET', ver logger.info("Previously harvested") return new_ebf, 0 - dl_cf, fmt = loader.load_ebookfile(url, ebook.format, user_agent=user_agent, method=method, verify=verify) if dl_cf: @@ -258,6 +262,7 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET', ver logger.warning('download format %s for %s is not ebook', ebook.format, url) return None, 0 + def redirect_ebook(ebook): """ returns an ebook and status : -3 : bad return code or problem @@ -265,7 +270,7 @@ def redirect_ebook(ebook): -2 : dead, but we need to keep items 0 : replaced with existing 1 : url updated - + """ try: r = requests.head(ebook.url, allow_redirects=True) @@ -273,7 +278,7 @@ def redirect_ebook(ebook): logger.error("Connection refused for %s", url) logger.error(e) return ebook, -3 - + if r.status_code == 404: if not models.Ebook.ebook_files.exists(): logger.info('deleting ebook for dead url', ebook.url) @@ -291,12 +296,14 @@ def redirect_ebook(ebook): logger.error("status code %s for %s", r.status_code, ebook.url) return ebook, -3 + def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False): pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers) if not pdffile: return None, 0 return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf') + def make_harvested_ebook(content, ebook, format, filesize=0): if not filesize: filesize = len(content) @@ -328,11 +335,12 @@ def make_harvested_ebook(content, ebook, format, filesize=0): ebook.filesize = filesize if filesize < 2147483647 else 2147483647 ebook.save() harvested_ebook = ebook - + new_ebf.ebook = harvested_ebook new_ebf.save() return new_ebf, 1 + def is_bookshop_url(url): if '/prodotto/' in url: return True @@ -340,9 +348,10 @@ def is_bookshop_url(url): return True return False + def harvest_generic(ebook): if is_bookshop_url(ebook.url): - return 
set_bookshop(ebook) + return set_bookshop(ebook) return make_dl_ebook(ebook.url, ebook) @@ -380,7 +389,7 @@ def harvest_manual(ebook): def harvest_oapen(ebook): if is_bookshop_url(ebook.url): - return set_bookshop(ebook) + return set_bookshop(ebook) if '/bitstream/' in ebook.url: return make_dl_ebook(ebook.url, ebook, user_agent=settings.GOOGLEBOT_UA) return None, 0 @@ -440,7 +449,7 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, except: base = ebook.url made = None - + # check for complete ebook if selector: obj = selector(doc) @@ -464,9 +473,10 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/book|books)/(10\.11647/OBP\.\d+|\d+)') -def harvest_obp(ebook): +def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) booknum = None if not match: @@ -495,6 +505,7 @@ def harvest_obp(ebook): made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST') return made + DEGRUYTERFULL = re.compile(r'/downloadpdf/title/.*') DEGRUYTERCHAP = re.compile(r'/downloadpdf/book/.*') COMPLETE = re.compile(r'complete ebook', flags=re.I) @@ -519,7 +530,7 @@ def harvest_degruyter(ebook): if obj: dl_url = urljoin(base, obj['href']) harvested, made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) - + # check for pdf obj = doc.select_one('a.downloadPdf') if obj: @@ -551,6 +562,7 @@ def harvest_degruyter(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_dropbox(ebook): if ebook.url.find(u'dl=0') >= 0: dl_url = ebook.url.replace(u'dl=0', u'dl=1') @@ -567,14 +579,16 @@ def harvest_dropbox(ebook): logger.warning('couldn\'t get %s', ebook.url) else: logger.warning('couldn\'t get dl for %s', ebook.url) - return None, 0 - -def harvest_jbe(ebook): + return None, 0 + + +def harvest_jbe(ebook): def 
selector(doc): return doc.select('div.access-options a[href]') return harvest_multiple_generic(ebook, selector) -def harvest_transcript(ebook): + +def harvest_transcript(ebook): num = 0 harvested = None doc = get_soup(ebook.url) @@ -589,32 +603,33 @@ def harvest_transcript(ebook): logger.warning('couldn\'t get any dl_url for %s', ebook.url) return harvested, num -def harvest_ksp(ebook): + +def harvest_ksp(ebook): def selector(doc): return doc.select_one('p.linkForPDF a') return harvest_one_generic(ebook, selector) -def harvest_digitalis(ebook): +def harvest_digitalis(ebook): def selector(doc): return doc.select_one('a.item-download-button') return harvest_one_generic(ebook, selector) -def harvest_kit(ebook): +def harvest_kit(ebook): def selector(doc): return doc.select_one('a.downloadTextLink') return harvest_one_generic(ebook, selector) -def harvest_budrich(ebook): +def harvest_budrich(ebook): def selector(doc): return doc.select_one('a.download_pdf') return harvest_one_generic(ebook, selector) NOMOSPDF = re.compile('download_full_pdf') -def harvest_nomos(ebook): +def harvest_nomos(ebook): doc = get_soup(ebook.url, follow_redirects=True) try: base = doc.find('base')['href'] @@ -631,7 +646,7 @@ def harvest_nomos(ebook): # staple the chapters chaps = doc.select('li.access[data-doi]') - + pdflinks = [] for chap in chaps: link = urljoin( @@ -651,12 +666,13 @@ def harvest_nomos(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 -def harvest_frontiersin(ebook): + +def harvest_frontiersin(ebook): if 'GetFile.aspx' in ebook.url: ebook.delete() rl.last.pop(ebook.provider, 0) return None, 0 - + num = 0 harvested = None doc = get_soup(ebook.url, follow_redirects=True) @@ -673,9 +689,10 @@ def harvest_frontiersin(ebook): logger.warning('couldn\'t get any dl_url for %s', ebook.url) return harvested, num + SPRINGERDL = re.compile(r'(EPUB|PDF)') -def harvest_springerlink(ebook): +def harvest_springerlink(ebook): def selector(doc): return 
doc.find_all('a', title=SPRINGERDL) if ebook.provider == "springer.com": @@ -738,6 +755,7 @@ def harvest_bloomsbury(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_athabasca(ebook): def selector(doc): return doc.select_one('li.downloadPDF a[href]') @@ -755,6 +773,7 @@ def harvest_fahce(ebook): return doc.select_one('div.pub_format_single a[href]') return harvest_one_generic(ebook, selector) + def get_meta(doc, term): obj = doc.find('meta', attrs={"name": term}) if obj: @@ -795,7 +814,8 @@ def harvest_dspace(ebook): return doc.find(href=DSPACEPDF) return harvest_one_generic(ebook, selector) -def harvest_dspace2(ebook): + +def harvest_dspace2(ebook): doc = get_soup(ebook.url) if doc: citation_pdf_url = get_meta(doc, "citation_pdf_url") @@ -864,13 +884,14 @@ def harvest_muse(ebook): return doc.find_all('a', href=re.compile(r'/chapter/\d+/pdf')) return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=1) + def harvest_mitpress(ebook): def chap_selector(doc): return doc.select('a.section-pdfLink[href]') return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=0) -def harvest_ios(ebook): +def harvest_ios(ebook): booknum = None doc = get_soup(ebook.url) if doc: @@ -908,7 +929,8 @@ def harvest_wsp(ebook): return make_dl_ebook(url, ebook, user_agent=settings.CHROME_UA) return None, 0 -def harvest_mprl(ebook): + +def harvest_mprl(ebook): def selector(doc): return doc.select('a.ml-20[href]') return harvest_multiple_generic(ebook, selector) @@ -925,6 +947,7 @@ def harvest_unibas(ebook): return doc.select_one('a.ep_document_link[href]') return harvest_one_generic(ebook, selector) + PENSOFT = re.compile(r'/book/(\d+)/list/') def harvest_pensoft(ebook): if ebook.id == 263395: @@ -961,7 +984,7 @@ def harvest_edpsciences(ebook): def harvest_waxmann(ebook): if ebook.url.startswith('https://www.waxmann.com/buch'): - return make_dl_ebook(ebook.url.replace('buch', 'index.php?eID=download&buchnr='), ebook) + 
return make_dl_ebook(ebook.url.replace('buch', 'index.php?eID=download&buchnr='), ebook) return None, 0 @@ -973,13 +996,13 @@ def harvest_ojs(ebook): return harvest_multiple_generic(ebook, selector, dl=dl) -def harvest_topoi(ebook): +def harvest_topoi(ebook): def selector(doc): return doc.select_one('li.pdf a[href]') return harvest_one_generic(ebook, selector) -def harvest_meson(ebook): +def harvest_meson(ebook): def selector(doc): for btn in doc.select('a[href] btn.btn-openaccess'): yield btn.parent @@ -998,7 +1021,8 @@ def harvest_brill(ebook): dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[38:] return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) return None, 0 - + + def harvest_doi(ebook): # usually a 404. ebook, status = redirect_ebook(ebook) @@ -1006,6 +1030,7 @@ def harvest_doi(ebook): return None, -1 return None, 0 + def harvest_doi_coaccess(ebook): # make a new ebook for the "main pub" and ignore the "related pub" if ebook.url.startswith('https://doi.org/'): @@ -1036,7 +1061,8 @@ def harvest_doi_coaccess(ebook): set_bookshop(ebook) if format in DOWNLOADABLE: return make_dl_ebook(url, ebook) - return None, 0 + return None, 0 + GUID = re.compile(r'FBInit\.GUID = \"([0-9a-z]+)\"') LIBROSID = re.compile(r'(\d+)$') @@ -1070,22 +1096,22 @@ def harvest_libroschile(ebook): if not filename: return None, 0 pdfurl = LIBRODPDF % (booknum, filename, guid) - return make_dl_ebook(pdfurl, ebook) + return make_dl_ebook(pdfurl, ebook) -def harvest_ipsflab(ebook): +def harvest_ipsflab(ebook): def selector(doc): return doc.find_all('a', href=re.compile(r'/system/files/ispf_lab/quaderni/.*\.(pdf|epub)')) return harvest_multiple_generic(ebook, selector) -def harvest_figshare(ebook): +def harvest_figshare(ebook): def selector(doc): return doc.find('a', href=re.compile(r'/ndownloader/')) return harvest_one_generic(ebook, selector) -def harvest_fupress(ebook): +def harvest_fupress(ebook): def selector(doc): return 
doc.select_one('#ctl00_contenuto_pdf a.btn-open[href]') if 'isbn' in ebook.url: @@ -1093,24 +1119,26 @@ def harvest_fupress(ebook): return None, 0 return harvest_one_generic(ebook, selector) -def harvest_funlam(ebook): + +def harvest_funlam(ebook): if '/modules/' in ebook.url: set_bookshop(ebook) return None, 0 return make_dl_ebook(ebook.url, ebook) -def harvest_dunckerhumblot(ebook): +def harvest_dunckerhumblot(ebook): def selector(doc): return doc.select_one('div.section__buttons a[href$="download"]') return harvest_one_generic(ebook, selector) -def harvest_cornellopen(ebook): +def harvest_cornellopen(ebook): def selector(doc): return doc.select('div.sp-product__buy-btn-container li a[href]') return harvest_multiple_generic(ebook, selector) + def harvest_editorialbonaventuriana(ebook): def selector(doc): return doc.select_one('div.djc_fulltext p a[href$=".pdf"]') @@ -1129,17 +1157,20 @@ def harvest_esv(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 -def harvest_fulcrum(ebook): + +def harvest_fulcrum(ebook): def selector(doc): return doc.select('ul.monograph-catalog-rep-downloads a[href]') return harvest_multiple_generic(ebook, selector) -def harvest_ubiquity(ebook): + +def harvest_ubiquity(ebook): def selector(doc): return doc.find_all('a', attrs={'data-category': re.compile('(epub|pdf) download')}) return harvest_multiple_generic(ebook, selector) -def harvest_orkana(ebook): + +def harvest_orkana(ebook): def selector(doc): for obj in doc.find_all('p', string=re.compile(r'\((PDF|E-BOK)\)')): div = obj.find_parent('div') @@ -1147,12 +1178,14 @@ def harvest_orkana(ebook): yield div.find_next_sibling('div').find('a') return harvest_multiple_generic(ebook, selector) + def harvest_euna(ebook): if '/view/' in ebook.url: return make_dl_ebook(ebook.url.replace('view', 'download'), ebook) set_bookshop(ebook) return None, 0 + def harvest_orl(ebook): if ebook.url.startswith('https://openresearchlibrary.org/viewer/'): orl_id = ebook.url[39:] @@ 
-1161,16 +1194,19 @@ def harvest_orl(ebook): ebook) return None, 0 + def harvest_pressesagro(ebook): def selector(doc): return doc.select_one('#sidebar ul li span a[href]') return harvest_one_generic(ebook, selector) + def harvest_buponline(ebook): def selector(doc): return doc.find('a', string=DOWNLOAD) return harvest_one_generic(ebook, selector) + INTECH = re.compile(r'\.intechopen\.com/books/(\d+)$') def harvest_intech(ebook): booknum = INTECH.search(ebook.url) @@ -1179,16 +1215,19 @@ def harvest_intech(ebook): return make_dl_ebook(url, ebook) return None, 0 + def harvest_usmcu(ebook): def selector(doc): return doc.find('a', string='PDF download') return harvest_one_generic(ebook, selector) + def harvest_upv(ebook): def selector(doc): return doc.select_one('a.descargar[href]') return harvest_one_generic(ebook, selector) + def harvest_una_editions(ebook): doc = get_soup(ebook.url) if doc: @@ -1201,6 +1240,7 @@ def harvest_una_editions(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_cambridge(ebook): ebook, status = redirect_ebook(ebook) doc = get_soup(ebook.url) @@ -1229,6 +1269,7 @@ def harvest_cambridge(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_exon(ebook): doc = get_soup(ebook.url) if doc: @@ -1248,11 +1289,13 @@ def harvest_exon(ebook): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_una(ebook): def selector(doc): return doc.select_one('#header-primary-action a[href]') return harvest_one_generic(ebook, selector) + def harvest_wbg(ebook): ''' most of these are archived under files.wbg-wissenverbindet.de ''' doc = get_soup(ebook.url) @@ -1264,11 +1307,13 @@ def harvest_wbg(ebook): return make_dl_ebook(url, ebook) return None, 0 + def harvest_kb(ebook): def selector(doc): return doc.select_one('a[title=fulltext][href]') return harvest_one_generic(ebook, selector) + def harvest_istanbul(ebook): def cdn_url(soup): objs = 
soup.find_all('a', href=re.compile(r'cdn\.istanbul')) @@ -1285,7 +1330,7 @@ def harvest_istanbul(ebook): if chap_doc: for content_url in cdn_url(chap_doc): yield content_url - + # staple the chapters stapled = make_stapled_ebook(pdf_urls(ebook), ebook, user_agent=settings.GOOGLEBOT_UA) if stapled: @@ -1294,6 +1339,7 @@ def harvest_istanbul(ebook): logger.warning('couldn\'t make ebook file for %s', ebook.url) return None, 0 + def harvest_gta(ebook): # https://verlag.gta.arch.ethz.ch/en/gta:book_978-3-85676-393-0 pos = ebook.url.find('_') @@ -1306,17 +1352,18 @@ def harvest_gta(ebook): if r.status_code == 200: try: file_url = None - graph = r.json()['@graph'] + graph = r.json()['@graph'] for obj in graph: if "gtaapi:file_url" in obj: file_url = obj["gtaapi:file_url"] break - if file_url: + if file_url: return make_dl_ebook(file_url, ebook) except IndexError: logger.error('no item_file for %s', ebook.url) return None, 0 + def harvest_manu(ebook): def chap_selector(doc): return doc.select('div.content-box-body div.book-toc a.c-Button--link[href*="/display/"]') @@ -1328,11 +1375,12 @@ def harvest_manu(ebook): if not obj or 'href' not in obj.attrs: return None, 0 ebook.url = urljoin(ebook.url, obj['href']) - return harvest_stapled_generic(ebook, lambda x: None, chap_selector, + return harvest_stapled_generic(ebook, lambda x: None, chap_selector, user_agent=settings.CHROME_UA, dl=dl) return None, 0 -def harvest_sciendo(ebook): + +def harvest_sciendo(ebook): def selector(doc): json_obj = doc.find('script', id='__NEXT_DATA__') if json_obj: @@ -1350,3 +1398,4 @@ def harvest_sciendo(ebook): except KeyError as ke: logger.error('No links in json for {ebook.url}') return harvest_multiple_generic(ebook, selector) +