From 9c9102cb2d5ed2933e41ca62e2f4f49635f40725 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 14:45:41 -0400 Subject: [PATCH 1/3] refactor simple harvests --- core/loaders/harvest.py | 82 +++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 48 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 684c4842..ad43e664 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -149,6 +149,27 @@ def make_harvested_ebook(content, ebook, format, filesize=0): new_ebf.save() return new_ebf, 1 + +def harvest_one_generic(ebook, selector): + doc = get_soup(ebook.url) + if doc: + try: + base = doc.find('base')['href'] + except: + base = ebook.url + obj = selector(doc) + if obj: + dl_url = urljoin(base, obj['href']) + harvest = make_dl_ebook(dl_url, ebook) + if not harvest[0]: + logger.warning('couldn\'t harvest %s', dl_url) + return harvest + else: + logger.warning('couldn\'t get dl_url for %s', ebook.url) + else: + logger.warning('couldn\'t get soup for %s', ebook.url) + return None, 0 + def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) booknum = None @@ -237,17 +258,9 @@ def harvest_dropbox(ebook): return None, 0 def harvest_jbe(ebook): - doc = get_soup(ebook.url) - if doc: - obj = doc.select_one('div.pdfItem a') - if obj: - dl_url = urljoin(ebook.url, obj['href']) - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for %s', ebook.url) - else: - logger.warning('couldn\'t get soup for %s', ebook.url) - return None, 0 + def selector(doc): + return doc.select_one('div.pdfItem a') + return harvest_one_generic(ebook, selector) def harvest_transcript(ebook): num = 0 @@ -265,17 +278,9 @@ def harvest_transcript(ebook): return harvested, num def harvest_ksp(ebook): - doc = get_soup(ebook.url) - if doc: - obj = doc.select_one('p.linkForPDF a') - if obj: - dl_url = urljoin(ebook.url, obj['href']) - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for %s', ebook.url) - else: - logger.warning('couldn\'t get soup for %s', ebook.url) - return None, 0 + def selector(doc): + return doc.select_one('p.linkForPDF a') + return harvest_one_generic(ebook, selector) def harvest_digitalis(ebook): doc = get_soup(ebook.url) @@ -440,32 +445,13 @@ def harvest_bloomsbury(ebook): return None, 0 def harvest_athabasca(ebook): - doc = get_soup(ebook.url) - if doc: - try: - base = doc.find('base')['href'] - except: - base = ebook.url - obj = doc.select_one('li.downloadPDF a[href]') - if obj: - dl_url = urljoin(base, obj['href']) - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for %s', base) - else: - logger.warning('couldn\'t get soup for %s', ebook.url) - return None, 0 + def selector(doc): + return doc.select_one('li.downloadPDF a[href]') + return harvest_one_generic(ebook, selector) def harvest_usu(ebook): - doc = get_soup(ebook.url) - if doc: - obj = doc.select_one('#full-text a[href]') - if obj: - dl_url = urljoin(ebook.url, obj['href']) - return make_dl_ebook(dl_url, ebook) - else: - logger.warning('couldn\'t get dl_url for %s', ebook.url) - else: - logger.warning('couldn\'t get soup for %s', ebook.url) - return None, 0 + def selector(doc): + return doc.select_one('#full-text a[href]') + return harvest_one_generic(ebook, selector) + From 9de38329652989967baef846ff54ccc978440cc1 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 14:46:43 -0400 Subject: [PATCH 2/3] refactor multiple ebook harvests --- core/loaders/harvest.py | 43 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index ad43e664..e08127b5 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -170,6 +170,29 @@ def harvest_one_generic(ebook, selector): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 +def harvest_multiple_generic(ebook, selector, dl=lambda x:x): + num = 0 + harvested = None + doc = get_soup(ebook.url) + if doc: + found = [] + try: + base = doc.find('base')['href'] + except: + base = ebook.url + for obj in selector(doc): + dl_url = dl(urljoin(base, obj.get('href'))) + logger.info(dl_url) + if dl_url in found: + continue + else: + found.append(dl_url) + harvested, made = make_dl_ebook(dl_url, ebook) + num += made + if num == 0: + logger.warning('couldn\'t get any dl_url for %s', ebook.url) + return harvested, num + def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) booknum = None @@ -349,23 +372,9 @@ def harvest_frontiersin(ebook): SPRINGERDL = re.compile(r'(EPUB|PDF|MOBI)') def harvest_springerlink(ebook): - num = 0 - harvested = None - doc = get_soup(ebook.url) - if doc: - found = [] - for obj in doc.find_all('a', title=SPRINGERDL): - if obj.get('href'): - dl_url = urljoin(ebook.url, obj.get('href')) - if dl_url in found: - continue - else: - found.append(dl_url) - harvested, made = make_dl_ebook(dl_url, ebook) - num += made - if num == 0: - logger.warning('couldn\'t get any dl_url for %s', ebook.url) - return harvested, num + def selector(doc): + return doc.find_all('a', title=SPRINGERDL) + return harvest_multiple_generic(ebook, selector) OAPENPDF = re.compile('^/bitstream.*\.pdf') From 9d48f8a0817b512300eaf52c20d6bbeebfeea1a8 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 14:47:11 -0400 Subject: [PATCH 3/3] add fahce, fedoabooks --- core/loaders/harvest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index e08127b5..13765fa5 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -68,6 +68,8 @@ def harvesters(ebook): yield ebook.provider == 'bloomsburycollections.com', harvest_bloomsbury yield ebook.provider == 'Athabasca University Press', harvest_athabasca yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu + yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce + yield ebook.provider == 'fedoabooks.unina.it', harvest_fedoabooks def ebf_if_harvested(url): onlines = EbookFile.objects.filter(source=url) @@ -464,3 +466,16 @@ def harvest_usu(ebook): return doc.select_one('#full-text a[href]') return harvest_one_generic(ebook, selector) + +def harvest_fahce(ebook): + def selector(doc): + return doc.select_one('div.publicationFormatLink a[href]') + return harvest_one_generic(ebook, selector) + +def harvest_fedoabooks(ebook): + def selector(doc): + return doc.select('a.cmp_download_link[href]') + def dl(url): + return url.replace('view', 'download') + '?inline=1' + return harvest_multiple_generic(ebook, selector, dl=dl) +