From 48af441a2772ba741e294327b510008cf4d4f13a Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 16:18:07 -0400 Subject: [PATCH 1/3] add unt and ub --- core/loaders/harvest.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 13765fa5..d1bb6b3c 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -70,6 +70,8 @@ def harvesters(ebook): yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce yield ebook.provider == 'fedoabooks.unina.it', harvest_fedoabooks + yield ebook.provider == 'digital.library.unt.edu', harvest_unt + yield ebook.provider == 'diposit.ub.edu', harvest_ub def ebf_if_harvested(url): onlines = EbookFile.objects.filter(source=url) @@ -472,6 +474,7 @@ def harvest_fahce(ebook): return doc.select_one('div.publicationFormatLink a[href]') return harvest_one_generic(ebook, selector) + def harvest_fedoabooks(ebook): def selector(doc): return doc.select('a.cmp_download_link[href]') @@ -479,3 +482,19 @@ def harvest_fedoabooks(ebook): return url.replace('view', 'download') + '?inline=1' return harvest_multiple_generic(ebook, selector, dl=dl) + +UBPDF = re.compile(r'/dspace/bitstream/.*\.pdf') +def harvest_ub(ebook): + def selector(doc): + return doc.find(href=UBPDF) + return harvest_one_generic(ebook, selector) + + +# won't harvest page-image books +def harvest_unt(ebook): + def selector(doc): + return doc.select_one('#link-pdf-version[href]') + return harvest_one_generic(ebook, selector) + + + From b8749e3c0239515eafd7dea3b9c5a14ac2ca1b09 Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 16:26:37 -0400 Subject: [PATCH 2/3] made cmp generic --- core/loaders/harvest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index d1bb6b3c..90a55be3 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -69,9 +69,10 @@ def harvesters(ebook): yield ebook.provider == 'Athabasca University Press', harvest_athabasca yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce - yield ebook.provider == 'fedoabooks.unina.it', harvest_fedoabooks + yield ebook.provider == 'fedoabooks.unina.it', harvest_cmp yield ebook.provider == 'digital.library.unt.edu', harvest_unt yield ebook.provider == 'diposit.ub.edu', harvest_ub + yield ebook.provider == 'llibres.urv.cat', harvest_cmp def ebf_if_harvested(url): onlines = EbookFile.objects.filter(source=url) @@ -475,7 +476,7 @@ def harvest_fahce(ebook): return harvest_one_generic(ebook, selector) -def harvest_fedoabooks(ebook): +def harvest_cmp(ebook): def selector(doc): return doc.select('a.cmp_download_link[href]') def dl(url): From 71647bc00760299fd706ebd9b23fe1c4f19f77fa Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 31 Jul 2020 17:31:11 -0400 Subject: [PATCH 3/3] genericized cmp providers --- core/loaders/harvest.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 90a55be3..9c4d3a7d 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -51,6 +51,18 @@ def dl_online(ebook, limiter=rl.delay): return harvester(ebook) return None, 0 +CMPPROVIDERS = [ + 'editorial.uniagustiniana.edu.co', + 'llibres.urv.cat', + 'fedoabooks.unina.it', + 'Scholars Portal', + 'pressesagro.be', + 'ebooks.epublishing.ekt.gr', + 'teiresias-supplements.mcgill.ca', + 'humanities-digital-library.org', + 'editorial.uniagustiniana.edu.co', +] + def harvesters(ebook): yield ebook.url.find(u'dropbox.com/s/') >= 0, harvest_dropbox @@ -69,10 +81,9 @@ def harvesters(ebook): yield ebook.provider == 'Athabasca University Press', harvest_athabasca yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce - yield ebook.provider == 'fedoabooks.unina.it', harvest_cmp yield ebook.provider == 'digital.library.unt.edu', harvest_unt yield ebook.provider == 'diposit.ub.edu', harvest_ub - yield ebook.provider == 'llibres.urv.cat', harvest_cmp + yield ebook.provider in CMPPROVIDERS, harvest_cmp def ebf_if_harvested(url): onlines = EbookFile.objects.filter(source=url) @@ -481,6 +492,8 @@ def harvest_cmp(ebook): return doc.select('a.cmp_download_link[href]') def dl(url): return url.replace('view', 'download') + '?inline=1' + if ebook.url.find('/view/') >= 0: + return make_dl_ebook(dl(ebook.url), ebook) return harvest_multiple_generic(ebook, selector, dl=dl)