From d977e70e945aa9a9552b1d9c1a057944e874c7b9 Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 15 Aug 2020 20:17:29 -0400 Subject: [PATCH] add providers --- core/loaders/harvest.py | 44 ++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index cb1add54..9188d9c8 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -66,6 +66,7 @@ CMPPROVIDERS = [ def harvesters(ebook): + yield ebook.provider in GOOD_PROVIDERS, harvest_generic yield 'dropbox.com/s/' in ebook.url, harvest_dropbox yield ebook.provider == 'jbe-platform.com', harvest_jbe yield ebook.provider == u'De Gruyter Online', harvest_degruyter @@ -82,14 +83,14 @@ def harvesters(ebook): yield ebook.provider == 'Athabasca University Press', harvest_athabasca yield 'digitalcommons.usu.edu' in ebook.url, harvest_usu yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce - yield ebook.provider == 'digital.library.unt.edu', harvest_unt + yield ebook.provider in ['digital.library.unt.edu', 'texashistory.unt.edu'], harvest_unt yield ebook.provider in ['diposit.ub.edu', 'orbi.ulg.ac.be'], harvest_dspace yield ebook.provider in CMPPROVIDERS, harvest_cmp yield 'mdpi' in ebook.provider.lower(), harvest_mdpi yield ebook.provider == 'idunn.no', harvest_idunn yield ebook.provider == 'press.ucalgary.ca', harvest_calgary yield ebook.provider in ['Ledizioni', 'bibsciences.org', - 'heiup.uni-heidelberg.de', 'e-archivo.uc3m.es'], harvest_badhead + 'heiup.uni-heidelberg.de', 'e-archivo.uc3m.es'], harvest_generic yield ebook.provider == 'muse.jhu.edu', harvest_muse yield ebook.provider == 'IOS Press Ebooks', harvest_ios yield ebook.provider == 'elgaronline.com', harvest_elgar @@ -97,11 +98,14 @@ def harvesters(ebook): yield ebook.provider in ['edition-open-access.de', 'edition-open-sources.org'], harvest_mprl yield ebook.provider == 'rti.org', harvest_rti yield ebook.provider == 'edoc.unibas.ch', harvest_unibas - yield ebook.provider == 'books.pensoft.net', harvest_pensoft + yield 'pensoft' in ebook.provider, harvest_pensoft yield ebook.provider == 'edp-open.org', harvest_edp yield ebook.provider == 'waxmann.com', harvest_waxmann yield ebook.provider == 'pbsociety.org.pl', harvest_ojs yield ebook.provider == 'content.sciendo.com', harvest_sciendo + yield ebook.provider == 'edition-topoi.org', harvest_topoi + yield ebook.provider == 'meson.press', harvest_meson + yield 'brillonline' in ebook.provider, harvest_brill def ebf_if_harvested(url): @@ -191,6 +195,10 @@ def make_harvested_ebook(content, ebook, format, filesize=0): return new_ebf, 1 +def harvest_generic(ebook): + return make_dl_ebook(ebook.url, ebook) + + def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT): doc = get_soup(ebook.url, user_agent=user_agent) if doc: @@ -211,6 +219,7 @@ def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT): logger.warning('couldn\'t get soup for %s', ebook.url) return None, 0 + def harvest_multiple_generic(ebook, selector, dl=lambda x:x): num = 0 harvested = None @@ -234,6 +243,7 @@ def harvest_multiple_generic(ebook, selector, dl=lambda x:x): logger.warning('couldn\'t get any dl_url for %s', ebook.url) return harvested, num + def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, user_agent=settings.GOOGLEBOT_UA, dl=lambda x:x): doc = get_soup(ebook.url, user_agent=user_agent) @@ -268,9 +278,6 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0, return None, 0 -def harvest_badhead(ebook): - return make_dl_ebook(ebook.url, ebook) - def harvest_obp(ebook): match = OPENBOOKPUB.search(ebook.url) booknum = None @@ -665,12 +672,14 @@ def harvest_unibas(ebook): return doc.select_one('a.ep_document_link[href]') return harvest_one_generic(ebook, selector) - +PENSOFT = re.compile(r'/book/(\d+)/list/') def harvest_pensoft(ebook): if ebook.id == 263395: book_id = '12847' elif ebook.url.startswith('https://books.pensoft.net/books/'): book_id = ebook.url[32:] + elif PENSOFT.search(ebook.url): + book_id = PENSOFT.search(ebook.url).group(1) else: return None, 0 r = requests.get('https://books.pensoft.net/api/books/' + book_id) @@ -708,3 +717,24 @@ def harvest_sciendo(ebook): return doc.select_one('a[title=PDF]') return harvest_one_generic(ebook, selector, user_agent=settings.GOOGLEBOT_UA) + +def harvest_topoi(ebook): + def selector(doc): + return doc.select_one('li.pdf a[href]') + return harvest_one_generic(ebook, selector) + + +def harvest_meson(ebook): + def selector(doc): + for btn in doc.select_one('a[href] btn.btn-openaccess'): + yield btn.parent + return harvest_one_generic(ebook, selector) + + +def harvest_brill(ebook): + r = requests.get(ebook.url, headers={'User-Agent': settings.GOOGLEBOT_UA}) + if not r.url.startswith('https://brill.com/view/title/'): + return None, 0 + dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:] + return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA) +