add providers
parent
08d0a2efd4
commit
d977e70e94
|
@ -66,6 +66,7 @@ CMPPROVIDERS = [
|
|||
|
||||
|
||||
def harvesters(ebook):
|
||||
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
|
||||
yield 'dropbox.com/s/' in ebook.url, harvest_dropbox
|
||||
yield ebook.provider == 'jbe-platform.com', harvest_jbe
|
||||
yield ebook.provider == u'De Gruyter Online', harvest_degruyter
|
||||
|
@ -82,14 +83,14 @@ def harvesters(ebook):
|
|||
yield ebook.provider == 'Athabasca University Press', harvest_athabasca
|
||||
yield 'digitalcommons.usu.edu' in ebook.url, harvest_usu
|
||||
yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce
|
||||
yield ebook.provider == 'digital.library.unt.edu', harvest_unt
|
||||
yield ebook.provider in ['digital.library.unt.edu', 'texashistory.unt.edu'], harvest_unt
|
||||
yield ebook.provider in ['diposit.ub.edu', 'orbi.ulg.ac.be'], harvest_dspace
|
||||
yield ebook.provider in CMPPROVIDERS, harvest_cmp
|
||||
yield 'mdpi' in ebook.provider.lower(), harvest_mdpi
|
||||
yield ebook.provider == 'idunn.no', harvest_idunn
|
||||
yield ebook.provider == 'press.ucalgary.ca', harvest_calgary
|
||||
yield ebook.provider in ['Ledizioni', 'bibsciences.org',
|
||||
'heiup.uni-heidelberg.de', 'e-archivo.uc3m.es'], harvest_badhead
|
||||
'heiup.uni-heidelberg.de', 'e-archivo.uc3m.es'], harvest_generic
|
||||
yield ebook.provider == 'muse.jhu.edu', harvest_muse
|
||||
yield ebook.provider == 'IOS Press Ebooks', harvest_ios
|
||||
yield ebook.provider == 'elgaronline.com', harvest_elgar
|
||||
|
@ -97,11 +98,14 @@ def harvesters(ebook):
|
|||
yield ebook.provider in ['edition-open-access.de', 'edition-open-sources.org'], harvest_mprl
|
||||
yield ebook.provider == 'rti.org', harvest_rti
|
||||
yield ebook.provider == 'edoc.unibas.ch', harvest_unibas
|
||||
yield ebook.provider == 'books.pensoft.net', harvest_pensoft
|
||||
yield 'pensoft' in ebook.provider, harvest_pensoft
|
||||
yield ebook.provider == 'edp-open.org', harvest_edp
|
||||
yield ebook.provider == 'waxmann.com', harvest_waxmann
|
||||
yield ebook.provider == 'pbsociety.org.pl', harvest_ojs
|
||||
yield ebook.provider == 'content.sciendo.com', harvest_sciendo
|
||||
yield ebook.provider == 'edition-topoi.org', harvest_topoi
|
||||
yield ebook.provider == 'meson.press', harvest_meson
|
||||
yield 'brillonline' in ebook.provider, harvest_brill
|
||||
|
||||
|
||||
def ebf_if_harvested(url):
|
||||
|
@ -191,6 +195,10 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
|
|||
return new_ebf, 1
|
||||
|
||||
|
||||
def harvest_generic(ebook):
|
||||
return make_dl_ebook(ebook.url, ebook)
|
||||
|
||||
|
||||
def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT):
|
||||
doc = get_soup(ebook.url, user_agent=user_agent)
|
||||
if doc:
|
||||
|
@ -211,6 +219,7 @@ def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT):
|
|||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
||||
return None, 0
|
||||
|
||||
|
||||
def harvest_multiple_generic(ebook, selector, dl=lambda x:x):
|
||||
num = 0
|
||||
harvested = None
|
||||
|
@ -234,6 +243,7 @@ def harvest_multiple_generic(ebook, selector, dl=lambda x:x):
|
|||
logger.warning('couldn\'t get any dl_url for %s', ebook.url)
|
||||
return harvested, num
|
||||
|
||||
|
||||
def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0,
|
||||
user_agent=settings.GOOGLEBOT_UA, dl=lambda x:x):
|
||||
doc = get_soup(ebook.url, user_agent=user_agent)
|
||||
|
@ -268,9 +278,6 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0,
|
|||
return None, 0
|
||||
|
||||
|
||||
def harvest_badhead(ebook):
|
||||
return make_dl_ebook(ebook.url, ebook)
|
||||
|
||||
def harvest_obp(ebook):
|
||||
match = OPENBOOKPUB.search(ebook.url)
|
||||
booknum = None
|
||||
|
@ -665,12 +672,14 @@ def harvest_unibas(ebook):
|
|||
return doc.select_one('a.ep_document_link[href]')
|
||||
return harvest_one_generic(ebook, selector)
|
||||
|
||||
|
||||
PENSOFT = re.compile(r'/book/(\d+)/list/')
|
||||
def harvest_pensoft(ebook):
|
||||
if ebook.id == 263395:
|
||||
book_id = '12847'
|
||||
elif ebook.url.startswith('https://books.pensoft.net/books/'):
|
||||
book_id = ebook.url[32:]
|
||||
elif PENSOFT.search(ebook.url):
|
||||
book_id = PENSOFT.search(ebook.url).group(1)
|
||||
else:
|
||||
return None, 0
|
||||
r = requests.get('https://books.pensoft.net/api/books/' + book_id)
|
||||
|
@ -708,3 +717,24 @@ def harvest_sciendo(ebook):
|
|||
return doc.select_one('a[title=PDF]')
|
||||
return harvest_one_generic(ebook, selector, user_agent=settings.GOOGLEBOT_UA)
|
||||
|
||||
|
||||
def harvest_topoi(ebook):
|
||||
def selector(doc):
|
||||
return doc.select_one('li.pdf a[href]')
|
||||
return harvest_one_generic(ebook, selector)
|
||||
|
||||
|
||||
def harvest_meson(ebook):
|
||||
def selector(doc):
|
||||
for btn in doc.select_one('a[href] btn.btn-openaccess'):
|
||||
yield btn.parent
|
||||
return harvest_one_generic(ebook, selector)
|
||||
|
||||
|
||||
def harvest_brill(ebook):
|
||||
r = requests.get(ebook.url, headers={'User-Agent': settings.GOOGLEBOT_UA})
|
||||
if not r.url.startswith('https://brill.com/view/title/'):
|
||||
return None, 0
|
||||
dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:]
|
||||
return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
|
||||
|
||||
|
|
Loading…
Reference in New Issue