Merge pull request #896 from Gluejar/maintenance2020

unt, ub and cmp providers
pull/94/head
Eric Hellman 2020-07-31 17:41:33 -04:00 committed by GitHub
commit 5439ad3ba3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 35 additions and 2 deletions

View File

@ -51,6 +51,18 @@ def dl_online(ebook, limiter=rl.delay):
return harvester(ebook)
return None, 0
# Provider identifiers whose ebooks are routed to harvest_cmp: harvesters()
# tests `ebook.provider in CMPPROVIDERS`. Each provider is listed once.
CMPPROVIDERS = [
    'editorial.uniagustiniana.edu.co',
    'llibres.urv.cat',
    'fedoabooks.unina.it',
    'Scholars Portal',
    'pressesagro.be',
    'ebooks.epublishing.ekt.gr',
    'teiresias-supplements.mcgill.ca',
    'humanities-digital-library.org',
]
def harvesters(ebook):
yield ebook.url.find(u'dropbox.com/s/') >= 0, harvest_dropbox
@ -69,7 +81,9 @@ def harvesters(ebook):
yield ebook.provider == 'Athabasca University Press', harvest_athabasca
yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu
yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce
yield ebook.provider == 'fedoabooks.unina.it', harvest_fedoabooks
yield ebook.provider == 'digital.library.unt.edu', harvest_unt
yield ebook.provider == 'diposit.ub.edu', harvest_ub
yield ebook.provider in CMPPROVIDERS, harvest_cmp
def ebf_if_harvested(url):
onlines = EbookFile.objects.filter(source=url)
@ -472,10 +486,29 @@ def harvest_fahce(ebook):
return doc.select_one('div.publicationFormatLink a[href]')
return harvest_one_generic(ebook, selector)
def harvest_fedoabooks(ebook):
def harvest_cmp(ebook):
    """Harvest ebook files for providers listed in CMPPROVIDERS.

    When ``ebook.url`` already contains a ``/view/`` path segment, it is
    rewritten into a download URL and handed directly to ``make_dl_ebook``.
    Otherwise the work is delegated to ``harvest_multiple_generic``, which
    receives a selector for ``a.cmp_download_link`` anchors and the same
    URL rewriter through its ``dl`` keyword argument.

    Returns whatever ``make_dl_ebook`` or ``harvest_multiple_generic``
    returns.
    """
    def selector(doc):
        # All anchors that carry the download-link class and have an href.
        return doc.select('a.cmp_download_link[href]')

    def dl(url):
        # Rewrite only the '/view/' path segment. A bare 'view' replacement
        # would also corrupt unrelated substrings such as 'review' or
        # 'overview' elsewhere in the URL.
        rewritten = url.replace('/view/', '/download/')
        # Keep the URL well-formed if it already has a query string.
        separator = '&' if '?' in rewritten else '?'
        return rewritten + separator + 'inline=1'

    if ebook.url.find('/view/') >= 0:
        return make_dl_ebook(dl(ebook.url), ebook)
    return harvest_multiple_generic(ebook, selector, dl=dl)
# Matches link targets containing '/dspace/bitstream/' followed, anywhere
# later in the string, by '.pdf'. Used by harvest_ub as an href filter.
UBPDF = re.compile(r'/dspace/bitstream/.*\.pdf')
def harvest_ub(ebook):
    """Harvest a single file for ``ebook`` via ``harvest_one_generic``.

    The download link is the first element in the fetched document whose
    ``href`` matches the ``UBPDF`` pattern.
    """
    def find_pdf_link(doc):
        return doc.find(href=UBPDF)

    result = harvest_one_generic(ebook, find_pdf_link)
    return result
def harvest_unt(ebook):
    """Harvest a single file for ``ebook`` via ``harvest_one_generic``.

    The download link is the element with id ``link-pdf-version`` that has
    an ``href`` attribute. Page-image books won't be harvested.
    """
    def find_pdf_version(doc):
        return doc.select_one('#link-pdf-version[href]')

    result = harvest_one_generic(ebook, find_pdf_version)
    return result