Merge pull request #895 from Gluejar/maintenance2020

refactor harvesters, add fahce, fedoabooks
pull/94/head
Eric Hellman 2020-07-31 14:59:21 -04:00 committed by GitHub
commit b889749c02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 75 additions and 65 deletions

View File

@ -68,6 +68,8 @@ def harvesters(ebook):
yield ebook.provider == 'bloomsburycollections.com', harvest_bloomsbury
yield ebook.provider == 'Athabasca University Press', harvest_athabasca
yield ebook.url.find('digitalcommons.usu.edu') > 0, harvest_usu
yield ebook.provider == 'libros.fahce.unlp.edu.ar', harvest_fahce
yield ebook.provider == 'fedoabooks.unina.it', harvest_fedoabooks
def ebf_if_harvested(url):
onlines = EbookFile.objects.filter(source=url)
@ -149,6 +151,50 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
new_ebf.save()
return new_ebf, 1
def harvest_one_generic(ebook, selector):
    """Harvest a single downloadable file from ebook.url.

    ``selector`` is a callable that takes the parsed page (soup) and
    returns the anchor element whose 'href' points at the file, or a
    falsy value when no link is found.

    Returns a (new_ebf, num) pair: the harvested EbookFile (or None)
    and the number of files made (0 or 1).
    """
    doc = get_soup(ebook.url)
    if doc:
        # Resolve relative hrefs against <base href> when the page has
        # one, falling back to the page URL.  doc.find('base') returns
        # None (TypeError on subscript) or the tag may lack 'href'
        # (KeyError); anything else should propagate.
        try:
            base = doc.find('base')['href']
        except (TypeError, KeyError):
            base = ebook.url
        obj = selector(doc)
        if obj:
            dl_url = urljoin(base, obj['href'])
            harvest = make_dl_ebook(dl_url, ebook)
            if not harvest[0]:
                logger.warning("couldn't harvest %s", dl_url)
            return harvest
        else:
            logger.warning("couldn't get dl_url for %s", ebook.url)
    else:
        logger.warning("couldn't get soup for %s", ebook.url)
    return None, 0
def harvest_multiple_generic(ebook, selector, dl=lambda x: x):
    """Harvest every download link ``selector`` finds on ebook.url.

    ``selector`` takes the parsed page (soup) and returns an iterable
    of anchor elements; ``dl`` maps each resolved link URL to the
    actual download URL (default: identity).  Duplicate URLs are
    downloaded only once.

    Returns (harvested, num): the last EbookFile made (or None) and
    the total number of files harvested.
    """
    num = 0
    harvested = None
    doc = get_soup(ebook.url)
    if doc:
        found = []
        # Resolve relative hrefs against <base href> when present.
        try:
            base = doc.find('base')['href']
        except (TypeError, KeyError):
            base = ebook.url
        for obj in selector(doc):
            href = obj.get('href')
            if not href:
                # anchor without an href (e.g. a named anchor):
                # urljoin would choke on None, so skip it
                continue
            dl_url = dl(urljoin(base, href))
            logger.info('%s', dl_url)
            if dl_url in found:
                continue
            found.append(dl_url)
            harvested, made = make_dl_ebook(dl_url, ebook)
            num += made
    if num == 0:
        logger.warning("couldn't get any dl_url for %s", ebook.url)
    return harvested, num
def harvest_obp(ebook):
match = OPENBOOKPUB.search(ebook.url)
booknum = None
@ -237,17 +283,9 @@ def harvest_dropbox(ebook):
return None, 0
def harvest_jbe(ebook):
    """Harvest from the Journal of Buddhist Ethics.

    The page links its PDF from an anchor inside div.pdfItem.
    (This span contained diff residue — the pre-refactor body fused
    with the new one; only the refactored version is kept.)
    """
    def selector(doc):
        return doc.select_one('div.pdfItem a')
    return harvest_one_generic(ebook, selector)
def harvest_transcript(ebook):
num = 0
@ -265,17 +303,9 @@ def harvest_transcript(ebook):
return harvested, num
def harvest_ksp(ebook):
    """Harvest from KSP (KIT Scientific Publishing).

    The page links its PDF from an anchor inside p.linkForPDF.
    (This span contained diff residue — the pre-refactor body fused
    with the new one; only the refactored version is kept.)
    """
    def selector(doc):
        return doc.select_one('p.linkForPDF a')
    return harvest_one_generic(ebook, selector)
def harvest_digitalis(ebook):
doc = get_soup(ebook.url)
@ -344,23 +374,9 @@ def harvest_frontiersin(ebook):
# Matches the format labels (EPUB/PDF/MOBI) used in the title attribute
# of Springer Link download anchors.
SPRINGERDL = re.compile(r'(EPUB|PDF|MOBI)')
def harvest_springerlink(ebook):
    """Harvest every download format offered on a Springer Link page.

    Download anchors are identified by a title attribute matching
    SPRINGERDL (EPUB/PDF/MOBI).  (This span contained diff residue —
    the pre-refactor body fused with the new one; only the refactored
    version is kept.)
    """
    def selector(doc):
        return doc.find_all('a', title=SPRINGERDL)
    return harvest_multiple_generic(ebook, selector)
# PDF bitstream paths on OAPEN.  Raw string: the non-raw '\.' escape is
# a SyntaxWarning on modern Python (behavior is unchanged).
OAPENPDF = re.compile(r'^/bitstream.*\.pdf')
@ -440,32 +456,26 @@ def harvest_bloomsbury(ebook):
return None, 0
def harvest_athabasca(ebook):
    """Harvest from Athabasca University Press.

    The page links its PDF from li.downloadPDF.  (This span contained
    diff residue — the pre-refactor body fused with the new one; only
    the refactored version is kept.)
    """
    def selector(doc):
        return doc.select_one('li.downloadPDF a[href]')
    return harvest_one_generic(ebook, selector)
def harvest_usu(ebook):
    """Harvest from Utah State University's Digital Commons.

    The full-text link lives under the #full-text element.  (This span
    contained diff residue — the pre-refactor body fused with the new
    one; only the refactored version is kept.)
    """
    def selector(doc):
        return doc.select_one('#full-text a[href]')
    return harvest_one_generic(ebook, selector)
def harvest_fahce(ebook):
    """Harvest from FaHCE (libros.fahce.unlp.edu.ar).

    The download link is the anchor inside div.publicationFormatLink.
    """
    return harvest_one_generic(
        ebook,
        lambda doc: doc.select_one('div.publicationFormatLink a[href]'),
    )
def harvest_fedoabooks(ebook):
    """Harvest from FedOA Books (fedoabooks.unina.it).

    Each a.cmp_download_link anchor is a format; its 'view' URL is
    rewritten to the direct 'download' URL with inline=1.
    """
    def to_download_url(url):
        # turn the viewer URL into the direct download URL
        return url.replace('view', 'download') + '?inline=1'
    return harvest_multiple_generic(
        ebook,
        lambda doc: doc.select('a.cmp_download_link[href]'),
        dl=to_download_url,
    )