refactor simple harvests
parent
26bbc5cee3
commit
9c9102cb2d
|
@ -149,6 +149,27 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||||
new_ebf.save()
|
new_ebf.save()
|
||||||
return new_ebf, 1
|
return new_ebf, 1
|
||||||
|
|
||||||
|
|
||||||
|
def harvest_one_generic(ebook, selector):
    """Harvest the single download that `selector` locates on ebook.url.

    Shared implementation for providers whose landing page carries one
    download link: fetch the page, let `selector` pick the link element,
    resolve its href to an absolute URL and hand that to make_dl_ebook.

    Args:
        ebook: object with a `url` attribute pointing at the landing page;
            it is also passed through to make_dl_ebook.
        selector: callable taking the parsed document and returning the
            link element (subscriptable by 'href'), or a falsy value when
            the page has no such link.

    Returns:
        Whatever make_dl_ebook returns for the resolved URL (a sequence
        whose first item is falsy when the harvest failed), or the tuple
        (None, 0) when the page or the link could not be obtained.
    """
    doc = get_soup(ebook.url)
    if not doc:
        logger.warning('couldn\'t get soup for %s', ebook.url)
        return None, 0

    # Honour <base href> if the page declares one. The declared value may
    # itself be relative, so resolve it against the page URL; an absolute
    # value passes through urljoin unchanged.
    try:
        base = urljoin(ebook.url, doc.find('base')['href'])
    except (TypeError, KeyError):
        # TypeError: no <base> element was found (subscripting None);
        # KeyError: a <base> element without an href attribute.
        base = ebook.url

    obj = selector(doc)
    try:
        href = obj['href'] if obj else None
    except KeyError:
        # The selector matched an element that has no href attribute.
        href = None
    if not href:
        logger.warning('couldn\'t get dl_url for %s', ebook.url)
        return None, 0

    dl_url = urljoin(base, href)
    harvest = make_dl_ebook(dl_url, ebook)
    if not harvest[0]:
        logger.warning('couldn\'t harvest %s', dl_url)
    return harvest
|
||||||
|
|
||||||
def harvest_obp(ebook):
|
def harvest_obp(ebook):
|
||||||
match = OPENBOOKPUB.search(ebook.url)
|
match = OPENBOOKPUB.search(ebook.url)
|
||||||
booknum = None
|
booknum = None
|
||||||
|
@ -237,17 +258,9 @@ def harvest_dropbox(ebook):
|
||||||
return None, 0
|
return None, 0
|
||||||
|
|
||||||
def harvest_jbe(ebook):
|
def harvest_jbe(ebook):
|
||||||
doc = get_soup(ebook.url)
|
def selector(doc):
|
||||||
if doc:
|
return doc.select_one('div.pdfItem a')
|
||||||
obj = doc.select_one('div.pdfItem a')
|
return harvest_one_generic(ebook, selector)
|
||||||
if obj:
|
|
||||||
dl_url = urljoin(ebook.url, obj['href'])
|
|
||||||
return make_dl_ebook(dl_url, ebook)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
|
||||||
return None, 0
|
|
||||||
|
|
||||||
def harvest_transcript(ebook):
|
def harvest_transcript(ebook):
|
||||||
num = 0
|
num = 0
|
||||||
|
@ -265,17 +278,9 @@ def harvest_transcript(ebook):
|
||||||
return harvested, num
|
return harvested, num
|
||||||
|
|
||||||
def harvest_ksp(ebook):
|
def harvest_ksp(ebook):
|
||||||
doc = get_soup(ebook.url)
|
def selector(doc):
|
||||||
if doc:
|
return doc.select_one('p.linkForPDF a')
|
||||||
obj = doc.select_one('p.linkForPDF a')
|
return harvest_one_generic(ebook, selector)
|
||||||
if obj:
|
|
||||||
dl_url = urljoin(ebook.url, obj['href'])
|
|
||||||
return make_dl_ebook(dl_url, ebook)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
|
||||||
return None, 0
|
|
||||||
|
|
||||||
def harvest_digitalis(ebook):
|
def harvest_digitalis(ebook):
|
||||||
doc = get_soup(ebook.url)
|
doc = get_soup(ebook.url)
|
||||||
|
@ -440,32 +445,13 @@ def harvest_bloomsbury(ebook):
|
||||||
return None, 0
|
return None, 0
|
||||||
|
|
||||||
def harvest_athabasca(ebook):
|
def harvest_athabasca(ebook):
|
||||||
doc = get_soup(ebook.url)
|
def selector(doc):
|
||||||
if doc:
|
return doc.select_one('li.downloadPDF a[href]')
|
||||||
try:
|
return harvest_one_generic(ebook, selector)
|
||||||
base = doc.find('base')['href']
|
|
||||||
except:
|
|
||||||
base = ebook.url
|
|
||||||
obj = doc.select_one('li.downloadPDF a[href]')
|
|
||||||
if obj:
|
|
||||||
dl_url = urljoin(base, obj['href'])
|
|
||||||
return make_dl_ebook(dl_url, ebook)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get dl_url for %s', base)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
|
||||||
return None, 0
|
|
||||||
|
|
||||||
|
|
||||||
def harvest_usu(ebook):
|
def harvest_usu(ebook):
|
||||||
doc = get_soup(ebook.url)
|
def selector(doc):
|
||||||
if doc:
|
return doc.select_one('#full-text a[href]')
|
||||||
obj = doc.select_one('#full-text a[href]')
|
return harvest_one_generic(ebook, selector)
|
||||||
if obj:
|
|
||||||
dl_url = urljoin(ebook.url, obj['href'])
|
|
||||||
return make_dl_ebook(dl_url, ebook)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
|
||||||
else:
|
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
|
||||||
return None, 0
|
|
||||||
|
|
Loading…
Reference in New Issue