allow harvest to harvest multiple ebooks

pull/94/head
eric 2020-07-27 17:53:32 -04:00
parent 26e32a4738
commit 5932bc09ed
2 changed files with 15 additions and 14 deletions
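
In short: the harvest helpers used to return an (ebook_file, new) pair whose second element was a True/False flag for "a new ebook file was created". This commit turns that flag into an integer count (still 0 or 1 everywhere in this file), so that a harvester can eventually report several new ebooks from a single source URL. The management command in the second file accumulates the count with done += new and compares with >= instead of == when deciding to stop.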

@@ -42,14 +42,14 @@ rl = RateLimiter()

 def dl_online(ebook, limiter=rl.delay):
     if ebook.format != 'online':
-        return None, False
+        return None, 0

     for do_harvest, harvester in harvesters(ebook):
         if do_harvest:
             for ebf in ebf_if_harvested(ebook.url):
-                return ebf, False
+                return ebf, 0
             limiter(ebook.provider)
             return harvester(ebook)
-    return None, False
+    return None, 0

 def harvesters(ebook):
@@ -69,7 +69,7 @@ def ebf_if_harvested(url):
 def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
     if not url:
         logger.warning('no url for ebook %s', ebook.id)
-        return None, False
+        return None, 0
     logger.info('making %s' % url)
     if method == 'POST':
         response = requests.post(url, headers={"User-Agent": user_agent})
@@ -87,12 +87,12 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
             logger.warning('download format for %s is not ebook', url)
     else:
         logger.warning('couldn\'t get %s', url)
-    return None, False
+    return None, 0

 def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT):
     pdffile = staple_pdf(urllist, user_agent)
     if not pdffile:
-        return None, False
+        return None, 0
     return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')

 def make_harvested_ebook(content, ebook, format, filesize=0):
@@ -109,7 +109,7 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
     except MemoryError: #huge pdf files cause problems here
         logger.error("memory error saving ebook file for %s", ebook.url)
         new_ebf.delete()
-        return None, False
+        return None, 0

     new_ebook = Ebook.objects.create(
         edition=ebook.edition,
@@ -123,7 +123,7 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
     )
     new_ebf.ebook = new_ebook
     new_ebf.save()
-    return new_ebf, True
+    return new_ebf, 1

 def harvest_obp(ebook):
     match = OPENBOOKPUB.search(ebook.url)
@@ -144,7 +144,7 @@ def harvest_obp(ebook):
        booknum = match.group(2)
    if not booknum:
        logger.warning('couldn\'t get booknum for %s', ebook.url)
-        return None, False
+        return None, 0
    dl_url = 'https://www.openbookpublishers.com//download/book_content/{}'.format(booknum)
    made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST')
    return made
@@ -192,7 +192,7 @@ def harvest_degruyter(ebook):
            logger.warning('couldn\'t get dl_url for %s', ebook.url)
    else:
        logger.warning('couldn\'t get soup for %s', ebook.url)
-    return None, False
+    return None, 0

 def harvest_dropbox(ebook):
     if ebook.url.find(u'dl=0') >= 0:
@@ -210,7 +210,7 @@ def harvest_dropbox(ebook):
            logger.warning('couldn\'t get %s', ebook.url)
    else:
        logger.warning('couldn\'t get dl for %s', ebook.url)
-    return None, False
+    return None, 0

 def harvest_jbe(ebook):
     doc = get_soup(ebook.url)
@@ -223,5 +223,6 @@ def harvest_jbe(ebook):
            logger.warning('couldn\'t get dl_url for %s', ebook.url)
    else:
        logger.warning('couldn\'t get soup for %s', ebook.url)
-    return None, False
+    return None, 0
+
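
With a count instead of a boolean, a harvester is no longer limited to reporting one ebook per URL. The sketch below is illustrative only and not part of this commit: the function name harvest_two_formats, the format tuple, and the URL pattern are invented, and it assumes it sits in the same module as make_dl_ebook. It follows the new convention by returning the last EbookFile created together with the total number of new ebooks.

# Hypothetical sketch, not in this commit: a harvester that fetches several
# formats and reports how many ebooks it created, using the (ebf, count)
# convention introduced above.
def harvest_two_formats(ebook):
    ebf = None
    num = 0
    for fmt in ('pdf', 'epub'):                            # assumed formats
        dl_url = '{}.{}'.format(ebook.url, fmt)            # assumed URL pattern
        new_ebf, new_count = make_dl_ebook(dl_url, ebook)  # each call returns 0 or 1
        if new_ebf:
            ebf = new_ebf
        num += new_count
    return ebf, num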

@@ -18,9 +18,9 @@ class Command(BaseCommand):
         for online in onlines:
             new_ebf, new = dl_online(online, limiter=rl.delay)
             if new_ebf and new:
-                done += 1
+                done += new
                 self.stdout.write(new_ebf.edition.work.title)
-            if done == limit or done == 100:
+            if done >= limit or done >= 100:
                 break
         self.stdout.write('harvested {} ebooks'.format(done))
         if done == 100:
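
Note on the command side: done += new accumulates however many ebooks one dl_online call reports, and the switch from == to >= keeps the stop condition robust once a harvest can add more than one ebook at a time; with ==, a batch that pushed done from 99 to 101 would step over both the limit and the hard cap of 100.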