allow harvest to harvest multiple ebooks
parent
26e32a4738
commit
5932bc09ed
|
@ -42,14 +42,14 @@ rl = RateLimiter()
|
||||||
|
|
||||||
def dl_online(ebook, limiter=rl.delay):
|
def dl_online(ebook, limiter=rl.delay):
|
||||||
if ebook.format != 'online':
|
if ebook.format != 'online':
|
||||||
return None, False
|
return None, 0
|
||||||
for do_harvest, harvester in harvesters(ebook):
|
for do_harvest, harvester in harvesters(ebook):
|
||||||
if do_harvest:
|
if do_harvest:
|
||||||
for ebf in ebf_if_harvested(ebook.url):
|
for ebf in ebf_if_harvested(ebook.url):
|
||||||
return ebf, False
|
return ebf, 0
|
||||||
limiter(ebook.provider)
|
limiter(ebook.provider)
|
||||||
return harvester(ebook)
|
return harvester(ebook)
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
|
|
||||||
def harvesters(ebook):
|
def harvesters(ebook):
|
||||||
|
@ -69,7 +69,7 @@ def ebf_if_harvested(url):
|
||||||
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
||||||
if not url:
|
if not url:
|
||||||
logger.warning('no url for ebook %s', ebook.id)
|
logger.warning('no url for ebook %s', ebook.id)
|
||||||
return None, False
|
return None, 0
|
||||||
logger.info('making %s' % url)
|
logger.info('making %s' % url)
|
||||||
if method == 'POST':
|
if method == 'POST':
|
||||||
response = requests.post(url, headers={"User-Agent": user_agent})
|
response = requests.post(url, headers={"User-Agent": user_agent})
|
||||||
|
@ -87,12 +87,12 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET'):
|
||||||
logger.warning('download format for %s is not ebook', url)
|
logger.warning('download format for %s is not ebook', url)
|
||||||
else:
|
else:
|
||||||
logger.warning('couldn\'t get %s', url)
|
logger.warning('couldn\'t get %s', url)
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT):
|
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT):
|
||||||
pdffile = staple_pdf(urllist, user_agent)
|
pdffile = staple_pdf(urllist, user_agent)
|
||||||
if not pdffile:
|
if not pdffile:
|
||||||
return None, False
|
return None, 0
|
||||||
return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
|
return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
|
||||||
|
|
||||||
def make_harvested_ebook(content, ebook, format, filesize=0):
|
def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||||
|
@ -109,7 +109,7 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||||
except MemoryError: #huge pdf files cause problems here
|
except MemoryError: #huge pdf files cause problems here
|
||||||
logger.error("memory error saving ebook file for %s", ebook.url)
|
logger.error("memory error saving ebook file for %s", ebook.url)
|
||||||
new_ebf.delete()
|
new_ebf.delete()
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
new_ebook = Ebook.objects.create(
|
new_ebook = Ebook.objects.create(
|
||||||
edition=ebook.edition,
|
edition=ebook.edition,
|
||||||
|
@ -123,7 +123,7 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
|
||||||
)
|
)
|
||||||
new_ebf.ebook = new_ebook
|
new_ebf.ebook = new_ebook
|
||||||
new_ebf.save()
|
new_ebf.save()
|
||||||
return new_ebf, True
|
return new_ebf, 1
|
||||||
|
|
||||||
def harvest_obp(ebook):
|
def harvest_obp(ebook):
|
||||||
match = OPENBOOKPUB.search(ebook.url)
|
match = OPENBOOKPUB.search(ebook.url)
|
||||||
|
@ -144,7 +144,7 @@ def harvest_obp(ebook):
|
||||||
booknum = match.group(2)
|
booknum = match.group(2)
|
||||||
if not booknum:
|
if not booknum:
|
||||||
logger.warning('couldn\'t get booknum for %s', ebook.url)
|
logger.warning('couldn\'t get booknum for %s', ebook.url)
|
||||||
return None, False
|
return None, 0
|
||||||
dl_url = 'https://www.openbookpublishers.com//download/book_content/{}'.format(booknum)
|
dl_url = 'https://www.openbookpublishers.com//download/book_content/{}'.format(booknum)
|
||||||
made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST')
|
made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST')
|
||||||
return made
|
return made
|
||||||
|
@ -192,7 +192,7 @@ def harvest_degruyter(ebook):
|
||||||
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
||||||
else:
|
else:
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
logger.warning('couldn\'t get soup for %s', ebook.url)
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
def harvest_dropbox(ebook):
|
def harvest_dropbox(ebook):
|
||||||
if ebook.url.find(u'dl=0') >= 0:
|
if ebook.url.find(u'dl=0') >= 0:
|
||||||
|
@ -210,7 +210,7 @@ def harvest_dropbox(ebook):
|
||||||
logger.warning('couldn\'t get %s', ebook.url)
|
logger.warning('couldn\'t get %s', ebook.url)
|
||||||
else:
|
else:
|
||||||
logger.warning('couldn\'t get dl for %s', ebook.url)
|
logger.warning('couldn\'t get dl for %s', ebook.url)
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
def harvest_jbe(ebook):
|
def harvest_jbe(ebook):
|
||||||
doc = get_soup(ebook.url)
|
doc = get_soup(ebook.url)
|
||||||
|
@ -223,5 +223,6 @@ def harvest_jbe(ebook):
|
||||||
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
logger.warning('couldn\'t get dl_url for %s', ebook.url)
|
||||||
else:
|
else:
|
||||||
logger.warning('couldn\'t get soup for %s', ebook.url)
|
logger.warning('couldn\'t get soup for %s', ebook.url)
|
||||||
return None, False
|
return None, 0
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -18,9 +18,9 @@ class Command(BaseCommand):
|
||||||
for online in onlines:
|
for online in onlines:
|
||||||
new_ebf, new = dl_online(online, limiter=rl.delay)
|
new_ebf, new = dl_online(online, limiter=rl.delay)
|
||||||
if new_ebf and new:
|
if new_ebf and new:
|
||||||
done += 1
|
done += new
|
||||||
self.stdout.write(new_ebf.edition.work.title)
|
self.stdout.write(new_ebf.edition.work.title)
|
||||||
if done == limit or done == 100:
|
if done >= limit or done >= 100:
|
||||||
break
|
break
|
||||||
self.stdout.write('harvested {} ebooks'.format(done))
|
self.stdout.write('harvested {} ebooks'.format(done))
|
||||||
if done == 100:
|
if done == 100:
|
||||||
|
|
Loading…
Reference in New Issue