add OAPEN "harvest"
parent
c0505f299b
commit
64b03fd40f
|
@ -18,3 +18,5 @@ static/scss/*.css.map
|
|||
assets/*
|
||||
*.ipynb
|
||||
dump.rdb
|
||||
Pipfile
|
||||
*.lock
|
||||
|
|
|
@ -63,6 +63,7 @@ def harvesters(ebook):
|
|||
yield ebook.provider == 'nomos-elibrary.de', harvest_nomos
|
||||
yield ebook.provider == 'frontiersin.org', harvest_frontiersin
|
||||
yield ebook.url.find('link.springer') >= 0, harvest_springerlink
|
||||
yield ebook.provider == 'OAPEN Library', harvest_oapen
|
||||
|
||||
|
||||
def ebf_if_harvested(url):
|
||||
|
@ -353,3 +354,33 @@ def harvest_springerlink(ebook):
|
|||
logger.warning('couldn\'t get any dl_url for %s', ebook.url)
|
||||
return harvested, num
|
||||
|
||||
# Matches site-relative hrefs that point at a PDF bitstream.
# Raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being parsed as an (invalid) string escape sequence.
OAPENPDF = re.compile(r'^/bitstream.*\.pdf')


def harvest_oapen(ebook):
    """Try to harvest a PDF for an OAPEN ebook record.

    If the work already has another ebook from the same provider in 'pdf'
    format, ``ebook`` is deleted and nothing is harvested. Otherwise, for
    URLs containing 'oapen.org/record', the page is fetched and the first
    anchor whose href matches ``OAPENPDF`` is passed to ``make_dl_ebook``.

    Args:
        ebook: the ebook to harvest; must expose ``id``, ``provider``,
            ``url``, ``delete()`` and ``edition.work.ebooks()``.

    Returns:
        A ``(harvested, made)`` tuple. ``(None, 0)`` when the ebook was
        deleted as redundant, when the URL is not an OAPEN record, or when no
        download link could be found; otherwise whatever ``make_dl_ebook``
        returned.
    """
    for old_ebook in ebook.edition.work.ebooks():
        if (old_ebook.id != ebook.id and
                old_ebook.provider == ebook.provider and
                old_ebook.format == 'pdf'):
            # A PDF from this provider already exists for the work, so this
            # ebook record is redundant.
            ebook.delete()
            return None, 0

    if ebook.url.find('oapen.org/record') < 0:
        return None, 0

    harvested = None
    made = 0

    doc = get_soup(ebook.url)
    if doc:
        # Relative links resolve against <base href=...> when the page
        # declares one; otherwise against the record URL itself.
        try:
            base = doc.find('base')['href']
        except (TypeError, KeyError):
            # TypeError: no <base> tag (find returned None);
            # KeyError: <base> tag without an href attribute.
            base = ebook.url

        obj = doc.find('a', href=OAPENPDF)
        if obj:
            dl_url = urljoin(base, obj['href'])
            harvested, made = make_dl_ebook(dl_url, ebook)

    if made == 0:
        logger.warning('couldn\'t get any dl_url for %s', ebook.url)
    return harvested, made
|
||||
|
||||
|
|
|
@ -10,12 +10,15 @@ class Command(BaseCommand):
|
|||
def add_arguments(self, parser):
    """Register the command-line arguments for this command.

    Adds an optional positional ``limit`` (int, default 0) and the optional
    ``--ebook`` (int, default 0) and ``--provider`` (str, default '') flags.
    """
    # (name, value type, default, help text) for each argument; every one is
    # registered with nargs='?' so its value may be omitted.
    argument_specs = (
        ('limit', int, 0, "max to harvest"),
        ('--ebook', int, 0, "ebook to harvest"),
        ('--provider', str, '', "provider to harvest"),
    )
    for arg_name, value_type, fallback, description in argument_specs:
        parser.add_argument(
            arg_name,
            nargs='?',
            type=value_type,
            default=fallback,
            help=description,
        )
|
||||
|
||||
def handle(self, limit=0, **options):
|
||||
limit = int(limit) if limit else 0
|
||||
rl = RateLimiter()
|
||||
if options.get('ebook'):
|
||||
onlines = Ebook.objects.filter(id=options.get('ebook'))
|
||||
elif options.get('provider'):
|
||||
onlines = Ebook.objects.filter(provider=options.get('provider'), format='online')
|
||||
else:
|
||||
onlines = Ebook.objects.filter(format='online')
|
||||
done = 0
|
||||
|
|
Loading…
Reference in New Issue