diff --git a/.gitignore b/.gitignore
index fa27c1c8..3441ee9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@ static/scss/*.css.map
 assets/*
 *.ipynb
 dump.rdb
+Pipfile
+*.lock
diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py
index 7c88aba2..0786d038 100644
--- a/core/loaders/harvest.py
+++ b/core/loaders/harvest.py
@@ -63,6 +63,7 @@ def harvesters(ebook):
     yield ebook.provider == 'nomos-elibrary.de', harvest_nomos
     yield ebook.provider == 'frontiersin.org', harvest_frontiersin
     yield ebook.url.find('link.springer') >= 0, harvest_springerlink
+    yield ebook.provider == 'OAPEN Library', harvest_oapen
 
 
 def ebf_if_harvested(url):
@@ -353,3 +354,40 @@ def harvest_springerlink(ebook):
         logger.warning('couldn\'t get any dl_url for %s', ebook.url)
     return harvested, num
 
+OAPENPDF = re.compile(r'^/bitstream.*\.pdf')
+
+
+def harvest_oapen(ebook):
+    """Harvest the PDF linked from an OAPEN record page.
+
+    If the work already has another 'pdf' ebook from the same provider,
+    this ebook is deleted and (None, 0) is returned; (None, 0) is also
+    returned when the URL is not an OAPEN record or no PDF link could be
+    read from the page. Otherwise returns what make_dl_ebook returns.
+    """
+    for old_ebook in ebook.edition.work.ebooks():
+        if (old_ebook.id != ebook.id and
+                old_ebook.provider == ebook.provider and
+                old_ebook.format == 'pdf'):
+            ebook.delete()
+            return None, 0
+
+    if 'oapen.org/record' not in ebook.url:
+        return None, 0
+
+    harvested = None
+    made = 0
+    doc = get_soup(ebook.url)
+    if doc:
+        try:
+            base = doc.find('base')['href']
+        except (KeyError, TypeError):
+            # no base element, or one without an href: resolve against the page
+            base = ebook.url
+        obj = doc.find('a', href=OAPENPDF)
+        if obj:
+            dl_url = urljoin(base, obj['href'])
+            harvested, made = make_dl_ebook(dl_url, ebook)
+    if made == 0:
+        logger.warning('couldn\'t get any dl_url for %s', ebook.url)
+    return harvested, made
diff --git a/core/management/commands/harvest_online_ebooks.py b/core/management/commands/harvest_online_ebooks.py
index d2f3f353..e0c9d7f3 100644
--- a/core/management/commands/harvest_online_ebooks.py
+++ b/core/management/commands/harvest_online_ebooks.py
@@ -10,12 +10,15 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")
         parser.add_argument('--ebook', nargs='?', type=int, default=0, help="ebook to harvest")
+        parser.add_argument('--provider', nargs='?', type=str, default='', help="provider to harvest")
 
     def handle(self, limit=0, **options):
         limit = int(limit) if limit else 0
         rl = RateLimiter()
         if options.get('ebook'):
             onlines = Ebook.objects.filter(id=options.get('ebook'))
+        elif options.get('provider'):
+            onlines = Ebook.objects.filter(provider=options.get('provider'), format='online')
         else:
             onlines = Ebook.objects.filter(format='online')
         done = 0