From 02170c9bc2e2127ce3fc1e542981f5147ed6294e Mon Sep 17 00:00:00 2001
From: eric
Date: Fri, 1 Mar 2019 21:26:39 -0500
Subject: [PATCH] management commands

1. run an update of providers
2. dedupe the online ebooks
3. should have half the onlines to harvest
---
 core/management/commands/clear_dg.py          | 12 -----
 core/management/commands/dedupe_onlines.py    | 35 +++++++++++++++
 .../commands/harvest_online_ebooks.py         |  1 +
 core/management/commands/update_providers.py  | 47 ++++++++++++++++++++
 4 files changed, 83 insertions(+), 12 deletions(-)
 delete mode 100644 core/management/commands/clear_dg.py
 create mode 100644 core/management/commands/dedupe_onlines.py
 create mode 100644 core/management/commands/update_providers.py

diff --git a/core/management/commands/clear_dg.py b/core/management/commands/clear_dg.py
deleted file mode 100644
index 7e61f514..00000000
--- a/core/management/commands/clear_dg.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from django.core.management.base import BaseCommand
-
-from regluit.core import models, bookloader
-
-class Command(BaseCommand):
-    help = "clear deG descriptions"
-
-    def handle(self, **options):
-        qs=models.Work.objects.filter(description__icontains='degruyter_countdown')
-        for work in qs:
-            work.description = ''
-            work.save()
diff --git a/core/management/commands/dedupe_onlines.py b/core/management/commands/dedupe_onlines.py
new file mode 100644
index 00000000..b3e8af6c
--- /dev/null
+++ b/core/management/commands/dedupe_onlines.py
@@ -0,0 +1,35 @@
+from django.core.management.base import BaseCommand
+
+from django.db.models import Count
+from regluit.core.models import Work, Ebook, EbookFile
+
+class Command(BaseCommand):
+    help = "remove old online ebooks from same provider"
+
+    def handle(self, **options):
+        """Remove superseded 'online' ebooks, work by work.
+
+        For each work with two or more online ebooks, the newest one is kept
+        and an older one is deleted when it has the same provider and no
+        EbookFile was harvested from its url.
+        """
+        allonlines = Work.objects.filter(editions__ebooks__format='online').distinct()
+        self.stdout.write('%s works with online ebooks' % allonlines.count())
+        removed = 0
+        for work in allonlines:
+            onlines = Ebook.objects.filter(
+                edition__work__id=work.id,
+                format='online'
+            ).order_by('-created')
+            num_onlines = onlines.count()
+            if num_onlines >= 2:
+                new_provider = onlines[0].provider
+                for online in onlines[1:]:
+                    harvested = EbookFile.objects.filter(source=online.url).count()
+                    if not harvested and online.provider == new_provider:
+                        self.stdout.write(online.edition.work.title)
+                        online.delete()
+                        removed += 1
+                        # NOTE(review): only one ebook is removed per work; confirm intended
+                        break
+        self.stdout.write('%s online ebooks removed' % removed)
\ No newline at end of file
diff --git a/core/management/commands/harvest_online_ebooks.py b/core/management/commands/harvest_online_ebooks.py
index 7cd52e48..bc6a8c91 100644
--- a/core/management/commands/harvest_online_ebooks.py
+++ b/core/management/commands/harvest_online_ebooks.py
@@ -19,6 +19,7 @@ class Command(BaseCommand):
             new_ebf, new = dl_online(online, limiter=rl.delay)
             if new_ebf and new:
                 done += 1
+                self.stdout.write(unicode(new_ebf.edition.work.title))
                 if done == limit or done == 50:
                     break
         self.stdout.write('harvested {} ebooks'.format(done))
diff --git a/core/management/commands/update_providers.py b/core/management/commands/update_providers.py
new file mode 100644
index 00000000..a7c39074
--- /dev/null
+++ b/core/management/commands/update_providers.py
@@ -0,0 +1,47 @@
+from django.core.management.base import BaseCommand
+from django.db.models import Q
+
+from regluit.core.loaders.harvest import dl_online, RateLimiter
+from regluit.core.models import Ebook
+from regluit.core.loaders.doab_utils import url_to_provider
+
+class Command(BaseCommand):
+    help = "recalculate provider from url"
+    args = ""
+
+    def add_arguments(self, parser):
+        parser.add_argument('limit', nargs='?', type=int, default=0, help="max ebooks to update")
+
+    def handle(self, limit=0, **options):
+        """Recompute Ebook.provider from Ebook.url in two passes.
+
+        The first pass covers every ebook whose provider is empty or starts
+        with 'www.'. The second pass covers ebooks whose url contains 'doi'
+        or 'hdl' and stops after `limit` updates (0 means no explicit limit)
+        or after 100 updates, whichever comes first.
+        """
+        done = 0
+        limit = int(limit) if limit else 0
+        unstripped = Ebook.objects.filter(Q(provider='') | Q(provider__startswith='www.'))
+        for ebook in unstripped:
+            ebook.url = ebook.url.strip()
+            new_provider = url_to_provider(ebook.url)
+            if new_provider != ebook.provider:
+                ebook.provider = new_provider
+                ebook.save()
+                done += 1
+        self.stdout.write('{} urls or netloc stripped'.format(done))
+        done = 0
+        stale = Ebook.objects.filter(Q(url__icontains='doi') | Q(url__icontains='hdl'))
+        for ebook in stale:
+            new_provider = url_to_provider(ebook.url)
+            if new_provider != ebook.provider:
+                ebook.provider = new_provider
+                ebook.save()
+                done += 1
+                # limit == 0 means "no explicit limit"; 100 caps a single run
+                if done >= 100 or (limit and done >= limit):
+                    break
+        self.stdout.write('{} ebooks updated'.format(done))
+        if done == 100:
+            self.stdout.write('100 is the maximum; repeat to do more')