management commands

1. run an update of providers
2. dedupe the online ebooks
3. after deduplication, roughly half of the online ebooks should remain to harvest
pull/94/head
eric 2019-03-01 21:26:39 -05:00
parent ac5c241e09
commit 02170c9bc2
4 changed files with 68 additions and 12 deletions

View File

@ -1,12 +0,0 @@
from django.core.management.base import BaseCommand
from regluit.core import models, bookloader
class Command(BaseCommand):
    """Blank out Work descriptions that were polluted with the
    'degruyter_countdown' marker text."""
    help = "clear deG descriptions"

    def handle(self, **options):
        # Case-insensitive match on the marker string; each hit gets an
        # empty description written back individually.
        tainted = models.Work.objects.filter(
            description__icontains='degruyter_countdown'
        )
        for tainted_work in tainted:
            tainted_work.description = ''
            tainted_work.save()

View File

@ -0,0 +1,28 @@
from django.core.management.base import BaseCommand
from django.db.models import Count
from regluit.core.models import Work, Ebook, EbookFile
class Command(BaseCommand):
    """For each Work with multiple 'online'-format ebooks from the same
    provider, delete one stale (older, not-yet-harvested) ebook, keeping
    the most recently created one."""
    help = "remove old online ebooks from same provider"

    def handle(self, **options):
        # Works that have at least one ebook in 'online' format.
        allonlines = Work.objects.filter(
            editions__ebooks__format='online'
        ).distinct()
        self.stdout.write('%s works with online ebooks' % allonlines.count())
        removed = 0
        for work in allonlines:
            # Newest first, so onlines[0] is the ebook to keep.
            onlines = Ebook.objects.filter(
                edition__work__id=work.id,
                format='online'
            ).order_by('-created')
            num_onlines = onlines.count()
            if num_onlines >= 2:
                new_provider = onlines[0].provider
                for online in onlines[1:]:
                    # Never delete an ebook whose file was already harvested
                    # (an EbookFile records the source url it came from).
                    harvested = EbookFile.objects.filter(source=online.url).count()
                    if not harvested and online.provider == new_provider:
                        self.stdout.write(online.edition.work.title)
                        online.delete()
                        removed += 1
                        # Conservative: remove at most one stale ebook per
                        # work per run; rerun the command to remove more.
                        break
        # BUG FIX: the original wrote the literal '%s online ebooks removed'
        # without interpolating `removed`, so the count never appeared.
        self.stdout.write('%s online ebooks removed' % removed)

View File

@ -19,6 +19,7 @@ class Command(BaseCommand):
new_ebf, new = dl_online(online, limiter=rl.delay)
if new_ebf and new:
done += 1
self.stdout.write(unicode(new_ebf.edition.work.title))
if done == limit or done == 50:
break
self.stdout.write('harvested {} ebooks'.format(done))

View File

@ -0,0 +1,39 @@
from django.core.management.base import BaseCommand
from django.db.models import Q
from regluit.core.loaders.harvest import dl_online, RateLimiter
from regluit.core.models import Ebook
from regluit.core.loaders.doab_utils import url_to_provider
class Command(BaseCommand):
    """Recompute Ebook.provider from the ebook url in two passes:
    first for ebooks with a blank or 'www.'-prefixed provider (also
    stripping whitespace from the url), then for ebooks whose url
    contains 'doi' or 'hdl'. The second pass is capped by <limit>
    (0 = no per-run limit) and hard-capped at 100 updates per run."""
    help = "recalculate provider from url"
    args = "<limit>"

    def add_arguments(self, parser):
        parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")

    def handle(self, limit=0, **options):
        done = 0
        # Normalize: a falsy limit means "no limit".
        limit = int(limit) if limit else 0
        # Pass 1: blank providers, or providers carrying a stray 'www.' prefix.
        unstripped = Ebook.objects.filter(Q(provider='') | Q(provider__startswith='www.'))
        for ebook in unstripped:
            ebook.url = ebook.url.strip()
            new_provider = url_to_provider(ebook.url)
            if new_provider != ebook.provider:
                ebook.provider = new_provider
                ebook.save()
                done += 1
        self.stdout.write('{} urls or netloc stripped'.format(done))

        done = 0
        # Pass 2: urls that point at DOI / Handle resolvers may redirect,
        # so their recorded provider can be stale.
        stale = Ebook.objects.filter(Q(url__icontains='doi') | Q(url__icontains='hdl'))
        for ebook in stale:
            new_provider = url_to_provider(ebook.url)
            if new_provider != ebook.provider:
                ebook.provider = new_provider
                ebook.save()
                done += 1
            # BUG FIX: the original tested `done > limit`; with the default
            # limit of 0 that stopped after the very first update. Treat
            # limit=0 as unlimited, and hard-cap a single run at 100.
            if (limit and done >= limit) or done >= 100:
                break
        self.stdout.write('{} ebooks updated'.format(done))
        if done >= 100:
            # BUG FIX: message said '50 is the maximum' but the cap is 100.
            self.stdout.write('100 is the maximum; repeat to do more')