management commands
1. run an update of providers 2. dedupe the online ebooks 3. should have half the onlines to harvest
pull/94/head
parent
ac5c241e09
commit
02170c9bc2
|
@ -1,12 +0,0 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core import models, bookloader
|
||||
|
||||
class Command(BaseCommand):
    """Blank the description of Works whose text contains 'degruyter_countdown'."""

    help = "clear deG descriptions"

    def handle(self, **options):
        """Empty and save the description of every matching Work, one at a time."""
        tainted_works = models.Work.objects.filter(
            description__icontains='degruyter_countdown',
        )
        for tainted in tainted_works:
            tainted.description = ''
            tainted.save()
|
|
@ -0,0 +1,28 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from django.db.models import Count
|
||||
from regluit.core.models import Work, Ebook, EbookFile
|
||||
|
||||
class Command(BaseCommand):
    """Remove superseded 'online' ebooks that duplicate a newer one.

    For each work having two or more ebooks with format 'online', the most
    recently created one is kept.  An older one is deleted when it has the
    same provider as the newest and no EbookFile has it as its source url.
    """

    help = "remove old online ebooks from same provider"

    def handle(self, **options):
        """Scan every work with online ebooks and report how many were removed."""
        allonlines = Work.objects.filter(editions__ebooks__format='online').distinct()
        self.stdout.write('%s works with online ebooks' % allonlines.count())
        removed = 0
        for work in allonlines:
            # Evaluate once, newest first, so the comparison target and the
            # candidates come from the same snapshot.
            onlines = list(
                Ebook.objects.filter(
                    edition__work__id=work.id,
                    format='online',
                ).order_by('-created')
            )
            if len(onlines) < 2:
                continue
            new_provider = onlines[0].provider
            for online in onlines[1:]:
                if online.provider != new_provider:
                    continue
                # Keep anything already harvested into an EbookFile.
                if EbookFile.objects.filter(source=online.url).exists():
                    continue
                self.stdout.write(online.edition.work.title)
                online.delete()
                removed += 1
                # NOTE(review): assumes the break belongs to the removal
                # branch (at most one removal per work per run) - confirm.
                break
        # The count was previously never interpolated into the message.
        self.stdout.write('%s online ebooks removed' % removed)
|
|
@ -19,6 +19,7 @@ class Command(BaseCommand):
|
|||
new_ebf, new = dl_online(online, limiter=rl.delay)
|
||||
if new_ebf and new:
|
||||
done += 1
|
||||
self.stdout.write(unicode(new_ebf.edition.work.title))
|
||||
if done == limit or done == 50:
|
||||
break
|
||||
self.stdout.write('harvested {} ebooks'.format(done))
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from django.db.models import Q
|
||||
|
||||
from regluit.core.loaders.harvest import dl_online, RateLimiter
|
||||
from regluit.core.models import Ebook
|
||||
from regluit.core.loaders.doab_utils import url_to_provider
|
||||
|
||||
class Command(BaseCommand):
    """Recompute Ebook.provider from Ebook.url.

    Two passes are made:

    1. Ebooks whose provider is empty or starts with 'www.' get their url
       stripped of surrounding whitespace and their provider recalculated.
    2. Ebooks whose url contains 'doi' or 'hdl' get their provider
       recalculated, up to ``limit`` updates and never more than
       ``max_updates`` per run.
    """

    help = "recalculate provider from url"
    args = "<limit>"

    # Hard cap on second-pass updates in a single run.
    max_updates = 100

    def add_arguments(self, parser):
        """Register the optional positional ``limit`` (0 means no user limit)."""
        parser.add_argument('limit', nargs='?', type=int, default=0, help="max to update")

    def handle(self, limit=0, **options):
        """Run both passes and write a summary of each to stdout.

        ``limit`` bounds the number of ebooks updated in the second pass; a
        value of 0 (or less) leaves only the ``max_updates`` cap in force.
        """
        limit = int(limit) if limit else 0
        # A zero limit used to stop the loop after the first update, and a
        # positive limit allowed one update too many.
        cap = min(limit, self.max_updates) if limit > 0 else self.max_updates

        done = 0
        unstripped = Ebook.objects.filter(Q(provider='') | Q(provider__startswith='www.'))
        for ebook in unstripped:
            ebook.url = ebook.url.strip()
            new_provider = url_to_provider(ebook.url)
            if new_provider != ebook.provider:
                ebook.provider = new_provider
                ebook.save()
                done += 1
        self.stdout.write('{} urls or netloc stripped'.format(done))

        done = 0
        stale = Ebook.objects.filter(Q(url__icontains='doi') | Q(url__icontains='hdl'))
        for ebook in stale:
            new_provider = url_to_provider(ebook.url)
            if new_provider == ebook.provider:
                continue
            ebook.provider = new_provider
            ebook.save()
            done += 1
            if done >= cap:
                break
        self.stdout.write('{} ebooks updated'.format(done))
        if done >= self.max_updates:
            # Report the cap actually enforced (the message used to say 50).
            self.stdout.write('{} is the maximum; repeat to do more'.format(self.max_updates))
|
Loading…
Reference in New Issue