From e0c0d98c5d53034723b11bcff60144bfcabe18f2 Mon Sep 17 00:00:00 2001
From: eric
Date: Mon, 3 May 2021 22:17:46 -0400
Subject: [PATCH] add a command to remove dead doabids

---
Reviewer notes. This free-text area sits between the separator line above
and the first file diff, so it is not part of the commit and is skipped
when the patch is applied.

What the patch adds
* core/loaders/doab.py: get_doab_record(doab_id) formats the OAI identifier
  'oai:directory.doabooks.org:%s' with the given id, asks doab_client for
  the record with metadataPrefix='oai_dc', and returns whatever getRecord
  returns. When the client raises IdDoesNotExistError it returns None.
  Neither doab_client nor IdDoesNotExistError is defined in this hunk;
  presumably both are already in scope in that module -- confirm.
* core/management/commands/dedupe_doab.py: a management command whose
  handle() annotates every Work with doab_count, a correlated subquery that
  counts the Identifier rows of type 'doab' pointing at that work. For each
  work with doab_count greater than 1 it walks the work's 'doab'
  identifiers, calls get_doab_record(ident.value), and when the result is
  falsy writes 'removing %s' to stdout and deletes the identifier.

Points to confirm before running against production data
* Nothing stops the inner loop deleting every doab identifier of a work
  when none of them resolves; a work can end up with no doab id at all.
* Only IdDoesNotExistError is mapped to "dead". Any other exception raised
  by getRecord propagates and aborts the run part-way through.
* The help string says "duplicate" while the subject says "dead": the
  command selects works by duplicate count but deletes by liveness.
* Works without any doab identifier presumably get a NULL doab_count and
  are dropped by the doab_count__gt=1 filter -- verify on the target DB.
* Line structure and indentation were restored from a copy in which every
  newline had been collapsed to a space; indentation of context lines is
  inferred and must match the target file for the first hunk to apply.
* The author header carries no e-mail address in this copy.

 core/loaders/doab.py                    |  8 +++++++
 core/management/commands/dedupe_doab.py | 28 +++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 core/management/commands/dedupe_doab.py

diff --git a/core/loaders/doab.py b/core/loaders/doab.py
index 73e67d94..eeb99759 100644
--- a/core/loaders/doab.py
+++ b/core/loaders/doab.py
@@ -450,6 +450,14 @@ def getdoab(url):
         return id_match.group(1)
     return False
 
+
+def get_doab_record(doab_id):
+    record_id = 'oai:directory.doabooks.org:%s' % doab_id
+    try:
+        return doab_client.getRecord(metadataPrefix='oai_dc', identifier=record_id)
+    except IdDoesNotExistError:
+        return None
+
 def load_doab_oai(from_date, until_date, limit=100):
     '''
     use oai feed to get oai updates
diff --git a/core/management/commands/dedupe_doab.py b/core/management/commands/dedupe_doab.py
new file mode 100644
index 00000000..097db2ad
--- /dev/null
+++ b/core/management/commands/dedupe_doab.py
@@ -0,0 +1,28 @@
+from django.core.management.base import BaseCommand
+from django.db.models import Count,Subquery, OuterRef, IntegerField
+
+from regluit.core.loaders.doab import get_doab_record
+from regluit.core.models import Work, Identifier
+
+
+class Command(BaseCommand):
+    help = "remove duplicate doab ids "
+
+    def handle(self, **options):
+        doab_works = Work.objects.annotate(
+            doab_count=Subquery(
+                Identifier.objects.filter(
+                    type='doab',
+                    work=OuterRef('pk')
+                ).values('work')
+                .annotate(cnt=Count('pk'))
+                .values('cnt'),
+                output_field=IntegerField()
+            )
+        )
+        for w in doab_works.filter(doab_count__gt=1):
+            for ident in w.identifiers.filter(type="doab"):
+                record = get_doab_record(ident.value)
+                if not record:
+                    self.stdout.write('removing %s' % ident.value)
+                    ident.delete()