diff --git a/core/loaders/doab.py b/core/loaders/doab.py
index 73e67d94..eeb99759 100644
--- a/core/loaders/doab.py
+++ b/core/loaders/doab.py
@@ -450,6 +450,18 @@ def getdoab(url):
         return id_match.group(1)
     return False
 
+
+def get_doab_record(doab_id):
+    '''
+    look up the oai_dc record for a doab id through doab_client;
+    returns None when the lookup raises IdDoesNotExistError
+    '''
+    record_id = 'oai:directory.doabooks.org:%s' % doab_id
+    try:
+        return doab_client.getRecord(metadataPrefix='oai_dc', identifier=record_id)
+    except IdDoesNotExistError:
+        return None
+
 def load_doab_oai(from_date, until_date, limit=100):
     '''
     use oai feed to get oai updates
diff --git a/core/management/commands/dedupe_doab.py b/core/management/commands/dedupe_doab.py
new file mode 100644
index 00000000..097db2ad
--- /dev/null
+++ b/core/management/commands/dedupe_doab.py
@@ -0,0 +1,32 @@
+from django.core.management.base import BaseCommand
+from django.db.models import Count, Subquery, OuterRef, IntegerField
+
+from regluit.core.loaders.doab import get_doab_record
+from regluit.core.models import Work, Identifier
+
+
+class Command(BaseCommand):
+    '''
+    for works carrying more than one doab identifier, delete each doab
+    identifier for which get_doab_record finds no record
+    '''
+    help = "remove duplicate doab ids "
+
+    def handle(self, *args, **options):
+        # per-work count of doab identifiers, evaluated as a correlated subquery
+        doab_counts = (
+            Identifier.objects.filter(type='doab', work=OuterRef('pk'))
+            .values('work')
+            .annotate(cnt=Count('pk'))
+            .values('cnt')
+        )
+        doab_works = Work.objects.annotate(
+            doab_count=Subquery(doab_counts, output_field=IntegerField())
+        )
+        for work in doab_works.filter(doab_count__gt=1):
+            for ident in work.identifiers.filter(type="doab"):
+                # keep identifiers whose record lookup returned something truthy
+                if get_doab_record(ident.value):
+                    continue
+                self.stdout.write('removing %s' % ident.value)
+                ident.delete()