doab-check/doab_check/doab_oai.py

164 lines
5.2 KiB
Python
Raw Normal View History

2023-02-20 21:03:07 +00:00
#!/usr/bin/env python
# encoding: utf-8
import datetime
import logging
import re
2023-04-21 21:17:04 +00:00
import pytz
from dateutil.parser import isoparse
from dateutil.utils import default_tzinfo
2023-02-20 21:03:07 +00:00
from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError, NoRecordsMatchError
from oaipmh.metadata import MetadataRegistry
import requests
from .doab_utils import doab_reader
from .models import Item, Link, Timestamp
2023-02-20 21:03:07 +00:00
# base url of the DOAB OAI-PMH endpoint used by doab_client
DOAB_OAIURL = 'https://directory.doabooks.org/oai/request'
# matches DOAB OAI identifiers written in either namespace
# ('directory.doabooks.org' or 'doab-books'); group(2) is everything after
# the namespace
DOAB_PATT = re.compile(r'oai:(directory\.doabooks\.org|doab-books):(.*)')
2023-02-20 21:03:07 +00:00
logger = logging.getLogger(__name__)
# 'oai_dc' metadata returned by the feed is parsed with doab_reader
mdregistry = MetadataRegistry()
mdregistry.registerReader('oai_dc', doab_reader)
# module-level OAI client shared by the functions below
doab_client = Client(DOAB_OAIURL, mdregistry)
def unlist(alist):
    """Return the first element of alist, or None if alist is empty or None."""
    return alist[0] if alist else None
def getdoab(url, new_ns=False):
    """
    Normalize a DOAB OAI identifier found in url.

    Returns the identifier rewritten in the 'directory.doabooks.org'
    namespace when new_ns is true, otherwise in the 'doab-books' namespace.
    Returns False when url contains no DOAB OAI identifier.
    """
    found = DOAB_PATT.search(url)
    if not found:
        return False
    tail = found.group(2)
    if new_ns:
        return f'oai:directory.doabooks.org:{tail}'
    return f'oai:doab-books:{tail}'
def add_by_doab(doab_id, record=None):
    """
    Create or update the Item for a DOAB record.

    doab_id: OAI identifier of the record; it is used as the Item key and,
        when no record is supplied, to fetch the record from the OAI feed.
    record: optional already-fetched OAI record, indexed as record[0]
        (header) and record[1] (metadata).

    Returns the Item from load_doab_record; the result of set_deleted() when
    the record is deleted or has no metadata; or None when doab_id is not a
    DOAB identifier or the feed does not know it.
    """
    if not record:
        # the feed is queried with the directory.doabooks.org form of the id
        oai_id = getdoab(doab_id, new_ns=True)
        if not oai_id:
            # getdoab returns False for ids that don't match DOAB_PATT;
            # don't send that to the server as an identifier
            logger.error('%s is not a doab identifier', doab_id)
            return None
        try:
            record = doab_client.getRecord(metadataPrefix='oai_dc', identifier=oai_id)
        except IdDoesNotExistError as e:
            logger.error(e)
            return None

    if record[0].isDeleted() or not record[1]:
        logger.warning('record %s has no content or is deleted', record)
        return set_deleted(record)

    metadata = record[1].getMap()
    urls = []
    for ident in metadata.pop('identifier', []):
        if 'doabooks.org' in ident:
            # should already know the doab_id
            continue
        if ident.startswith('http'):
            urls.append(ident)
    title = unlist(metadata.pop('title', ['']))
    # load_doab_record stores a missing publisher as ''
    publisher_name = unlist(metadata.pop('publisher', ['']))
    item_type = unlist(metadata.pop('type', []))
    timestamps = metadata.pop('timestamp', [])
    # whatever is left in metadata is passed through as keyword arguments
    return load_doab_record(
        doab_id,
        title,
        publisher_name,
        item_type,
        urls,
        timestamps,
        **metadata
    )
2023-03-30 18:32:59 +00:00
def load_doab_record(doab_id, title, publisher_name, item_type, urls, timestamps, **kwargs):
    """
    create or update the Item for a doabooks.org record from the input parameters

    doab_id: identifier used as the Item's `doab` key
    title, publisher_name, item_type: values stored on the Item; a falsy
        publisher_name is stored as ''
    urls: link urls for the record; surrounding whitespace is ignored
    timestamps: ISO 8601 strings; values without a timezone are taken as UTC
    kwargs: remaining metadata fields; accepted but not used

    returns the saved Item
    """
    logger.info('load doab %s', doab_id)
    (new_item, _) = Item.objects.get_or_create(doab=doab_id)
    new_item.title = title
    new_item.publisher_name = publisher_name if publisher_name else ''
    new_item.resource_type = item_type
    new_item.save()

    for timestamp in timestamps:
        timestamp = default_tzinfo(isoparse(timestamp), pytz.UTC)
        Timestamp.objects.get_or_create(datetime=timestamp, item=new_item)

    # Links are stored under the stripped url, so strip once and use the same
    # values for the membership test below.
    clean_urls = [url.strip() for url in urls]
    for url in clean_urls:
        (link, _) = Link.objects.get_or_create(url=url)
        link.items.add(new_item)

    # status 1 for identifier links the record still lists, 0 for the rest
    current_urls = set(clean_urls)
    for linkrel in new_item.related.filter(role='identifier'):
        linkrel.status = 1 if linkrel.link.url in current_urls else 0
        # persist the change, as set_deleted does
        linkrel.save()
    return new_item
2023-02-20 21:03:07 +00:00
2023-04-19 17:15:53 +00:00
def set_deleted(record):
    """
    Set status 0 on the Item for a deleted OAI record and on its link relations.

    record: OAI record whose header is record[0].

    Returns the updated Item, or None when the header is not marked deleted
    or no Item exists for the identifier.
    """
    header = record[0]
    if not header.isDeleted():
        return None
    doab = getdoab(header.identifier())
    try:
        found = Item.objects.get(doab=doab)
        found.status = 0
        found.save()
        for relation in found.related.all():
            relation.status = 0
            relation.save()
        return found
    except Item.DoesNotExist:
        logger.warning(f'no item {doab}')
    return None
2023-02-20 21:03:07 +00:00
def load_doab_oai(from_date, until_date, limit=100):
    '''
    use oai feed to get oai updates

    from_date: start of the harvest window; when falsy the window starts
        7 days before now
    until_date: end of the harvest window, passed to the feed as `until`
    limit: stop once this many doab records have been processed

    returns (num_doabs, new_doabs, lasttime):
        num_doabs - records with a doab identifier that were processed
        new_doabs - of those, items whose `created` is later than the start
            of this run
        lasttime - latest datestamp seen on a non-deleted record, or
            datetime(2000, 1, 1) if there was none
    '''
    start = datetime.datetime.now(pytz.UTC)
    if from_date:
        from_ = from_date
    else:
        # last 7 days. OAI-PMH datestamps are UTC, so the default is built
        # from UTC rather than local time; it is kept naive like the other
        # datetimes compared in this function (lasttime).
        from_ = start.replace(tzinfo=None) - datetime.timedelta(days=7)
    num_doabs = 0
    new_doabs = 0
    lasttime = datetime.datetime(2000, 1, 1)
    try:
        for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_,
                                              until=until_date):
            if not record[1]:
                # probably a deleted record
                set_deleted(record)
                continue
            ident = record[0].identifier()
            responsestamp = record[0].datestamp()
            lasttime = responsestamp if responsestamp > lasttime else lasttime
            doab = getdoab(ident)
            if doab:
                num_doabs += 1
                item = add_by_doab(doab, record=record)
                if not item:
                    logger.error('error for doab #%s', doab)
                    continue
                if item.created > start:
                    new_doabs += 1
                logger.info(u'updated:\t%s\t%s', doab, item.title)
            if num_doabs >= limit:
                break
    except NoRecordsMatchError:
        # an empty window is not an error; report zero counts
        pass
    return num_doabs, new_doabs, lasttime