doab-check/doab_check/doab_oai.py

164 lines
5.2 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
import datetime
import logging
import re
import pytz
from dateutil.parser import isoparse
from dateutil.utils import default_tzinfo
from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError, NoRecordsMatchError
from oaipmh.metadata import MetadataRegistry
import requests
from .doab_utils import doab_reader
from .models import Item, Link, Timestamp
# URL of the DOAB OAI endpoint that doab_client talks to.
DOAB_OAIURL = 'https://directory.doabooks.org/oai/request'
# Matches OAI identifiers in either namespace ('directory.doabooks.org' or
# 'doab-books'); group(2) is everything after the namespace prefix.
DOAB_PATT = re.compile(r'oai:(directory\.doabooks\.org|doab-books):(.*)')
logger = logging.getLogger(__name__)
# Register doab_reader as the reader for 'oai_dc' metadata, then build the
# module-level client shared by the functions below.
mdregistry = MetadataRegistry()
mdregistry.registerReader('oai_dc', doab_reader)
doab_client = Client(DOAB_OAIURL, mdregistry)
def unlist(alist):
    """Return the first element of *alist*, or None when *alist* is empty or falsy."""
    return alist[0] if alist else None
def getdoab(url, new_ns=False):
    """
    Extract a DOAB OAI identifier from *url* and rewrite it in one namespace.

    Returns the identifier prefixed with 'oai:directory.doabooks.org:' when
    *new_ns* is true, otherwise with 'oai:doab-books:'. Returns False when
    *url* contains no identifier matching DOAB_PATT.
    """
    found = DOAB_PATT.search(url)
    if not found:
        return False
    # group(2) is the part of the identifier after the namespace prefix
    local_id = found.group(2)
    if new_ns:
        return f'oai:directory.doabooks.org:{local_id}'
    return f'oai:doab-books:{local_id}'
def add_by_doab(doab_id, record=None):
    """
    Load the record for *doab_id* into the database.

    Args:
        doab_id: DOAB identifier; passed through to load_doab_record and,
            when no record is supplied, converted with getdoab(new_ns=True)
            to fetch the record from doab_client.
        record: optional already-fetched record; element 0 is used as the
            header and element 1 as the metadata. Fetched when falsy.

    Returns:
        The item returned by load_doab_record; the result of set_deleted()
        when the record is deleted or has no metadata; None when the
        identifier does not exist on the server.
    """
    try:
        if not record:
            record = doab_client.getRecord(
                metadataPrefix='oai_dc',
                identifier=getdoab(doab_id, new_ns=True),
            )
        header, payload = record[0], record[1]
        if header.isDeleted() or not payload:
            logger.warning('record %s has no content or is deleted', record)
            return set_deleted(record)

        metadata = payload.getMap()
        # keep only http(s) identifiers that do not point back at doabooks.org
        # (the doab_id is already known)
        urls = [
            ident
            for ident in metadata.pop('identifier', [])
            if 'doabooks.org' not in ident and ident.startswith('http')
        ]
        title = unlist(metadata.pop('title', ['']))
        publisher_name = unlist(metadata.pop('publisher', ['']))
        item_type = unlist(metadata.pop('type', []))
        timestamps = metadata.pop('timestamp', [])
        # whatever is left in metadata is forwarded as keyword arguments
        return load_doab_record(
            doab_id,
            title,
            publisher_name or '',
            item_type,
            urls,
            timestamps,
            **metadata
        )
    except IdDoesNotExistError as e:
        logger.error(e)
    return None
def load_doab_record(doab_id, title, publisher_name, item_type, urls, timestamps, **kwargs):
    """
    Create or update the Item for a doabooks.org record and sync its
    timestamps and links.

    Args:
        doab_id: value of ``Item.doab`` used to get or create the item.
        title: stored in ``Item.title``.
        publisher_name: stored in ``Item.publisher_name``; falsy becomes ''.
        item_type: stored in ``Item.resource_type``.
        urls: URL strings to attach to the item as Links (whitespace-stripped).
        timestamps: ISO 8601 strings; values without a timezone are taken as UTC.
        **kwargs: any further metadata fields; accepted and ignored.

    Returns:
        The saved Item.
    """
    logger.info('load doab %s', doab_id)
    new_item, _ = Item.objects.get_or_create(doab=doab_id)
    new_item.title = title
    new_item.publisher_name = publisher_name if publisher_name else ''
    new_item.resource_type = item_type
    new_item.save()

    for raw_stamp in timestamps:
        stamp = default_tzinfo(isoparse(raw_stamp), pytz.UTC)
        Timestamp.objects.get_or_create(datetime=stamp, item=new_item)

    # Strip once, so that link creation and the status check below compare
    # the same strings (links are stored under the stripped URL).
    clean_urls = [url.strip() for url in urls]
    for url in clean_urls:
        link, _ = Link.objects.get_or_create(url=url)
        link.items.add(new_item)

    current_urls = set(clean_urls)
    for linkrel in new_item.related.filter(role='identifier'):
        # status 1 when the link is still listed in the record, 0 otherwise
        linkrel.status = 1 if linkrel.link.url in current_urls else 0
        # persist the change; without save() the new status is discarded
        linkrel.save()
    return new_item
def set_deleted(record):
    """
    Mark the item for a deleted record, and all of its link relations,
    with status 0.

    Returns the updated Item, or None when the record's header is not
    flagged as deleted or no matching Item exists.
    """
    header = record[0]
    if not header.isDeleted():
        return None
    doab = getdoab(header.identifier())
    try:
        item = Item.objects.get(doab=doab)
        item.status = 0
        item.save()
        for linkrel in item.related.all():
            linkrel.status = 0
            linkrel.save()
        return item
    except Item.DoesNotExist:
        logger.warning(f'no item {doab}')
    return None
def load_doab_oai(from_date, until_date, limit=100):
    '''
    Use the OAI feed to fetch record updates and load them.

    Args:
        from_date: start of the harvest window; when falsy, the window
            starts 7 days before now.
        until_date: end of the harvest window, passed to the client as-is.
        limit: stop once this many records with a DOAB identifier have
            been processed.

    Returns:
        A tuple ``(num_doabs, new_doabs, lasttime)``: the number of DOAB
        records processed, how many of their items were created during
        this run, and the latest datestamp seen on a non-deleted record
        (``datetime(2000, 1, 1)`` if there was none).
    '''
    start = datetime.datetime.now(pytz.UTC)
    if from_date:
        from_ = from_date
    else:
        # last 7 days, as a naive UTC datetime (same clock as `start`).
        # NOTE(review): the OAI client is expected to serialize naive
        # datetimes as UTC datestamps, so local time must not be used here.
        from_ = start.replace(tzinfo=None) - datetime.timedelta(days=7)
    num_doabs = 0
    new_doabs = 0
    lasttime = datetime.datetime(2000, 1, 1)
    try:
        for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_,
                                              until=until_date):
            if not record[1]:
                # probably a deleted record
                set_deleted(record)
                continue
            ident = record[0].identifier()
            responsestamp = record[0].datestamp()
            lasttime = responsestamp if responsestamp > lasttime else lasttime
            doab = getdoab(ident)
            if doab:
                num_doabs += 1
                item = add_by_doab(doab, record=record)
                if not item:
                    logger.error('error for doab #%s', doab)
                    continue
                # items created after this run started count as new
                if item.created > start:
                    new_doabs += 1
                title = item.title
                logger.info(u'updated:\t%s\t%s', doab, title)
            if num_doabs >= limit:
                break
    except NoRecordsMatchError:
        # an empty harvest window is not an error; return the zero counts
        pass
    return num_doabs, new_doabs, lasttime