doab-check/doab_check/doab_utils.py

87 lines
3.2 KiB
Python

"""
doab_utils.py
"""
import logging
import re
from ssl import SSLError
from urllib.parse import urljoin
import requests
from oaipmh.metadata import MetadataReader
from django.conf import settings
logger = logging.getLogger(__name__)
doab_reader = MetadataReader(
fields={
'title': ('textList', 'oai_dc:dc/datacite:title/text()'),
'creator': ('textList', 'oai_dc:dc/datacite:creator/text()'),
'subject': ('textList', 'oai_dc:dc/datacite:subject/text()'),
'description': ('textList', 'oai_dc:dc/dc:description/text()'),
'publisher': ('textList', 'oai_dc:dc/dc:publisher/text()'),
'editor': ('textList', 'oai_dc:dc/datacite:contributor[@type="Editor"]/text()'),
'date': ('textList', 'oai_dc:dc/datacite:date[@type="Issued"]/text()'),
'timestamp': ('textList', 'oai_dc:dc/dc:date/text()'),
'type': ('textList', 'oai_dc:dc/oaire:resourceType/text()'),
'format': ('textList', 'oai_dc:dc/dc:format/text()'),
'identifier': ('textList', 'oai_dc:dc/dc:identifier/text()'),
'source': ('textList', 'oai_dc:dc/dc:source/text()'),
'language': ('textList', 'oai_dc:dc/dc:language/text()'),
'relation': ('textList', 'oai_dc:dc/dc:relation/text()'),
'coverage': ('textList', 'oai_dc:dc/dc:coverage/text()'),
'rights': ('textList', 'oai_dc:dc/oaire:licenseCondition/@uri'),
'isbn': ('textList', 'oai_dc:dc/datacite:alternateIdentifier[@type="ISBN"]/text()'),
'doi': ('textList', 'oai_dc:dc/datacite:alternateIdentifier[@type="DOI"]/text()'),
},
namespaces={
'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
'dc' : 'http://purl.org/dc/elements/1.1/',
'grantor': 'http://purl.org/dc/elements/1.1/',
'publisher': 'http://purl.org/dc/elements/1.1/',
'oapen': 'http://purl.org/dc/elements/1.1/',
'oaire': 'https://raw.githubusercontent.com/rcic/openaire4/master/schemas/4.0/oaire.xsd',
'datacite': 'https://schema.datacite.org/meta/kernel-4.1/metadata.xsd',
'doc': 'http://www.lyncode.com/xoai'
}
)
STREAM_QUERY = 'https://directory.doabooks.org/rest/search?query=handle:{}&expand=bitstreams'
def get_streamdata(handle):
url = STREAM_QUERY.format(handle)
try:
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
items = response.json()
if items:
for stream in items[0]['bitstreams']:
if stream['bundleName'] == "THUMBNAIL":
stream['handle'] = handle
return stream
else:
logger.error("No items in streamdata for %s", handle)
except requests.exceptions.RequestException as e:
logger.error(e)
except SSLError as e:
logger.error(e)
except ValueError as e:
# decoder error
logger.error(e)
COVER_FSTRING = "https://directory.doabooks.org/bitstream/handle/{handle}/{name}?sequence={sequenceId}&isAllowed=y"
def doab_cover(doab_id):
stream_data = get_streamdata(doab_id)
if not stream_data:
logger.error('get_streamdata failed for %s', doab_id)
return None
return COVER_FSTRING.format(**stream_data)