regluit/core/loaders/doab.py

472 lines
17 KiB
Python
Raw Normal View History

2016-11-29 20:37:02 +00:00
#!/usr/bin/env python
# encoding: utf-8
import datetime
import logging
2016-11-29 20:37:02 +00:00
import re
import requests
2018-04-07 22:38:33 +00:00
from django.db.models import Q
from django.core.files.base import ContentFile
2018-04-07 22:38:33 +00:00
from django.core.files.storage import default_storage
from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from regluit.core import bookloader, cc
2018-04-07 22:38:33 +00:00
from regluit.core import models, tasks
from regluit.core.bookloader import merge_works
2020-08-16 00:21:56 +00:00
from regluit.core.models.loader import type_for_url
2018-04-18 15:29:57 +00:00
from regluit.core.validation import identifier_cleaner, valid_subject
2018-04-12 19:08:29 +00:00
from . import scrape_language
2020-09-07 14:14:51 +00:00
from .doab_utils import doab_lang_to_iso_639_1, online_to_download
logger = logging.getLogger(__name__)
def unlist(alist):
    """
    return the first item of alist, or None when alist is empty (or None)
    """
    return alist[0] if alist else None
2018-04-07 22:38:33 +00:00
SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'


def _springer_cover_url(redirect_url):
    """
    translate a Springer ftp cover URL into its https equivalent

    returns None when redirect_url does not match SPRINGER_COVER
    """
    found = SPRINGER_COVER.match(redirect_url)
    if not found:
        return None
    # group(1) is '<13 digits>.jpg'; SPRINGER_IMAGE appends '.jpg' itself,
    # so only the digits are substituted into the template
    image_id = found.group(1)[:-len(u'.jpg')]
    return SPRINGER_IMAGE.format(image_id)


def store_doab_cover(doab_id, redo=False):
    """
    download the DOAB cover for doab_id into default_storage

    if redo is False and a cover is already stored, the stored cover is reused.

    returns tuple: 1) cover URL, 2) whether newly created (boolean)
    the cover URL is None when no usable image could be fetched
    """
    cover_file_name = '/doab/%s/cover' % (doab_id)

    # if we don't want to redo and the cover exists, return the URL of the cover
    if not redo and default_storage.exists(cover_file_name):
        return (default_storage.url(cover_file_name), False)

    # download cover image to cover_file
    url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
    try:
        r = requests.get(url, allow_redirects=False) # requests doesn't handle ftp redirects.
        if r.status_code == 302:
            redirurl = r.headers['Location']
            if redirurl.startswith(u'ftp'):
                springer_url = _springer_cover_url(redirurl)
                if springer_url is None:
                    # an ftp location we have no https equivalent for can't be fetched
                    logger.warning(
                        'Cover redirects to unsupported ftp url for doab_id={}'.format(doab_id)
                    )
                    return (None, False)
                r = requests.get(springer_url)
            else:
                r = requests.get(url)
        else:
            r = requests.get(url)

        cover_file = ContentFile(r.content)
        content_type = r.headers.get('content-type', '')
        if u'text/html' in content_type:
            # an html page is not a cover image
            logger.warning('Cover return html for doab_id={}'.format(doab_id))
            return (None, False)
        cover_file.content_type = content_type

        default_storage.save(cover_file_name, cover_file)
        return (default_storage.url(cover_file_name), True)
    except Exception as e:
        # if there is a problem, return None for cover URL
        logger.warning('Failed to make cover image for doab_id={}: {}'.format(doab_id, e))
        return (None, False)
2018-04-18 15:29:57 +00:00
def update_cover_doab(doab_id, edition, store_cover=True, redo=True):
    """
    set edition.cover_image to the cover for doab_id and save the edition

    if store_cover is True, the cover comes from store_doab_cover (redo is
    passed through to it); otherwise the doabooks.org cover URL is used directly.

    returns the cover URL, or None (edition untouched) when there is no cover
    """
    if not store_cover:
        cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
    else:
        cover_url, _ = store_doab_cover(doab_id, redo=redo)

    if cover_url is None:
        return None

    edition.cover_image = cover_url
    edition.save()
    return cover_url
2016-10-12 20:07:54 +00:00
def attach_more_doab_metadata(edition, description, subjects,
                              publication_date, publisher_name=None, language=None,
                              dois=None, authors=u''):
    """
    for given edition, attach description, subjects, publication date to
    corresponding Edition and Work

    existing values win: publication date, publisher, description and age
    level are only filled in when empty. language replaces the work's language
    unless it is empty or 'xx'. subjects and dois may be None (treated as empty).

    returns the edition
    """
    # if edition doesn't have a publication date, update it
    if not edition.publication_date:
        edition.publication_date = publication_date

    # if edition.publisher_name is empty, set it
    if not edition.publisher_name:
        edition.set_publisher(publisher_name)

    edition.save()

    # attach description to work if it's not empty
    work = edition.work
    if not work.description:
        work.description = description

    # update subjects
    for s in subjects or []:
        if valid_subject(s):
            models.Subject.set_by_name(s, work=work)

    # set reading level of work if it's empty; doab is for adults.
    if not work.age_level:
        work.age_level = '18-'

    if language and language != 'xx':
        work.language = language
    work.save()

    if authors and authors == authors: # test for authors != NaN
        # creator_list can yield None for blank names; those can't be unpacked
        authlist = [entry for entry in creator_list(authors) if entry is not None]
        # only replace the authors when the new list has more names
        if edition.authors.all().count() < len(authlist):
            edition.authors.clear()
            for rel, auth in authlist:
                edition.add_author(auth, rel)

    # the first doi is attached, and only if the work has none yet
    for doi in dois or []:
        if not edition.work.doi:
            models.Identifier.set('doi', doi, work=edition.work)
            break
    return edition
2016-10-28 18:40:16 +00:00
def add_all_isbns(isbns, work, language=None, title=None):
    """
    add an edition for each isbn, reconciling the works they end up on

    returns tuple: 1) the resulting work, 2) the first edition that was added
    (None when no isbn produced an edition)
    """
    first_edition = None
    for isbn in isbns:
        edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
        if not edition:
            continue
        first_edition = first_edition or edition

        # no work yet, or the edition is already on it: adopt the edition's work
        if not work or edition.work_id == work.id:
            work = edition.work
            continue

        # edition landed on another work; merge only when both carry differing doab ids
        if work.doab and edition.work.doab and work.doab != edition.work.doab:
            if work.created < edition.work.created:
                work = merge_works(work, edition.work)
            else:
                work = merge_works(edition.work, work)
    return work, first_edition
2016-10-28 18:40:16 +00:00
def load_doab_edition(title, doab_id, url, format, rights,
                      language, isbns, provider, dois=None, **kwargs):
    """
    load a record from doabooks.org represented by input parameters and
    return the edition its ebook is attached to

    language may be a list, in which case its first item is used.
    dois may be None (treated as empty).
    kwargs is read for 'description', 'subject', 'date', 'publisher' and 'creator'.

    raises Exception if more than one Ebook already has the given url
    """
    dois = [] if dois is None else dois
    logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))

    url = url.strip()
    if language and isinstance(language, list):
        language = language[0]
    if language == 'xx' and format == 'online':
        language = scrape_language(url)

    # check to see whether the Edition hasn't already been loaded first
    # search by url
    ebooks = models.Ebook.objects.filter(url=url)

    if len(ebooks) > 1:
        raise Exception("There is more than one Ebook matching url {0}".format(url))
    elif len(ebooks) == 1:
        # the url is already known: refresh identifiers and metadata on its edition
        ebook = ebooks[0]
        models.Identifier.get_or_add(type='doab', value=doab_id, work=ebook.edition.work)
        if not ebook.rights:
            ebook.rights = rights
            ebook.save()

        # update the cover id
        update_cover_doab(doab_id, ebook.edition, redo=False)

        # attach more metadata
        attach_more_doab_metadata(
            ebook.edition,
            description=unlist(kwargs.get('description')),
            subjects=kwargs.get('subject') or [],
            publication_date=unlist(kwargs.get('date')),
            publisher_name=unlist(kwargs.get('publisher')),
            language=language,
            authors=kwargs.get('creator'),
            dois=dois,
        )
        # make sure all isbns are added
        add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
        return ebook.edition

    # remaining case --> no ebook matched the url; load record, create ebook if there is one.

    # we need to find the right Edition/Work to tie Ebook to...
    # look for the Edition with which to associate ebook.
    # loop through the isbns to see whether we get one that is not None
    work, edition = add_all_isbns(isbns, None, language=language, title=title)
    if doab_id and not work:
        # make sure there's not already a doab_id
        idents = models.Identifier.objects.filter(type='doab', value=doab_id)
        for ident in idents:
            edition = ident.work.preferred_edition
            work = edition.work
            break

    if edition is not None:
        # if this is a new edition, then add related editions SYNCHRONOUSLY
        if getattr(edition, 'new', False):
            tasks.populate_edition(edition.isbn_13)
            edition.refresh_from_db()
        models.Identifier.get_or_add(type='doab', value=doab_id, work=edition.work)
    else:
        # we need to create Edition(s) de novo
        # if there is a Work with doab_id already, attach any new Edition(s)
        try:
            work = models.Identifier.objects.get(type='doab', value=doab_id).work
        except models.Identifier.DoesNotExist:
            work = models.Work(language=language if language else 'xx',
                               title=title, age_level='18-')
            work.save()
            models.Identifier.get_or_add(type='doab', value=doab_id, work=work)

        # if work has any ebooks already, attach the ebook to the corresponding edition
        # otherwise pick the first edition, creating one if the work has none
        editions_with_ebooks = models.Edition.objects.filter(
            Q(work__id=work.id) & Q(ebooks__isnull=False)
        ).distinct()
        if editions_with_ebooks:
            edition = editions_with_ebooks[0]
        elif work.editions.all():
            edition = work.editions.all()[0]
        else:
            edition = models.Edition(work=work, title=title)
            edition.save()

    # make the edition the selected_edition of the work
    work.selected_edition = edition
    work.save()

    if format in ('pdf', 'epub', 'mobi', 'html', 'online') and rights:
        ebook = models.Ebook()
        ebook.format = format
        ebook.provider = provider
        ebook.url = url
        ebook.rights = rights
        # tie the edition to ebook
        ebook.edition = edition
        if format == "online":
            ebook.active = False
        ebook.save()

    # update the cover id (could be done separately)
    update_cover_doab(doab_id, edition, redo=False)

    # attach more metadata
    # NOTE(review): unlike the existing-ebook path above, language is not
    # passed here -- confirm whether that is intentional
    attach_more_doab_metadata(
        edition,
        description=unlist(kwargs.get('description')),
        subjects=kwargs.get('subject') or [],
        publication_date=unlist(kwargs.get('date')),
        publisher_name=unlist(kwargs.get('publisher')),
        authors=kwargs.get('creator'),
        dois=dois,
    )

    # fill in rights on any of the edition's ebooks that lack them
    if rights:
        for ebook in edition.ebooks.all():
            if not ebook.rights:
                ebook.rights = rights
                ebook.save()
    return edition
2016-11-29 20:37:02 +00:00
2018-04-07 22:38:33 +00:00
#
2016-11-29 20:37:02 +00:00
#tools to parse the author lists in doab.csv
2018-04-07 22:38:33 +00:00
#
2016-11-29 20:37:02 +00:00
# parenthesized role markers; creator() tests these to pick a relator code
au = re.compile(r'\(Authors?\)', re.U)
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', re.U)
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', re.U)
ai = re.compile(r'\([^\)]*(Introduction|Foreword)[^\)]*\)', re.U)
ds = re.compile(r'\([^\)]*(designer)[^\)]*\)', re.U)
cm = re.compile(r'\([^\)]*(comp.)[^\)]*\)', re.U)

# separators for lists of names and of isbns
namelist = re.compile(r'([^,]+ [^, ]+)(, | and )([^,]+ [^, ]+)', re.U)
namesep = re.compile(r', | and ', re.U)
namesep2 = re.compile(r';|/| and ', re.U)
isbnsep = re.compile(r'[ ,/;\t\.]+|Paper: *|Cloth: *|eISBN: *|Hardcover: *', re.U)

# phrases marking a list of editors
edlist = re.compile(r'([eE]dited by| a cura di|editors)', re.U)
def fnf(auth):
    """
    put a comma-separated "last, first" name into "first last" order

    names longer than 60 characters are returned untouched
    """
    if len(auth) > 60:
        # probably corp name
        return auth

    pieces = [piece.strip() for piece in re.sub(r' +', u' ', auth).split(u',')]
    if len(pieces) == 1:
        return pieces[0]

    last, second = pieces[0], pieces[1]
    if len(pieces) == 2:
        return u'{} {}'.format(second, last)

    first = pieces[2]
    if second in ('der', 'van', 'von', 'de', 'ter'):
        # middle piece is a name particle: first particle last
        return u'{} {} {}'.format(first, second, last)
    # otherwise the middle piece is kept after the surname
    return u'{} {}, {}'.format(first, last, second)
2016-11-29 20:37:02 +00:00
def creator(auth, editor=False):
    """
    turn one creator string into a [relator code, name] pair

    returns None for an empty string or a bare 'and'.
    editor=True forces the 'edt' code.
    """
    auth = auth.strip()
    if auth in (u'', u'and'):
        return None

    if editor or re.search(ed, auth):
        return [u'edt', fnf(ed.sub(u'', auth))]

    # first matching role marker wins; the marker is removed from the name
    role_markers = (
        (u'trl', tr),
        (u'aui', ai),
        (u'dsr', ds),
        (u'com', cm),
    )
    for code, marker in role_markers:
        if re.search(marker, auth):
            return [code, fnf(marker.sub(u'', auth))]

    # no role marker: plain author
    return ['aut', fnf(au.sub('', auth))]
def creator_list(creators):
    """
    convert an iterable of creator strings into a list of [relator code, name] pairs

    strings that creator() rejects (it returns None for them) are left out,
    so every item of the result can be unpacked as a pair
    """
    auths = []
    for auth in creators:
        parsed = creator(auth)
        if parsed is not None:
            auths.append(parsed)
    return auths
# DOAB OAI-PMH endpoint, and the pattern getdoab() uses to find a record id in a url
DOAB_OAIURL = 'https://www.doabooks.org/oai'
DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')

# splits the value of an 'ISBN: ' identifier into individual isbn strings
ISBNSEP = re.compile(r'[/]+')

# cleaners applied to identifier strings in add_by_doab()
isbn_cleaner = identifier_cleaner('isbn', quiet=True)
doi_cleaner = identifier_cleaner('doi', quiet=True)

# OAI client that reads records in oai_dc format
mdregistry = MetadataRegistry()
mdregistry.registerReader('oai_dc', oai_dc_reader)
doab_client = Client(DOAB_OAIURL, mdregistry)
def add_by_doab(doab_id, record=None):
    """
    load the DOAB record for doab_id and return the resulting edition

    record, if given, is used instead of fetching the record from doab_client;
    its metadata is read from record[1]. The record's own metadata map is
    copied, not modified.

    returns the edition from the last download url that was loaded, or None
    when the record does not exist, has no content, or yields no download urls
    """
    if not record:
        try:
            record = doab_client.getRecord(
                metadataPrefix='oai_dc',
                identifier='oai:doab-books:{}'.format(doab_id)
            )
        except IdDoesNotExistError as e:
            logger.error(e)
            return None

    if not record[1]:
        logger.error('No content in record %s', record)
        return None

    # work on a copy: keys are popped below, and the caller may reuse the record
    metadata = dict(record[1].getMap())

    isbns = []
    dois = []
    url = None
    for ident in metadata.pop('identifier', []):
        if ident.startswith('ISBN: '):
            for isbn_string in ISBNSEP.split(ident[6:].strip()):
                isbn = isbn_cleaner(isbn_string)
                if isbn:
                    isbns.append(isbn)
        elif ident.find('doabooks.org') >= 0:
            # should already know the doab_id
            continue
        elif ident.startswith('DOI: '):
            doi = doi_cleaner(ident[5:].strip())
            if doi:
                dois.append(doi)
        else:
            # anything else is taken to be the book's url; the last one wins
            url = ident

    language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
    urls = online_to_download(url)
    title = unlist(metadata.pop('title', None))
    cc_license = cc.license_from_cc_url(unlist(metadata.pop('rights', None)))
    # 'format' is passed positionally below, so it must not also arrive via **metadata
    metadata.pop('format', None)

    edition = None
    for dl_url in urls:
        edition = load_doab_edition(
            title,
            doab_id,
            dl_url,
            type_for_url(dl_url),
            cc_license,
            language,
            isbns,
            models.Ebook.infer_provider(dl_url) if dl_url else None,
            dois=dois,
            **metadata
        )
    return edition
2018-04-07 22:38:33 +00:00
def getdoab(url):
    """
    return the record id that DOAB_PATT finds in url, or False if there is none
    """
    found = DOAB_PATT.search(url)
    return found.group(1) if found else False
def load_doab_oai(from_date, from_id=0, limit=100):
    '''
    use oai feed to get oai updates

    from_date: start of the harvest window; if falsy, the last 15 days are used
    from_id: doab ids numerically below this are skipped
    limit: stop after the record that brings the number of loaded ids to limit

    returns tuple: 1) number of doab ids processed, 2) number of editions
    created during this run, 3) the last doab id processed (None if none)
    '''
    start = datetime.datetime.now()
    if from_date:
        from_ = from_date
    else:
        # last 15 days
        from_ = datetime.datetime.now() - datetime.timedelta(days=15)

    doab_id = None
    num_doabs = 0
    new_doabs = 0
    for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
        if not record[1]:
            continue
        metadata = record[1].getMap()
        if unlist(metadata.get('type', None)) != 'book':
            continue

        # a record without identifiers is skipped instead of raising KeyError
        for ident in metadata.get('identifier') or []:
            doab = getdoab(ident)
            if not doab or int(doab) < from_id:
                continue
            doab_id = doab
            num_doabs += 1
            e = add_by_doab(doab, record=record)
            if not e:
                logger.error('null edition for doab #%s', doab)
                continue
            # editions created after this run started count as new
            if e.created > start:
                new_doabs += 1
            logger.info(u'updated:\t{}\t{}'.format(doab, e.title))

        if num_doabs >= limit:
            break
    return num_doabs, new_doabs, doab_id