415 lines
15 KiB
Python
415 lines
15 KiB
Python
#!/usr/bin/env python
|
|
# encoding: utf-8
|
|
import logging
|
|
import json
|
|
import re
|
|
|
|
from itertools import islice
|
|
|
|
import requests
|
|
|
|
from django.db.models import (Q, F)
|
|
|
|
from django.core.files.storage import default_storage
|
|
from django.core.files.base import ContentFile
|
|
|
|
import regluit
|
|
from regluit.core import models, tasks
|
|
from regluit.core import bookloader
|
|
from regluit.core.bookloader import add_by_isbn, merge_works
|
|
from regluit.core.isbn import ISBN
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
|
|
|
|
def store_doab_cover(doab_id, redo=False):
|
|
|
|
"""
|
|
returns tuple: 1) cover URL, 2) whether newly created (boolean)
|
|
"""
|
|
|
|
cover_file_name= '/doab/%s/cover' % (doab_id)
|
|
|
|
# if we don't want to redo and the cover exists, return the URL of the cover
|
|
|
|
if not redo and default_storage.exists(cover_file_name):
|
|
return (default_storage.url(cover_file_name), False)
|
|
|
|
# download cover image to cover_file
|
|
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
|
|
try:
|
|
r = requests.get(url, allow_redirects=False) # requests doesn't handle ftp redirects.
|
|
if r.status_code == 302:
|
|
redirurl = r.headers['Location']
|
|
if redirurl.startswith(u'ftp'):
|
|
springerftp = springercover.match(redirurl)
|
|
if springerftp:
|
|
redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1))
|
|
r = requests.get(redirurl)
|
|
else:
|
|
r = requests.get(url)
|
|
cover_file = ContentFile(r.content)
|
|
cover_file.content_type = r.headers.get('content-type', '')
|
|
|
|
path = default_storage.save(cover_file_name, cover_file)
|
|
return (default_storage.url(cover_file_name), True)
|
|
except Exception, e:
|
|
# if there is a problem, return None for cover URL
|
|
logger.warning('Failed to make cover image for doab_id={}: {}'.format(doab_id, e))
|
|
return (None, False)
|
|
|
|
def update_cover_doab(doab_id, edition, store_cover=True):
|
|
"""
|
|
update the cover url for work with doab_id
|
|
if store_cover is True, use the cover from our own storage
|
|
"""
|
|
if store_cover:
|
|
(cover_url, new_cover) = store_doab_cover(doab_id)
|
|
else:
|
|
cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
|
|
|
|
if cover_url is not None:
|
|
edition.cover_image = cover_url
|
|
edition.save()
|
|
return cover_url
|
|
else:
|
|
return None
|
|
|
|
def attach_more_doab_metadata(edition, description, subjects,
|
|
publication_date, publisher_name=None, language=None, authors=u''):
|
|
|
|
"""
|
|
for given edition, attach description, subjects, publication date to
|
|
corresponding Edition and Work
|
|
"""
|
|
# if edition doesn't have a publication date, update it
|
|
if not edition.publication_date:
|
|
edition.publication_date = publication_date
|
|
|
|
# if edition.publisher_name is empty, set it
|
|
if not edition.publisher_name:
|
|
edition.set_publisher(publisher_name)
|
|
|
|
edition.save()
|
|
|
|
# attach description to work if it's not empty
|
|
work = edition.work
|
|
if not work.description:
|
|
work.description = description
|
|
|
|
# update subjects
|
|
for s in subjects:
|
|
if bookloader.valid_subject(s):
|
|
work.subjects.add(models.Subject.objects.get_or_create(name=s)[0])
|
|
|
|
# set reading level of work if it's empty; doab is for adults.
|
|
if not work.age_level:
|
|
work.age_level = '18-'
|
|
|
|
if language:
|
|
work.language = language
|
|
work.save()
|
|
|
|
if authors and authors == authors: # test for authors != NaN
|
|
authlist = creator_list(authors)
|
|
if edition.authors.all().count() < len(authlist):
|
|
edition.authors.clear()
|
|
if authlist is not None:
|
|
for [rel,auth] in authlist:
|
|
edition.add_author(auth, rel)
|
|
|
|
return edition
|
|
|
|
def add_all_isbns(isbns, work, language=None, title=None):
|
|
first_edition = None
|
|
for isbn in isbns:
|
|
first_edition = None
|
|
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
|
|
if edition:
|
|
first_edition = first_edition if first_edition else edition
|
|
if work and (edition.work.id != work.id):
|
|
if work.created < edition.work.created:
|
|
merge_works(work, edition.work)
|
|
else:
|
|
merge_works(edition.work, work)
|
|
else:
|
|
work = edition.work
|
|
return first_edition
|
|
|
|
def load_doab_edition(title, doab_id, url, format, rights,
|
|
language, isbns,
|
|
provider, **kwargs):
|
|
|
|
"""
|
|
load a record from doabooks.org represented by input parameters and return an ebook
|
|
"""
|
|
if language and isinstance(language, list):
|
|
language = language[0]
|
|
|
|
# check to see whether the Edition hasn't already been loaded first
|
|
# search by url
|
|
ebooks = models.Ebook.objects.filter(url=url)
|
|
|
|
# 1 match
|
|
# > 1 matches
|
|
# 0 match
|
|
|
|
# simplest case -- if match (1 or more), we could check whether any
|
|
# ebook.edition.work has a doab id matching given doab_id
|
|
|
|
# put a migration to force Ebook.url to be unique id
|
|
|
|
# if yes, then return one of the Edition(s) whose work is doab_id
|
|
# if no, then
|
|
ebook = None
|
|
if len(ebooks) > 1:
|
|
raise Exception("There is more than one Ebook matching url {0}".format(url))
|
|
elif len(ebooks) == 1:
|
|
ebook = ebooks[0]
|
|
doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
|
|
work=ebook.edition.work)
|
|
# update the cover id
|
|
cover_url = update_cover_doab(doab_id, ebook.edition)
|
|
|
|
# attach more metadata
|
|
attach_more_doab_metadata(ebook.edition,
|
|
description=kwargs.get('description'),
|
|
subjects=kwargs.get('subject'),
|
|
publication_date=kwargs.get('date'),
|
|
publisher_name=kwargs.get('publisher'),
|
|
language=language,
|
|
authors=kwargs.get('authors'),)
|
|
# make sure all isbns are added
|
|
add_all_isbns(isbns, None, language=language, title=title)
|
|
return ebook
|
|
|
|
# remaining case --> no ebook, load record, create ebook if there is one.
|
|
assert len(ebooks) == 0
|
|
|
|
|
|
# we need to find the right Edition/Work to tie Ebook to...
|
|
|
|
# look for the Edition with which to associate ebook.
|
|
# loop through the isbns to see whether we get one that is not None
|
|
work = None
|
|
edition = add_all_isbns(isbns, None, language=language, title=title)
|
|
if edition:
|
|
edition.refresh_from_db()
|
|
work = edition.work
|
|
|
|
if doab_id and not work:
|
|
# make sure there's not already a doab_id
|
|
idents = models.Identifier.objects.filter(type='doab', value=doab_id)
|
|
for ident in idents:
|
|
edition = ident.work.preferred_edition
|
|
work = edition.work
|
|
break
|
|
|
|
if edition is not None:
|
|
# if this is a new edition, then add related editions asynchronously
|
|
if getattr(edition,'new', False):
|
|
tasks.populate_edition.delay(edition.isbn_13)
|
|
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
|
|
work=edition.work)
|
|
|
|
# we need to create Edition(s) de novo
|
|
else:
|
|
# if there is a Work with doab_id already, attach any new Edition(s)
|
|
try:
|
|
work = models.Identifier.objects.get(type='doab', value=doab_id).work
|
|
except models.Identifier.DoesNotExist:
|
|
if language:
|
|
work = models.Work(language=language, title=title, age_level='18-')
|
|
else:
|
|
work = models.Work(language='xx', title=title, age_level='18-')
|
|
work.save()
|
|
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
|
|
work=work)
|
|
|
|
# if work has any ebooks already, attach the ebook to the corresponding edition
|
|
# otherwise pick the first one
|
|
# pick the first edition as the one to tie ebook to
|
|
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
|
|
Q(ebooks__isnull=False)).distinct()
|
|
if editions_with_ebooks:
|
|
edition = editions_with_ebooks[0]
|
|
elif work.editions.all():
|
|
edition = work.editions.all()[0]
|
|
else:
|
|
edition = models.Edition(work=work, title=title)
|
|
edition.save()
|
|
|
|
# make the edition the selected_edition of the work
|
|
work.selected_edition = edition
|
|
work.save()
|
|
|
|
if format in ('pdf', 'epub', 'mobi'):
|
|
ebook = models.Ebook()
|
|
ebook.format = format
|
|
ebook.provider = provider
|
|
ebook.url = url
|
|
ebook.rights = rights
|
|
# tie the edition to ebook
|
|
ebook.edition = edition
|
|
ebook.save()
|
|
|
|
# update the cover id (could be done separately)
|
|
cover_url = update_cover_doab(doab_id, edition)
|
|
|
|
# attach more metadata
|
|
attach_more_doab_metadata(edition,
|
|
description=kwargs.get('description'),
|
|
subjects=kwargs.get('subject'),
|
|
publication_date=kwargs.get('date'),
|
|
publisher_name=kwargs.get('publisher'),
|
|
authors=kwargs.get('authors'),)
|
|
return ebook
|
|
|
|
|
|
def load_doab_records(fname, limit=None):
|
|
|
|
success_count = 0
|
|
ebook_count = 0
|
|
|
|
records = json.load(open(fname))
|
|
|
|
for (i, book) in enumerate(islice(records,limit)):
|
|
d = dict(book)
|
|
d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
|
|
try:
|
|
ebook = load_doab_edition(**d)
|
|
success_count += 1
|
|
if ebook:
|
|
ebook_count +=1
|
|
except Exception, e:
|
|
logger.error(e)
|
|
logger.error(book)
|
|
|
|
logger.info("Number of records processed: " + str(success_count))
|
|
logger.info("Number of ebooks processed: " + str(ebook_count))
|
|
|
|
"""
|
|
#tools to parse the author lists in doab.csv
|
|
from pandas import DataFrame
|
|
url = "http://www.doabooks.org/doab?func=csv"
|
|
df_csv = DataFrame.from_csv(url)
|
|
|
|
out=[]
|
|
for val in df_csv.values:
|
|
isbn = split_isbns(val[0])
|
|
if isbn:
|
|
auths = []
|
|
if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
|
|
auths = creator_list(val[2])
|
|
out.append(( isbn[0], auths))
|
|
open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
|
|
"""
|
|
|
|
au = re.compile(r'\(Authors?\)', flags=re.U)
|
|
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
|
|
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
|
|
ai = re.compile(r'\([^\)]*(Introduction|Foreword)[^\)]*\)', flags=re.U)
|
|
ds = re.compile(r'\([^\)]*(designer)[^\)]*\)', flags=re.U)
|
|
cm = re.compile(r'\([^\)]*(comp.)[^\)]*\)', flags=re.U)
|
|
namelist = re.compile(r'([^,]+ [^, ]+)(, | and )([^,]+ [^, ]+)', flags=re.U)
|
|
namesep = re.compile(r', | and ', flags=re.U)
|
|
namesep2 = re.compile(r';|/| and ', flags=re.U)
|
|
isbnsep = re.compile(r'[ ,/;\t\.]+|Paper: *|Cloth: *|eISBN: *|Hardcover: *', flags=re.U)
|
|
edlist = re.compile(r'([eE]dited by| a cura di|editors)', flags=re.U)
|
|
|
|
def fnf(auth):
|
|
if len(auth) > 60:
|
|
return auth #probably corp name
|
|
parts = re.sub(r' +', u' ', auth).split(u',')
|
|
if len(parts) == 1:
|
|
return parts[0].strip()
|
|
elif len(parts) == 2:
|
|
return u'{} {}'.format(parts[1].strip(),parts[0].strip())
|
|
else:
|
|
if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
|
|
return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
|
|
#print auth
|
|
#print re.search(namelist,auth).group(0)
|
|
return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
|
|
|
|
|
|
def creator(auth, editor=False):
|
|
auth = auth.strip()
|
|
if auth in (u'', u'and'):
|
|
return None
|
|
if re.search(ed, auth) or editor:
|
|
return [u'edt', fnf(ed.sub(u'', auth))]
|
|
if re.search(tr, auth):
|
|
return [u'trl', fnf(tr.sub(u'', auth))]
|
|
if re.search(ai, auth):
|
|
return [u'aui', fnf(ai.sub(u'', auth))]
|
|
if re.search(ds, auth):
|
|
return [u'dsr', fnf(ds.sub(u'', auth))]
|
|
if re.search(cm, auth):
|
|
return [u'com', fnf(cm.sub(u'', auth))]
|
|
|
|
auth = au.sub('', auth)
|
|
return ['aut', fnf(auth)]
|
|
|
|
def split_auths(auths):
|
|
if ';' in auths or '/' in auths:
|
|
return namesep2.split(auths)
|
|
else:
|
|
nl = namelist.match(auths.strip())
|
|
if nl:
|
|
if nl.group(3).endswith(' de') \
|
|
or ' de ' in nl.group(3) \
|
|
or nl.group(3).endswith(' da') \
|
|
or nl.group(1).endswith(' Jr.') \
|
|
or ' e ' in nl.group(1):
|
|
return [auths]
|
|
else:
|
|
return namesep.split(auths)
|
|
else :
|
|
return [auths]
|
|
|
|
def split_isbns(isbns):
|
|
result = []
|
|
for isbn in isbnsep.split(isbns):
|
|
isbn = ISBN(isbn)
|
|
if isbn.valid:
|
|
result.append(isbn.to_string())
|
|
return result
|
|
|
|
def creator_list(creators):
|
|
auths = []
|
|
if re.search(edlist, creators):
|
|
for auth in split_auths(edlist.sub(u'', creators)):
|
|
if auth:
|
|
auths.append(creator(auth, editor=True))
|
|
else:
|
|
for auth in split_auths(unicode(creators)):
|
|
if auth:
|
|
auths.append(creator(auth))
|
|
return auths
|
|
|
|
def load_doab_auths(fname, limit=None):
|
|
doab_auths = json.load(open(fname))
|
|
recnum = 0
|
|
failed = 0
|
|
for [isbnraw, authlist] in doab_auths:
|
|
isbn = ISBN(isbnraw).to_string()
|
|
try:
|
|
work = models.Identifier.objects.get(type='isbn',value=isbn).work
|
|
except models.Identifier.DoesNotExist:
|
|
print 'isbn = {} not found'.format(isbnraw)
|
|
failed += 1
|
|
if work.preferred_edition.authors.all().count() < len(authlist):
|
|
work.preferred_edition.authors.clear()
|
|
if authlist is None:
|
|
print "null authlist; isbn={}".format(isbn)
|
|
continue
|
|
for [rel,auth] in authlist:
|
|
work.preferred_edition.add_author(auth, rel)
|
|
recnum +=1
|
|
if limit and recnum > limit:
|
|
break
|
|
logger.info("Number of records processed: " + str(recnum))
|
|
logger.info("Number of missing isbns: " + str(failed))
|
|
|