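"""
Utilities for loading book, work, and edition metadata into the regluit
(unglue.it) database from Google Books, LibraryThing, OpenLibrary, and
Project Gutenberg.
"""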
import json
import logging
import datetime
import warnings

import requests

from xml.etree import ElementTree
from itertools import islice

from django.conf import settings
from django.db import IntegrityError
from django.contrib.comments.models import Comment

import regluit
from regluit.core import models
import regluit.core.isbn

logger = logging.getLogger(__name__)


def add_by_oclc(oclc, work=None):
    # this is indirection in case we have a data source other than google
    return add_by_oclc_from_google(oclc)


def add_by_oclc_from_google(oclc):
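    """look up an edition by OCLC number, adding it from the Google Books
    API if we don't already have it. Returns None on failure.
    """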
    if oclc:
        logger.info("adding book by oclc %s", oclc)
    else:
        return None
    try:
        return models.Identifier.objects.get(type='oclc', value=oclc).edition
    except models.Identifier.DoesNotExist:
        url = "https://www.googleapis.com/books/v1/volumes"
        try:
            results = _get_json(url, {"q": '"OCLC%s"' % oclc})
        except LookupFailure:
            logger.exception("lookup failure for %s", oclc)
            return None
        if not results.has_key('items') or len(results['items']) == 0:
            logger.warn("no google hits for %s", oclc)
            return None

        try:
            e = add_by_googlebooks_id(results['items'][0]['id'], results=results['items'][0])
            models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
            return e
        except LookupFailure:
            logger.exception("failed to add edition for %s", oclc)
        except IntegrityError:
            logger.exception("google books data for %s didn't fit our db", oclc)
        return None


def add_by_isbn(isbn, work=None):
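    """add an edition for an ISBN, via Google Books. If Google has no record
    but a work with a title was supplied, create a stub work and edition.
    """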
    if not isbn:
        return None
    try:
        e = add_by_isbn_from_google(isbn, work=work)
    except LookupFailure:
        logger.exception("failed google lookup for %s", isbn)
        # try again some other time
        return None
    if e:
        return e

    logger.info("null came back from add_by_isbn_from_google: %s", isbn)

    if not work or not work.title:
        return None

    # if there's a work with a title, we want to create stub editions and
    # works, even if google doesn't know about it -- but if the isbn isn't
    # valid, forget it!
    try:
        isbn = regluit.core.isbn.ISBN(isbn)
    except Exception:
        logger.exception("invalid isbn: %s", isbn)
        return None
    if not isbn.valid:
        return None
    isbn = isbn.to_string()

    # we don't know the language -> 'xx'
    w = models.Work(title=work.title, language='xx')
    w.save()
    e = models.Edition(title=work.title, work=w)
    e.save()
    e.new = True
    models.Identifier(type='isbn', value=isbn, work=w, edition=e).save()
    return e


def get_google_isbn_results(isbn):
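    """query the Google Books API for an ISBN and return the parsed JSON
    results, or None if the lookup fails or has no hits.
    """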
    url = "https://www.googleapis.com/books/v1/volumes"
    try:
        results = _get_json(url, {"q": "isbn:%s" % isbn})
    except LookupFailure:
        logger.exception("lookup failure for %s", isbn)
        return None
    if not results.has_key('items') or len(results['items']) == 0:
        logger.warn("no google hits for %s", isbn)
        return None
    else:
        return results


def add_ebooks(item, edition):
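    """create Ebook records for any epub or pdf download links that Google
    reports for this volume item.
    """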
    access_info = item.get('accessInfo')
    if access_info:
        edition.public_domain = access_info.get('publicDomain', None)
        epub = access_info.get('epub')
        if epub and epub.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='epub',
                                 url=epub.get('downloadLink'),
                                 provider='google')
            try:
                ebook.save()
            except IntegrityError:
                pass

        pdf = access_info.get('pdf')
        if pdf and pdf.get('downloadLink'):
            ebook = models.Ebook(edition=edition, format='pdf',
                                 url=pdf.get('downloadLink', None),
                                 provider='google')
            try:
                ebook.save()
            except IntegrityError:
                pass


def update_edition(edition):
    """
    attempt to update data associated with input edition and return that updated edition
    """
    # if there is no ISBN associated with edition, just return the input edition
    try:
        isbn = edition.identifiers.filter(type='isbn')[0].value
    except IndexError:
        return edition

    # do a Google Books lookup on the isbn associated with the edition
    # (there should be either 0 or 1 isbns associated with an edition
    # because of the integrity constraint in Identifier)

    # if we get some data about this isbn back from Google, update the
    # edition data accordingly
    results = get_google_isbn_results(isbn)
    if not results:
        return edition
    item = results['items'][0]
    googlebooks_id = item['id']
    d = item['volumeInfo']
    if d.has_key('title'):
        title = d['title']
    else:
        title = ''
    if len(title) == 0:
        # need a title to make an edition record; some crap records in GB.
        # use title from parent if available
        title = edition.work.title

    # check for language change
    language = d['language']
    # don't track variants in main language (e.g., traditional vs simplified Chinese)
    if len(language) > 2:
        language = language[0:2]

    # if the language of the edition no longer matches that of the parent
    # work, attach the edition to a new work in the right language
    if edition.work.language != language:
        logger.info("reconnecting %s since it is %s instead of %s",
                    googlebooks_id, language, edition.work.language)
        old_work = edition.work

        new_work = models.Work(title=title, language=language)
        new_work.save()
        edition.work = new_work
        edition.save()
        for identifier in edition.identifiers.all():
            logger.info("moving identifier %s", identifier.value)
            identifier.work = new_work
            identifier.save()
        if old_work.editions.count() == 0:
            # a dangling work; make sure nothing else is attached!
            merge_works(new_work, old_work)

    # update the edition
    edition.title = title
    edition.description = d.get('description')
    edition.publisher = d.get('publisher')
    edition.publication_date = d.get('publishedDate', '')
    edition.save()

    # create identifier if needed
    models.Identifier.get_or_add(type='goog', value=googlebooks_id, edition=edition, work=edition.work)

    for author_name in d.get('authors', []):
        author, created = models.Author.objects.get_or_create(name=author_name)
        author.editions.add(edition)

    add_ebooks(item, edition)

    return edition


def add_by_isbn_from_google(isbn, work=None):
    """add a book to the UnglueIt database from google based on ISBN. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work.
    """
    if not isbn:
        return None
    if len(isbn) == 10:
        isbn = regluit.core.isbn.convert_10_to_13(isbn)

    logger.info("adding book by isbn %s", isbn)

    # check if we already have this isbn
    edition = get_edition_by_id(type='isbn', value=isbn)
    if edition:
        edition.new = False
        return edition

    results = get_google_isbn_results(isbn)
    if results:
        try:
            return add_by_googlebooks_id(results['items'][0]['id'], work=work,
                                         results=results['items'][0], isbn=isbn)
        except LookupFailure:
            logger.exception("failed to add edition for %s", isbn)
        except IntegrityError:
            logger.exception("google books data for %s didn't fit our db", isbn)
        return None
    else:
        return None


def get_work_by_id(type,value):
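    """return the Work linked to the given identifier type/value, or None"""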
    if value:
        try:
            return models.Identifier.objects.get(type=type, value=value).work
        except models.Identifier.DoesNotExist:
            return None


def get_edition_by_id(type,value):
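    """return the Edition linked to the given identifier type/value, or None"""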
    if value:
        try:
            return models.Identifier.objects.get(type=type, value=value).edition
        except models.Identifier.DoesNotExist:
            return None


def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
    """add a book to the UnglueIt database based on the GoogleBooks ID. The
    work parameter is optional, and if not supplied the edition will be
    associated with a stub work. isbn can be passed because sometimes the
    passed-in data won't include it.
    """
    # don't ping google again if we already know about the edition
    try:
        edition = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        edition.new = False
        return edition
    except models.Identifier.DoesNotExist:
        pass

    # if google has been queried by caller, don't call again
    if results:
        item = results
    else:
        logger.info("loading metadata from google for %s", googlebooks_id)
        url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
        item = _get_json(url)
    d = item['volumeInfo']

    if d.has_key('title'):
        title = d['title']
    else:
        title = ''
    if len(title) == 0:
        # need a title to make an edition record; some crap records in GB.
        # use title from parent if available
        if work:
            title = work.title
        else:
            return None

    # don't add the edition to a work with a different language
    # https://www.pivotaltracker.com/story/show/17234433
    language = d['language']
    if len(language) > 2:
        language = language[0:2]
    if work and work.language != language:
        logger.info("not connecting %s since it is %s instead of %s",
                    googlebooks_id, language, work.language)
        work = None
    if not isbn:
        for i in d.get('industryIdentifiers', []):
            if i['type'] == 'ISBN_10' and not isbn:
                isbn = regluit.core.isbn.convert_10_to_13(i['identifier'])
            elif i['type'] == 'ISBN_13':
                isbn = i['identifier']

    # now check to see if there's an existing Work
    if work:
        work.new = False
    if isbn and not work:
        work = get_work_by_id(type='isbn', value=isbn)
        if work:
            work.new = False
    if not work:
        work = models.Work.objects.create(title=title, language=language)
        work.new = True
        work.save()

    # going off to google can take some time, so we want to make sure this
    # edition has not been created in another thread while we were waiting
    try:
        e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
        e.new = False
        # whoa nellie, somebody else created an edition while we were working.
        if work.new:
            work.delete()
        return e
    except models.Identifier.DoesNotExist:
        pass

    # because this is a new google id, we have to create a new edition
    e = models.Edition(work=work)
    e.title = title
    e.description = d.get('description')
    e.publisher = d.get('publisher')
    e.publication_date = d.get('publishedDate', '')
    e.save()
    e.new = True

    # create identifier where needed
    models.Identifier(type='goog', value=googlebooks_id, edition=e, work=work).save()
    if isbn:
        models.Identifier.get_or_add(type='isbn', value=isbn, edition=e, work=work)

    for author_name in d.get('authors', []):
        author, created = models.Author.objects.get_or_create(name=author_name)
        author.editions.add(e)

    add_ebooks(item, e)

    return e


def add_related(isbn):
    """add all books related to a particular ISBN to the UnglueIt database.
    The initial seed ISBN will be added if it's not already there.
    """
    # make sure the seed edition is there
    logger.info("adding related editions for %s", isbn)

    new_editions = []

    edition = add_by_isbn(isbn)
    if edition is None:
        return new_editions

    # this is the work everything will hang off
    work = edition.work
    other_editions = {}
    for other_isbn in thingisbn(isbn):
        # 979's come back as 13
        logger.debug("other_isbn: %s", other_isbn)
        if len(other_isbn) == 10:
            other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
        related_edition = add_by_isbn(other_isbn, work=work)

        if related_edition:
            related_language = related_edition.work.language
            if edition.work.language == related_language:
                new_editions.append(related_edition)
                if related_edition.work != edition.work:
                    logger.debug("merge_works path 1 %s %s", edition.work.id, related_edition.work.id)
                    merge_works(edition.work, related_edition.work)
            else:
                if other_editions.has_key(related_language):
                    other_editions[related_language].append(related_edition)
                else:
                    other_editions[related_language] = [related_edition]

    # group the other language editions together
    for lang_group in other_editions.itervalues():
        logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work.id) for ed in lang_group])
        if len(lang_group) > 1:
            lang_edition = lang_group[0]
            logger.debug("lang_edition.id: %s", lang_edition.id)
            # compute the distinct set of works to merge into lang_edition.work
            works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
            for w in works_to_merge:
                logger.debug("merge_works path 2 %s %s", lang_edition.work.id, w.id)
                merge_works(lang_edition.work, w)

    return new_editions


def thingisbn(isbn):
    """given an ISBN return a list of related edition ISBNs, according to
    Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for
    979 isbns, which come back as isbn_13)
    """
    logger.info("looking up %s at ThingISBN", isbn)
    url = "http://www.librarything.com/api/thingISBN/%s" % isbn
    xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
    doc = ElementTree.fromstring(xml)
    return [e.text for e in doc.findall('isbn')]


def merge_works(w1, w2):
    """will merge the second work (w2) into the first (w1)
    """
    logger.info("merging work %s into %s", w2, w1)
    # don't merge if the works are the same or at least one of the works has
    # no id (for example, when w2 has already been deleted)
    if w1 == w2 or w1.id is None or w2.id is None:
        return

    for identifier in w2.identifiers.all():
        identifier.work = w1
        identifier.save()
    for comment in Comment.objects.for_model(w2):
        comment.object_pk = w1.pk
        comment.save()
    for edition in w2.editions.all():
        edition.work = w1
        edition.save()
    for campaign in w2.campaigns.all():
        campaign.work = w1
        campaign.save()
    for wishlist in models.Wishlist.objects.filter(works__in=[w2]):
        w2source = wishlist.work_source(w2)
        wishlist.remove_work(w2)
        wishlist.add_work(w1, w2source)

    models.WasWork(was=w2.pk, work=w1).save()
    for ww in models.WasWork.objects.filter(work=w2):
        ww.work = w1
        ww.save()

    w2.delete()


def add_openlibrary(work):
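    """supplement a Work with subjects and goodreads/librarything identifiers
    from OpenLibrary, based on the ISBNs of its editions. Lookups are skipped
    if we've already hit OpenLibrary within the last 30 days.
    """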
    if work.openlibrary_lookup is not None:
        # don't hit OL if we've visited in the past month or so
        if datetime.datetime.now() - work.openlibrary_lookup < datetime.timedelta(days=30):
            return
    work.openlibrary_lookup = datetime.datetime.now()
    work.save()

    # find the first ISBN match in OpenLibrary
    logger.info("looking up openlibrary data for work %s", work.id)
    found = False
    e = None  # openlibrary edition json
    w = None  # openlibrary work json

    # get the 1st openlibrary match by isbn that has an associated work
    url = "http://openlibrary.org/api/books"
    params = {"format": "json", "jscmd": "details"}
    for edition in work.editions.all():
        isbn_key = "ISBN:%s" % edition.isbn_13
        params['bibkeys'] = isbn_key
        try:
            e = _get_json(url, params, type='ol')
        except LookupFailure:
            logger.exception("OL lookup failed for %s", isbn_key)
            e = {}
        if e.has_key(isbn_key) and e[isbn_key]['details'].has_key('works'):
            work_key = e[isbn_key]['details']['works'].pop(0)['key']
            logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
            try:
                w = _get_json("http://openlibrary.org" + work_key, type='ol')
                if w.has_key('subjects'):
                    found = True
                    break
            except LookupFailure:
                logger.exception("OL lookup failed for %s", work_key)
    if not found:
        logger.warn("unable to find work %s at openlibrary", work.id)
        return

    # add the subjects to the Work
    for s in w.get('subjects', []):
        logger.info("adding subject %s to work %s", s, work.id)
        subject, created = models.Subject.objects.get_or_create(name=s)
        work.subjects.add(subject)
    work.save()

    models.Identifier.get_or_add(type='olwk', value=w['key'], work=work)
    if e[isbn_key]['details'].has_key('identifiers'):
        ids = e[isbn_key]['details']['identifiers']
        if ids.has_key('goodreads'):
            models.Identifier.get_or_add(type='gdrd', value=ids['goodreads'][0], work=work, edition=edition)
        if ids.has_key('librarything'):
            models.Identifier.get_or_add(type='ltwk', value=ids['librarything'][0], work=work)
    # TODO: add authors here once they are moved from Edition to Work


def _get_json(url, params=None, type='gb'):
    """GET a JSON resource, raising LookupFailure on a non-200 response"""
    # use a fresh dict each call; a mutable default argument would be shared
    # (and mutated) across calls
    if params is None:
        params = {}
    # TODO: should X-Forwarded-For change based on the request from client?
    headers = {'User-Agent': settings.USER_AGENT,
               'Accept': 'application/json',
               'X-Forwarded-For': '69.174.114.214'}
    if type == 'gb':
        params['key'] = settings.GOOGLE_BOOKS_API_KEY
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return json.loads(response.content)
    else:
        logger.error("unexpected HTTP response: %s" % response)
        if response.content:
            logger.error("response content: %s" % response.content)
        raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))


def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url, format, license, lang, publication_date):
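    """load a Project Gutenberg text into the database as an Ebook, creating
    or locating the associated Work and Edition as needed. Returns the Ebook.
    """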
    # let's start with instantiating the relevant Work and Edition if they
    # don't already exist
    try:
        work = models.Identifier.objects.get(type='olwk', value=ol_work_id).work
    except models.Identifier.DoesNotExist:
        # try to find an Edition with the seed_isbn and use that work to hang off of
        sister_edition = add_by_isbn(seed_isbn)
        if sister_edition.new:
            # add related editions asynchronously
            # imported here rather than at module load to avoid a potential circular import
            import regluit.core.tasks
            regluit.core.tasks.populate_edition.delay(sister_edition.isbn_13)
        work = sister_edition.work
        # attach the olwk identifier to this work if it's not none.
        if ol_work_id is not None:
            work_id = models.Identifier.get_or_add(type='olwk', value=ol_work_id, work=work)

    # Now pull out any existing Gutenberg editions tied to the work with the
    # proper Gutenberg ID
    try:
        edition = models.Identifier.objects.get(type='gtbg', value=gutenberg_etext_id).edition
    except models.Identifier.DoesNotExist:
        edition = models.Edition()
        edition.title = title
        edition.work = work
        edition.save()
        edition_id = models.Identifier.get_or_add(type='gtbg', value=gutenberg_etext_id, edition=edition, work=work)

    # check to see whether the Ebook hasn't already been loaded first --
    # search by url
    ebooks = models.Ebook.objects.filter(url=url)

    # format: what's the controlled vocab? -- from Google -- alternative would be mimetype

    if len(ebooks) > 1:
        warnings.warn("There is more than one Ebook matching url {0}".format(url))
    if len(ebooks):
        ebook = ebooks[0]
    else:
        # need to create new ebook
        ebook = models.Ebook()

    ebook.format = format
    ebook.provider = 'gutenberg'
    ebook.url = url
    ebook.rights = license

    # is an Ebook instantiable without a corresponding Edition? (No, I think)
    ebook.edition = edition
    ebook.save()

    return ebook


def add_missing_isbn_to_editions(max_num=None, confirm=False):
    """For each of the editions with Google Books ids, do a lookup and attach
    ISBNs. Set confirm to True to check that the db changes were made correctly.
    """
    logger.info("Number of editions with Google Books IDs but not ISBNs (before): %d",
                models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count())

    from regluit.experimental import bookdata

    gb = bookdata.GoogleBooks(key=settings.GOOGLE_BOOKS_API_KEY)

    new_isbns = []
    google_id_not_found = []
    no_isbn_found = []
    editions_to_merge = []
    exceptions = []

    for (i, ed) in enumerate(islice(models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn'), max_num)):
        try:
            g_id = ed.identifiers.get(type='goog').value
        except Exception, e:
            # we might get an exception if there is, for example, more than
            # one Google id attached to this Edition
            logger.exception("add_missing_isbn_to_editions for edition.id %s: %s", ed.id, e)
            exceptions.append((ed.id, e))
            continue

        # try to get ISBN from Google Books
        try:
            vol_id = gb.volumeid(g_id)
            if vol_id is None:
                google_id_not_found.append((ed.id, g_id))
                logger.debug("g_id not found: %s", g_id)
            else:
                isbn = vol_id.get('isbn')
                logger.info("g_id, isbn: %s %s", g_id, isbn)
                if isbn is not None:
                    # check to see whether the isbn is actually already in
                    # the db but attached to another Edition
                    existing_isbn_ids = models.Identifier.objects.filter(type='isbn', value=isbn)
                    if len(existing_isbn_ids):
                        # don't try to merge editions right now, just note the need to merge
                        ed2 = existing_isbn_ids[0].edition
                        editions_to_merge.append((ed.id, g_id, isbn, ed2.id))
                    else:
                        new_id = models.Identifier(type='isbn', value=isbn, edition=ed, work=ed.work)
                        new_id.save()
                        new_isbns.append((ed.id, g_id, isbn))
                else:
                    no_isbn_found.append((ed.id, g_id, None))
        except Exception, e:
            logger.exception("add_missing_isbn_to_editions for edition.id %s: %s", ed.id, e)
            exceptions.append((ed.id, g_id, None, e))

    logger.info("Number of editions with Google Books IDs but not ISBNs (after): %d",
                models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count())

    ok = None

    if confirm:
        ok = True
        for (ed_id, g_id, isbn) in new_isbns:
            if models.Edition.objects.get(id=ed_id).identifiers.get(type='isbn').value != isbn:
                ok = False
                break

    return {
        'new_isbns': new_isbns,
        'no_isbn_found': no_isbn_found,
        'editions_to_merge': editions_to_merge,
        'exceptions': exceptions,
        'google_id_not_found': google_id_not_found,
        'confirm': ok,
    }


class LookupFailure(Exception):
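    """raised when a remote metadata lookup (e.g., Google Books or OpenLibrary) fails"""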
    pass