Now I have booktests to recalculate clusters

pull/1/head
Raymond Yee 2012-02-17 10:30:09 -08:00
parent 09ab830c20
commit 2e079b2c2e
2 changed files with 228 additions and 62 deletions

View File

@@ -11,6 +11,7 @@ import random
 random.seed()
 import sys, os
+import json
 # a kludge to allow for isbn.py to be imported
 # and not just in the context of the regluit Django app
@@ -436,15 +437,14 @@ class FreebaseBooks(object):
         self.freebase.login(username,password)
     def books(self):
         MQL = u"""[{
-    "type": "/book/book",
-    "id": null,
-    "key": [{
-        "namespace": "/wikipedia/en",
-        "value": null,
-        "type": "/type/key"
-    }]
-}]
-""".replace("\n"," ")
+    "type": "/book/book",
+    "id": null,
+    "key": [{
+        "namespace": "/wikipedia/en",
+        "value": null,
+        "type": "/type/key"
+    }]
+}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
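This pattern recurs through all the FreebaseBooks methods touched below: an MQL query is kept as a JSON string template, flattened with .replace("\n"," "), parsed with json.loads into an ordinary Python structure, and streamed through the client's mqlreaditer(). A minimal sketch of the round trip, assuming fb is an authenticated freebase.api client like the one login() sets up (the consumer function is hypothetical):

    import json

    MQL = u"""[{
        "type": "/book/book",
        "id": null
    }]""".replace("\n", " ")    # flatten so the literal survives reformatting

    query = json.loads(MQL)     # now a plain list/dict structure, safe to mutate
    # for r in fb.mqlreaditer(query):   # cursors through all result pages
    #     use(r['id'])                  # hypothetical consumer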
@@ -452,18 +452,17 @@ class FreebaseBooks(object):
     def book_editions(self):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
@@ -471,18 +470,17 @@ class FreebaseBooks(object):
     def editions_for_book(self, book_id):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["id"] = book_id
         resp = self.freebase.mqlreaditer(query)
@@ -491,18 +489,17 @@ class FreebaseBooks(object):
     def book_edition_by_id(self,id,id_type):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         if id_type == 'isbn':
             query[0][id_type][0].setdefault('name', id)
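Because json.loads returns a fresh mutable structure, each method parameterizes the parsed copy rather than string-splicing values into the MQL source. A sketch of the idiom with a hypothetical ISBN (the non-isbn branches of book_edition_by_id are cut off by this hunk, so only the shown branch is reproduced):

    query = json.loads(MQL)      # fresh copy; the string template stays pristine
    query[0]['isbn'][0].setdefault('name', '9780446675505')   # hypothetical ISBN-13
    # setdefault only fills the slot if the template did not already carry a value,
    # which is equivalent to plain assignment for these null-valued templates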
@@ -526,18 +523,18 @@ class FreebaseBooks(object):
         elif isbn_val is not None:
             isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
         MQL = """[{
-    "type": "/book/book_edition",
-    "isbn": {
-        "name": null
-    },
-    "book": {
-        "editions": [{
-            "isbn": {
-                "name": null
-            }
-        }]
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "isbn": {
+        "name": null
+    },
+    "book": {
+        "editions": [{
+            "isbn": {
+                "name": null
+            }
+        }]
+    }
+}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
         resp = self.freebase.mqlreaditer(query)
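Both this lookup and the LibraryThing cache added below funnel every ISBN through the same normalization: isbn_mod (regluit's isbn.py, imported via the kludge at the top of the file) converts any ISBN-10 to its 13-digit form, so both spellings of an ISBN land on the same key. A sketch with a hypothetical but checksum-consistent pair:

    isbn10 = '0446675504'                            # hypothetical ISBN-10
    isbn13 = isbn_mod.ISBN(isbn10).to_string('13')   # -> '9780446675505'
    # to_string('13') yields None for an unparseable ISBN, which is why
    # LibraryThing.thingisbn below guards with "if isbn is None: return []"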
@@ -565,7 +562,91 @@ class WorkMapper(object):
         yield work_id
         if not complete_search:
             raise StopIteration()
+
+class LibraryThing(object):
+    """
+    Provide cached access to thingisbn and LT whatwork interface. Allow for a cache file to be loaded and saved
+    """
+    def __init__(self, fname=None):
+        self.__isbn_to_work_id = {}
+        self.fname = fname
+    def __del__(self):
+        self.save()
+    def thingisbn(self, isbn, return_work_id=False):
+        """ if return_work_id is True, we won't try to calculate all the relevant isbns"""
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is None: return []
+        # check to see whether we have isbn already
+        if isbn in self.__isbn_to_work_id:
+            # return all isbns with the work id
+            # print "%s already cached" % (isbn)
+            work_id = self.__isbn_to_work_id.get(isbn)
+            if return_work_id:
+                return work_id
+            if work_id is not None:
+                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
+            else:
+                return []
+        else:
+            # if isbn is not already cached, do look up and cache the results and return the results
+            print "calling thingisbn for %s" % (isbn)
+            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
+            if len(results):
+                # look up the librarything work id
+                work_id = self.whatwork(isbn)
+                if work_id is not None: # which should be the case since results is not zero-length
+                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
+                else:
+                    logger.exception("work_id should not be None for isbn %s", isbn)
+                    return []
+            else:
+                self.__isbn_to_work_id[isbn] = None # mark as not recognized by LT
+                work_id = None
+            if return_work_id:
+                return work_id
+            else:
+                return results
+    def whatwork(self, isbn=None, title=None, author=None):
+        # if isbn is not None and title, author None then look up results, otherwise just pass along to lt_whatwork
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is not None and (title is None and author is None):
+            if isbn in self.__isbn_to_work_id:
+                work_id = self.__isbn_to_work_id.get(isbn)
+            else:
+                work_id = lt_whatwork(isbn=isbn)
+                self.__isbn_to_work_id[isbn] = work_id
+            return work_id
+        else:
+            return lt_whatwork(isbn=isbn, title=title, author=author)
+    def load(self):
+        try:
+            f = open(self.fname)
+            input_data = json.load(f)
+            f.close()
+            if isinstance(input_data, dict):
+                self.__isbn_to_work_id = input_data
+                return True
+            else:
+                return False
+        except Exception, e:
+            print e
+    def save(self):
+        if self.fname is not None:
+            f = open(self.fname, "w")
+            json.dump(self.__isbn_to_work_id, f)
+            f.close()
+            return True
+        else:
+            return False
 
 def look_up_my_zotero_books_in_hathi():
     from regluit.experimental.zotero_books import MyZotero
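Taken together, the new class gives thingisbn/whatwork a write-through JSON cache keyed by ISBN-13. A minimal usage sketch (the cache path is hypothetical); note that save() is also attempted from __del__, and that whatwork() normalizes its isbn argument even when only title/author are supplied, so isbn.py has to tolerate a None input there:

    lt = LibraryThing('/tmp/lt_data.json')     # hypothetical cache file
    lt.load()                                  # True if an existing dict was read
    isbns = lt.thingisbn('9780446675505')      # first call hits LT; repeats are cached
    work_id = lt.thingisbn('9780446675505', return_work_id=True)
    lt.save()                                  # persist the isbn -> work_id map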
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
         self.assertEqual(work_id, SURFACING_LT_WORK_ID)
         work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
         self.assertEqual(work_id, '2199')
+    def test_cache(self):
+        lt = LibraryThing()
+        res = lt.thingisbn(SURFACING_ISBN)
+        res2 = lt.thingisbn(SURFACING_ISBN)
+        self.assertEqual(set(res), set(res2))
+        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
+        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
 
 def suite():
@@ -793,7 +885,7 @@ def suite():
     #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
     testcases = []
     suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
-    suites.addTest(LibraryThingTest('test_whatwork'))
+    suites.addTest(LibraryThingTest('test_cache'))
    #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
     return suites
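suite() narrows the run to test_cache while the heavier test cases stay commented out; driving it with the stock unittest text runner would look like:

    if __name__ == '__main__':
        unittest.TextTestRunner(verbosity=2).run(suite())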

View File

@@ -85,25 +85,99 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
     else:
         logger.info("%d null seed_isbn: ebook %s", i, ebook)
 
-def cluster_status():
+def cluster_status(max_num=None):
     """Look at the current Work, Edition instances to figure out what needs to be fixed"""
     results = OrderedDict([
         ('number of Works', models.Work.objects.count()),
-        ('number of Editions', models.Edition.objects.count())
+        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
+        ('number of Editions', models.Edition.objects.count()),
+        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
+        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
+        ('number of Edition that have both Google Books id and ISBNs',
+         models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
+        ('number of Editions with Google Books IDs but not ISBNs',
+         models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
     ])
 
     # What needs to be done to recluster editions?
     # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
     # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
     # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
     # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting
     # Are there Edition without ISBNs? Look up the corresponding ISBNs from Google Books and Are they all singletons?
     # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
     # an ISBN tied to that Google Books ID)
 
+    from collections import defaultdict
+    import shutil
+    import time
+    from collections import namedtuple
 
-    return results
+    # let's form a key to map all the Editions into
+    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )
+    work_clusters = defaultdict(set)
+    current_map = defaultdict(set)
+
+    backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
+    fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
+
+    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
+
+    shutil.copy(fname, backup)
+    lt = bookdata.LibraryThing(fname)
+    try:
+        input_file = open(fname, "r")
+        success = lt.load()
+        print "success: %s" % (success)
+        input_file.close()
+    except Exception, e:
+        print e
+
+    for (i, (isbn, ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
+                   'edition__title', 'edition__work__id', 'edition__work__language'), max_num)):
+        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
+        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
+        print i, isbn, lt_work_id, key
+        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    lt.save()
+
+    # Now add the Editions without any ISBNs
+    print "editions w/o isbn"
+    for (i, (ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
+                   'title', 'work__id', 'work__language'), None)):
+        key = (None, lang, None, ed_id)
+        print i, ed_id, ed_title, key
+        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    print "number of clusters", len(work_clusters)
+
+    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results}
+
+    #
+    print "new clusters that map over more than one existing Work", \
+        [(k, len(set(([e.work_id for e in v])))) for (k,v) in s['work_clusters'].items() if len(set(([e.work_id for e in v]))) <> 1 ]
+
+    m = current_map
+    print "existing Works that contain editions from more than 1 new cluster", \
+        sorted([k for (k,v) in m.items() if len(v) > 1])
+
+    return s
 
 def all_editions():
     pass
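The heart of cluster_status() is the four-way key (lt_work_id, lang, isbn, ed_id): editions whose ISBNs LibraryThing recognizes collapse onto their LT work id; an ISBN LT does not know keeps its edition in a singleton cluster keyed by the ISBN itself; and ISBN-less editions are singletons keyed by edition id. A toy illustration of the grouping, with entirely hypothetical editions and LT answers:

    from collections import defaultdict, namedtuple

    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
    work_clusters = defaultdict(set)

    # hypothetical editions and LT responses
    eds = [
        EdInfo('9780446675505', 1, 'Surfacing', 10, 'en'),
        EdInfo('9780385491051', 2, 'Surfacing', 11, 'en'),
        EdInfo('9781111111111', 3, 'Obscure', 12, 'en'),   # not known to LT
        EdInfo(None, 4, 'No ISBN', 13, 'en'),              # no ISBN at all
    ]
    lt_ids = {'9780446675505': '2830', '9780385491051': '2830', '9781111111111': None}

    for ed in eds:
        lt_work_id = lt_ids.get(ed.isbn)
        if ed.isbn is not None:
            key = (lt_work_id, ed.lang, ed.isbn if lt_work_id is None else None, None)
        else:
            key = (None, ed.lang, None, ed.ed_id)
        work_clusters[key].add(ed)

    # len(work_clusters) == 3: editions 1 and 2 merge under LT work '2830',
    # while the unrecognized ISBN and the ISBN-less edition stay singletons.

The two reports printed at the end of cluster_status() then read this structure both ways: clusters whose members span more than one existing Work are candidates for merging, and existing Works whose editions scatter across more than one cluster are candidates for splitting.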