Now I have booktests to recalculate clusters

pull/1/head
Raymond Yee 2012-02-17 10:30:09 -08:00
parent 09ab830c20
commit 2e079b2c2e
2 changed files with 228 additions and 62 deletions

View File

@@ -11,6 +11,7 @@ import random
 random.seed()
 import sys, os
+import json
 
 # a kludge to allow for isbn.py to be imported
 # and not just in the context of the regluit Django app
@@ -443,8 +444,7 @@ class FreebaseBooks(object):
                 "value": null,
                 "type": "/type/key"
             }]
-        }]
-        """.replace("\n"," ")
+        }]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
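
This hunk and the three that follow make the same mechanical change: the closing brackets of each MQL template are pulled onto the line with the closing triple quote, so the template reads as a single JSON value before being flattened with replace("\n"," "). A minimal sketch of the pattern, with an abbreviated query shape and an illustrative id (not taken from the diff):

    import json

    # Write the MQL query as readable multi-line JSON, flatten it to a
    # single line, parse it, then fill in parameter slots before querying.
    MQL = """[{
        "type": "/book/book_edition",
        "isbn": [{ "name": null }],
        "book": { "id": null, "name": null }
    }]""".replace("\n", " ")

    query = json.loads(MQL)                # the flattened string is still valid JSON
    query[0]["book"]["id"] = "/en/hamlet"  # illustrative id, as in the book_id hunk below
    print json.dumps(query)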
@@ -462,8 +462,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
@@ -481,8 +480,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["id"] = book_id
         resp = self.freebase.mqlreaditer(query)
@@ -501,8 +499,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         if id_type == 'isbn':
             query[0][id_type][0].setdefault('name', id)
@@ -566,6 +563,90 @@ class WorkMapper(object):
         if not complete_search:
             raise StopIteration()
 
+class LibraryThing(object):
+    """
+    Provide cached access to the LibraryThing thingisbn and whatwork interfaces.
+    Allow a cache file to be loaded and saved.
+    """
+    def __init__(self, fname=None):
+        self.__isbn_to_work_id = {}
+        self.fname = fname
+    def __del__(self):
+        self.save()
+    def thingisbn(self, isbn, return_work_id=False):
+        """If return_work_id is True, return only the work id instead of calculating all the related isbns."""
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is None: return []
+        # check to see whether we have the isbn already
+        if isbn in self.__isbn_to_work_id:
+            # return all isbns with the same work id
+            # print "%s already cached" % (isbn)
+            work_id = self.__isbn_to_work_id.get(isbn)
+            if return_work_id:
+                return work_id
+            if work_id is not None:
+                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
+            else:
+                return []
+        else:
+            # isbn is not yet cached: do the lookup, cache the results, and return them
+            print "calling thingisbn for %s" % (isbn)
+            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
+            if len(results):
+                # look up the librarything work id
+                work_id = self.whatwork(isbn)
+                if work_id is not None:  # which should be the case since results is not zero-length
+                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
+                else:
+                    logger.exception("work_id should not be None for isbn %s", isbn)
+                    return []
+            else:
+                self.__isbn_to_work_id[isbn] = None  # mark as not recognized by LT
+                work_id = None
+            if return_work_id:
+                return work_id
+            else:
+                return results
+    def whatwork(self, isbn=None, title=None, author=None):
+        # if isbn is given and title and author are None, use the cache; otherwise just pass along to lt_whatwork
+        # first, normalize the isbn -- but only if one was supplied, since title/author-only lookups are allowed
+        if isbn is not None:
+            isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is not None and (title is None and author is None):
+            if isbn in self.__isbn_to_work_id:
+                work_id = self.__isbn_to_work_id.get(isbn)
+            else:
+                work_id = lt_whatwork(isbn=isbn)
+                self.__isbn_to_work_id[isbn] = work_id
+            return work_id
+        else:
+            return lt_whatwork(isbn=isbn, title=title, author=author)
+    def load(self):
+        try:
+            f = open(self.fname)
+            input_data = json.load(f)
+            f.close()
+            if isinstance(input_data, dict):
+                self.__isbn_to_work_id = input_data
+                return True
+            else:
+                return False
+        except Exception, e:
+            print e
+            return False
+    def save(self):
+        if self.fname is not None:
+            f = open(self.fname, "w")
+            json.dump(self.__isbn_to_work_id, f)
+            f.close()
+            return True
+        else:
+            return False
 
 def look_up_my_zotero_books_in_hathi():
     from regluit.experimental.zotero_books import MyZotero
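
A quick sketch of how the new cache behaves (the file name and ISBN here are illustrative, not from the commit): the first thingisbn call hits the LibraryThing API and populates the isbn-to-work-id map, the second call is answered from the in-memory map, and save/load round-trip the map through a JSON file.

    lt = LibraryThing(fname="lt_data.json")       # any writable path

    isbns = lt.thingisbn("9780446675505")         # first call queries LibraryThing
    isbns2 = lt.thingisbn("9780446675505")        # served from the in-memory cache
    work_id = lt.thingisbn("9780446675505", return_work_id=True)

    lt.save()                                     # persist the isbn -> work_id map

    lt2 = LibraryThing(fname="lt_data.json")
    lt2.load()                                    # warm a fresh instance from disk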
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
         self.assertEqual(work_id, SURFACING_LT_WORK_ID)
         work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
         self.assertEqual(work_id, '2199')
+
+    def test_cache(self):
+        lt = LibraryThing()
+        res = lt.thingisbn(SURFACING_ISBN)
+        res2 = lt.thingisbn(SURFACING_ISBN)
+        self.assertEqual(set(res), set(res2))
+        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
+        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
 
 def suite():
@@ -793,7 +885,7 @@ def suite():
     #testcases = [WorkMapperTest, FreebaseBooksTest, OpenLibraryTest, GoogleBooksTest]
     testcases = []
     suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
-    suites.addTest(LibraryThingTest('test_whatwork'))
+    suites.addTest(LibraryThingTest('test_cache'))
     #suites.addTest(SettingsTest('test_dev_me_alignment'))  # give option to test this alignment
     return suites
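
With testcases left empty, suite() now registers only test_cache. A sketch of running it with the stock unittest runner:

    import unittest

    # Exercises the new LibraryThing caching path end to end.
    unittest.TextTestRunner(verbosity=2).run(suite())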

View File

@@ -85,25 +85,99 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
     else:
         logger.info("%d null seed_isbn: ebook %s", i, ebook)
 
-def cluster_status():
+def cluster_status(max_num=None):
     """Look at the current Work, Edition instances to figure out what needs to be fixed"""
     results = OrderedDict([
         ('number of Works', models.Work.objects.count()),
-        ('number of Editions', models.Edition.objects.count())
+        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
+        ('number of Editions', models.Edition.objects.count()),
+        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
+        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
         ('number of Editions that have both Google Books id and ISBNs',
             models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
         ('number of Editions with Google Books IDs but not ISBNs',
             models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
     ])
-    # What needs to be done to recluster editions?
+    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
+    # 4 classes -- Editions have an ISBN or not & the ISBN is recognized or not by LT:
+    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
+    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether there are any related objects before deleting
     # Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books; are they all singletons?
     # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
     # an ISBN tied to that Google Books ID)
-    return results
+
+    from collections import defaultdict
+    from collections import namedtuple
+    import shutil
+    import time
+
+    # let's form a key to map all the Editions into:
+    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None, else None), ed_id (or None))
+    work_clusters = defaultdict(set)
+    current_map = defaultdict(set)
+
+    backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
+    fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
+
+    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
+
+    shutil.copy(fname, backup)
+    lt = bookdata.LibraryThing(fname)
+    try:
+        input_file = open(fname, "r")
+        success = lt.load()
+        print "success: %s" % (success)
+        input_file.close()
+    except Exception, e:
+        print e
+
+    for (i, (isbn, ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
+                'edition__title', 'edition__work__id', 'edition__work__language'), max_num)):
+        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
+        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
+        print i, isbn, lt_work_id, key
+        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    lt.save()
+
+    # Now add the Editions without any ISBNs
+    print "editions w/o isbn"
+    for (i, (ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
+                'title', 'work__id', 'work__language'), None)):
+        key = (None, lang, None, ed_id)
+        print i, ed_id, ed_title, key
+        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    print "number of clusters", len(work_clusters)
+
+    s = {'work_clusters': work_clusters, 'current_map': current_map, 'results': results}
+
+    print "new clusters that map over more than one existing Work", \
+        [(k, len(set([e.work_id for e in v]))) for (k, v) in s['work_clusters'].items()
+            if len(set([e.work_id for e in v])) != 1]
+
+    m = current_map
+    print "existing Works that contain editions from more than 1 new cluster", \
+        sorted([k for (k, v) in m.items() if len(v) > 1])
+
+    return s
+
+def all_editions():
+    pass
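
The heart of cluster_status() is the clustering key: editions whose ISBNs LibraryThing maps to the same work (in the same language) collapse into one cluster; ISBNs LT does not recognize stay as singletons keyed by ISBN; editions without any ISBN stay as singletons keyed by edition id. A small worked example with made-up ids and ISBNs:

    from collections import defaultdict, namedtuple

    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
    work_clusters = defaultdict(set)

    # key = (lt_work_id, lang, isbn-if-LT-unrecognized, ed_id-if-no-isbn); all values illustrative
    work_clusters[('2199', 'en', None, None)].add(
        EdInfo('9781111111111', 1, 'Hamlet', 10, 'en'))            # LT knows this ISBN
    work_clusters[('2199', 'en', None, None)].add(
        EdInfo('9782222222222', 2, 'Hamlet (reprint)', 11, 'en'))  # same LT work -> same cluster
    work_clusters[(None, 'en', '9783333333333', None)].add(
        EdInfo('9783333333333', 3, 'Obscure Title', 12, 'en'))     # LT-unrecognized ISBN -> singleton
    work_clusters[(None, 'en', None, 4)].add(
        EdInfo(None, 4, 'No-ISBN Edition', 13, 'en'))              # no ISBN -> keyed by edition id

    # clusters spanning more than one existing Work (editions 1 and 2 live under
    # Works 10 and 11 but share one new cluster) are the merge candidates:
    print [(k, len(set(e.work_id for e in v)))
           for (k, v) in work_clusters.items()
           if len(set(e.work_id for e in v)) != 1]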