Now I have booktests to recalculate clusters
parent
09ab830c20
commit
2e079b2c2e
|
@ -11,6 +11,7 @@ import random
|
|||
random.seed()
|
||||
|
||||
import sys, os
|
||||
import json
|
||||
|
||||
# a kludge to allow for isbn.py to be imported
|
||||
# and not just in the context of the regluit Django app
|
||||
|
@ -436,15 +437,14 @@ class FreebaseBooks(object):
|
|||
self.freebase.login(username,password)
|
||||
def books(self):
|
||||
MQL = u"""[{
|
||||
"type": "/book/book",
|
||||
"id": null,
|
||||
"key": [{
|
||||
"namespace": "/wikipedia/en",
|
||||
"value": null,
|
||||
"type": "/type/key"
|
||||
}]
|
||||
}]
|
||||
""".replace("\n"," ")
|
||||
"type": "/book/book",
|
||||
"id": null,
|
||||
"key": [{
|
||||
"namespace": "/wikipedia/en",
|
||||
"value": null,
|
||||
"type": "/type/key"
|
||||
}]
|
||||
}]""".replace("\n"," ")
|
||||
query = json.loads(MQL)
|
||||
resp = self.freebase.mqlreaditer(query)
|
||||
for r in resp:
|
||||
|
@ -452,18 +452,17 @@ class FreebaseBooks(object):
|
|||
|
||||
def book_editions(self):
|
||||
MQL = u"""[{
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}
|
||||
}]""".replace("\n"," ")
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}}]""".replace("\n"," ")
|
||||
query = json.loads(MQL)
|
||||
resp = self.freebase.mqlreaditer(query)
|
||||
for r in resp:
|
||||
|
@ -471,18 +470,17 @@ class FreebaseBooks(object):
|
|||
|
||||
def editions_for_book(self, book_id):
|
||||
MQL = u"""[{
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}
|
||||
}]""".replace("\n"," ")
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}}]""".replace("\n"," ")
|
||||
query = json.loads(MQL)
|
||||
query[0]["book"]["id"] = book_id
|
||||
resp = self.freebase.mqlreaditer(query)
|
||||
|
@ -491,18 +489,17 @@ class FreebaseBooks(object):
|
|||
|
||||
def book_edition_by_id(self,id,id_type):
|
||||
MQL = u"""[{
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}
|
||||
}]""".replace("\n"," ")
|
||||
"type": "/book/book_edition",
|
||||
"id": null,
|
||||
"isbn": [{}],
|
||||
"ISBN": [{}],
|
||||
"LCCN": [{}],
|
||||
"OCLC_number": [{}],
|
||||
"openlibrary_id": [{}],
|
||||
"book": {
|
||||
"id": null,
|
||||
"name": null
|
||||
}}]""".replace("\n"," ")
|
||||
query = json.loads(MQL)
|
||||
if id_type == 'isbn':
|
||||
query[0][id_type][0].setdefault('name', id)
|
||||
|
@ -526,18 +523,18 @@ class FreebaseBooks(object):
|
|||
elif isbn_val is not None:
|
||||
isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
|
||||
MQL = """[{
|
||||
"type": "/book/book_edition",
|
||||
"isbn": {
|
||||
"name": null
|
||||
},
|
||||
"book": {
|
||||
"editions": [{
|
||||
"isbn": {
|
||||
"name": null
|
||||
}
|
||||
}]
|
||||
}
|
||||
}]""".replace("\n"," ")
|
||||
"type": "/book/book_edition",
|
||||
"isbn": {
|
||||
"name": null
|
||||
},
|
||||
"book": {
|
||||
"editions": [{
|
||||
"isbn": {
|
||||
"name": null
|
||||
}
|
||||
}]
|
||||
}
|
||||
}]""".replace("\n"," ")
|
||||
query = json.loads(MQL)
|
||||
query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
|
||||
resp = self.freebase.mqlreaditer(query)
|
||||
|
@ -565,7 +562,91 @@ class WorkMapper(object):
|
|||
yield work_id
|
||||
if not complete_search:
|
||||
raise StopIteration()
|
||||
|
||||
class LibraryThing(object):
    """
    Provide cached access to thingisbn and the LT whatwork interface.

    The cache maps a normalized ISBN-13 string to a LibraryThing work id, or
    to None when LibraryThing did not recognize the ISBN.  When a file name is
    given, the cache can be loaded from / saved to that file as JSON.
    """

    def __init__(self, fname=None):
        """fname: optional path of the JSON cache file used by load() and save()."""
        self.__isbn_to_work_id = {}
        self.fname = fname
        # True once the in-memory cache holds entries that are not yet on disk
        self._dirty = False

    def __del__(self):
        # Only write back when something was actually looked up: saving
        # unconditionally would overwrite an existing cache file with {} for an
        # instance that was created but never loaded.
        if getattr(self, '_dirty', False):
            self.save()

    def thingisbn(self, isbn, return_work_id=False):
        """
        Return the ISBN-13s that LibraryThing groups with isbn, using the cache
        when the ISBN has been seen before.

        If return_work_id is True, return the LibraryThing work id instead of
        the list of ISBNs (we won't try to calculate all the relevant isbns).
        An ISBN that cannot be normalized or is unknown to LibraryThing yields
        [] (or None when return_work_id is True).
        """
        # the "nothing found" value must match the requested return type;
        # callers use the work id as part of a dict key, so it must never be a list
        not_found = None if return_work_id else []

        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is None:
            return not_found

        cache = self.__isbn_to_work_id

        # check to see whether we have isbn already
        if isbn in cache:
            work_id = cache[isbn]
            if return_work_id:
                return work_id
            if work_id is None:
                return []
            # return all isbns with the work id
            return [k for (k, v) in cache.items() if v == work_id]

        # isbn is not cached yet: do the look up, cache the results and return them
        print("calling thingisbn for %s" % (isbn))
        normalized = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
        # drop anything that could not be normalized so None never becomes a cache key
        results = [k for k in normalized if k is not None]

        if not results:
            cache[isbn] = None  # mark as not recognized by LT
            self._dirty = True
            return not_found

        # look up the librarything work id (whatwork caches the isbn -> work id pair)
        work_id = self.whatwork(isbn)
        if work_id is None:
            # should not happen since results is not zero-length
            logger.warning("work_id should not be None for isbn %s", isbn)
            return not_found

        cache.update((k, work_id) for k in results)
        self._dirty = True
        return work_id if return_work_id else results

    def whatwork(self, isbn=None, title=None, author=None):
        """
        Return the LibraryThing work id for the given isbn and/or title/author.

        Only pure-ISBN queries (no title, no author) are answered from and
        stored in the cache; everything else is passed along to lt_whatwork.
        """
        # first, normalize the isbn (skipped for title/author-only queries)
        if isbn is not None:
            isbn = isbn_mod.ISBN(isbn).to_string('13')

        if isbn is None or title is not None or author is not None:
            return lt_whatwork(isbn=isbn, title=title, author=author)

        cache = self.__isbn_to_work_id
        if isbn in cache:
            return cache[isbn]

        work_id = lt_whatwork(isbn=isbn)
        cache[isbn] = work_id
        self._dirty = True
        return work_id

    def load(self):
        """
        Replace the in-memory cache with the contents of the cache file.

        Returns True on success, False when there is no file name, the file
        cannot be read or parsed, or it does not contain a JSON object.
        """
        if self.fname is None:
            return False
        try:
            with open(self.fname) as f:
                input_data = json.load(f)
        except (IOError, OSError, ValueError) as e:
            # best effort: a missing or corrupt cache file is reported, not fatal
            print(e)
            return False

        if not isinstance(input_data, dict):
            return False

        self.__isbn_to_work_id = input_data
        self._dirty = False
        return True

    def save(self):
        """Write the cache to the cache file as JSON; return False when no file name is set."""
        if self.fname is None:
            return False
        with open(self.fname, "w") as f:
            json.dump(self.__isbn_to_work_id, f)
        self._dirty = False
        return True
|
||||
|
||||
|
||||
def look_up_my_zotero_books_in_hathi():
|
||||
from regluit.experimental.zotero_books import MyZotero
|
||||
|
@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
|
|||
self.assertEqual(work_id, SURFACING_LT_WORK_ID)
|
||||
work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
|
||||
self.assertEqual(work_id, '2199')
|
||||
def test_cache(self):
    """Exercise the caching LibraryThing wrapper: a repeated thingisbn lookup
    must return the same set of ISBNs, and both whatwork and
    thingisbn(return_work_id=True) must report the known work id."""
    cached_lt = LibraryThing()

    first_lookup = cached_lt.thingisbn(SURFACING_ISBN)
    # same ISBN again, on the same instance
    second_lookup = cached_lt.thingisbn(SURFACING_ISBN)
    self.assertEqual(set(first_lookup), set(second_lookup))

    work_id_from_whatwork = cached_lt.whatwork(SURFACING_ISBN)
    self.assertEqual(work_id_from_whatwork, SURFACING_LT_WORK_ID)

    work_id_from_thingisbn = cached_lt.thingisbn(SURFACING_ISBN, return_work_id=True)
    self.assertEqual(work_id_from_thingisbn, SURFACING_LT_WORK_ID)
|
||||
|
||||
|
||||
def suite():
|
||||
|
@ -793,7 +885,7 @@ def suite():
|
|||
#testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
|
||||
testcases = []
|
||||
suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
|
||||
suites.addTest(LibraryThingTest('test_whatwork'))
|
||||
suites.addTest(LibraryThingTest('test_cache'))
|
||||
#suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
|
||||
return suites
|
||||
|
||||
|
|
|
@ -85,25 +85,99 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
|
|||
else:
|
||||
logger.info("%d null seed_isbn: ebook %s", i, ebook)
|
||||
|
||||
def cluster_status(max_num=None,
                   fname='/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json',
                   backup='/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'):
    """Look at the current Work, Edition instances to figure out what needs to be fixed

    max_num -- upper bound on the number of ISBN identifiers examined (None
               means all); it does not limit the pass over ISBN-less Editions.
    fname   -- path of the JSON cache file handed to bookdata.LibraryThing.
    backup  -- path the cache file is copied to before it is touched.

    Returns a dict with three entries:
      'results'       -- OrderedDict of summary counts,
      'work_clusters' -- proposed cluster key -> set of EdInfo tuples,
      'current_map'   -- existing Work id -> set of proposed cluster keys.
    """
    from collections import defaultdict
    from collections import namedtuple
    import os
    import shutil

    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Edition that have both Google Books id and ISBNs',
         models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
         models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # What needs to be done to recluster editions?
    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all

    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting

    # Are there Edition without ISBNs?  Look up the corresponding ISBNs from Google Books and Are they all singletons?

    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)

    # let's form a key to map all the Editions into
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])

    # keep a copy of the cache before touching it; on a first run there is
    # nothing to back up yet, and that must not abort the whole report
    if os.path.exists(fname):
        shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)
    success = lt.load()
    print("success: %s" % (success))

    isbn_rows = models.Identifier.objects.filter(type='isbn').values_list(
        'value', 'edition__id', 'edition__title', 'edition__work__id', 'edition__work__language')

    for (i, (isbn, ed_id, ed_title, work_id, lang)) in enumerate(islice(isbn_rows, max_num)):
        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        # ISBNs unknown to LT are keyed by the ISBN itself so each stays in its own cluster
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print("%s %s %s %s" % (i, isbn, lt_work_id, key))
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print("editions w/o isbn")
    no_isbn_rows = models.Edition.objects.exclude(identifiers__type='isbn').values_list(
        'id', 'title', 'work__id', 'work__language')

    for (i, (ed_id, ed_title, work_id, lang)) in enumerate(no_isbn_rows):
        # an Edition without an ISBN can only be keyed by its own id
        key = (None, lang, None, ed_id)
        print("%s %s %s %s" % (i, ed_id, ed_title, key))
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
        current_map[work_id].add(key)

    print("number of clusters %s" % (len(work_clusters),))

    s = {'work_clusters': work_clusters, 'current_map': current_map, 'results': results}

    # count the distinct existing Works behind each proposed cluster (computed once per cluster)
    works_per_cluster = [(k, len(set(e.work_id for e in v))) for (k, v) in work_clusters.items()]
    print("new clusters that map over more than one existing Work %s" %
          ([(k, n) for (k, n) in works_per_cluster if n != 1],))

    print("existing Works that contain editions from more than 1 new cluster %s" %
          (sorted([k for (k, v) in current_map.items() if len(v) > 1]),))

    return s
|
||||
|
||||
def all_editions():
    """Placeholder with no implementation yet: does nothing and returns None."""
    return None
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue