Now I have booktests to recalculate clusters

pull/1/head
Raymond Yee 2012-02-17 10:30:09 -08:00
parent 09ab830c20
commit 2e079b2c2e
2 changed files with 228 additions and 62 deletions

View File

@@ -11,6 +11,7 @@ import random
 random.seed()
 import sys, os
+import json
 
 # a kludge to allow for isbn.py to be imported
 # and not just in the context of the regluit Django app
@@ -443,8 +444,7 @@ class FreebaseBooks(object):
                 "value": null,
                 "type": "/type/key"
             }]
-        }]
-        """.replace("\n"," ")
+        }]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
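
This hunk and the three that follow make the same mechanical change: the closing brackets of each MQL template are pulled onto the line with the closing triple quote, so the template reads as a single JSON value before being flattened with replace("\n"," "). A minimal sketch of the pattern, with an abbreviated query shape and an illustrative id (not taken from the diff):

    import json

    # Write the MQL query as readable multi-line JSON, flatten it to a
    # single line, parse it, then fill in parameter slots before querying.
    MQL = """[{
        "type": "/book/book_edition",
        "isbn": [{ "name": null }],
        "book": { "id": null, "name": null }
    }]""".replace("\n", " ")

    query = json.loads(MQL)                # the flattened string is still valid JSON
    query[0]["book"]["id"] = "/en/hamlet"  # illustrative id, as in the book_id hunk below
    print json.dumps(query)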
@@ -462,8 +462,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
@@ -481,8 +480,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["id"] = book_id
         resp = self.freebase.mqlreaditer(query)
@@ -501,8 +499,7 @@ class FreebaseBooks(object):
             "book": {
                 "id": null,
                 "name": null
-            }
-        }]""".replace("\n"," ")
+            }}]""".replace("\n"," ")
         query = json.loads(MQL)
         if id_type == 'isbn':
             query[0][id_type][0].setdefault('name', id)
@@ -566,6 +563,90 @@ class WorkMapper(object):
         if not complete_search:
             raise StopIteration()
 
+class LibraryThing(object):
+    """
+    Provide cached access to the LibraryThing thingisbn and whatwork interfaces.
+    Allow a cache file to be loaded and saved.
+    """
+    def __init__(self, fname=None):
+        self.__isbn_to_work_id = {}
+        self.fname = fname
+    def __del__(self):
+        self.save()
+    def thingisbn(self, isbn, return_work_id=False):
+        """If return_work_id is True, return only the work id instead of calculating all the related isbns."""
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is None: return []
+        # check to see whether we have the isbn already
+        if isbn in self.__isbn_to_work_id:
+            # return all isbns with the same work id
+            # print "%s already cached" % (isbn)
+            work_id = self.__isbn_to_work_id.get(isbn)
+            if return_work_id:
+                return work_id
+            if work_id is not None:
+                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
+            else:
+                return []
+        else:
+            # isbn is not yet cached: do the lookup, cache the results, and return them
+            print "calling thingisbn for %s" % (isbn)
+            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
+            if len(results):
+                # look up the librarything work id
+                work_id = self.whatwork(isbn)
+                if work_id is not None:  # which should be the case since results is not zero-length
+                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
+                else:
+                    logger.exception("work_id should not be None for isbn %s", isbn)
+                    return []
+            else:
+                self.__isbn_to_work_id[isbn] = None  # mark as not recognized by LT
+                work_id = None
+            if return_work_id:
+                return work_id
+            else:
+                return results
+    def whatwork(self, isbn=None, title=None, author=None):
+        # if isbn is given and title and author are None, use the cache; otherwise just pass along to lt_whatwork
+        # first, normalize the isbn -- but only if one was supplied, since title/author-only lookups are allowed
+        if isbn is not None:
+            isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is not None and (title is None and author is None):
+            if isbn in self.__isbn_to_work_id:
+                work_id = self.__isbn_to_work_id.get(isbn)
+            else:
+                work_id = lt_whatwork(isbn=isbn)
+                self.__isbn_to_work_id[isbn] = work_id
+            return work_id
+        else:
+            return lt_whatwork(isbn=isbn, title=title, author=author)
+    def load(self):
+        try:
+            f = open(self.fname)
+            input_data = json.load(f)
+            f.close()
+            if isinstance(input_data, dict):
+                self.__isbn_to_work_id = input_data
+                return True
+            else:
+                return False
+        except Exception, e:
+            print e
+            return False
+    def save(self):
+        if self.fname is not None:
+            f = open(self.fname, "w")
+            json.dump(self.__isbn_to_work_id, f)
+            f.close()
+            return True
+        else:
+            return False
 
 def look_up_my_zotero_books_in_hathi():
     from regluit.experimental.zotero_books import MyZotero
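
A quick sketch of how the new cache behaves (the file name and ISBN here are illustrative, not from the commit): the first thingisbn call hits the LibraryThing API and populates the isbn-to-work-id map, the second call is answered from the in-memory map, and save/load round-trip the map through a JSON file.

    lt = LibraryThing(fname="lt_data.json")       # any writable path

    isbns = lt.thingisbn("9780446675505")         # first call queries LibraryThing
    isbns2 = lt.thingisbn("9780446675505")        # served from the in-memory cache
    work_id = lt.thingisbn("9780446675505", return_work_id=True)

    lt.save()                                     # persist the isbn -> work_id map

    lt2 = LibraryThing(fname="lt_data.json")
    lt2.load()                                    # warm a fresh instance from disk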
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
         self.assertEqual(work_id, SURFACING_LT_WORK_ID)
         work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
         self.assertEqual(work_id, '2199')
+
+    def test_cache(self):
+        lt = LibraryThing()
+        res = lt.thingisbn(SURFACING_ISBN)
+        res2 = lt.thingisbn(SURFACING_ISBN)
+        self.assertEqual(set(res), set(res2))
+        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
+        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
 
 def suite():
@@ -793,7 +885,7 @@ def suite():
     #testcases = [WorkMapperTest, FreebaseBooksTest, OpenLibraryTest, GoogleBooksTest]
     testcases = []
     suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
-    suites.addTest(LibraryThingTest('test_whatwork'))
+    suites.addTest(LibraryThingTest('test_cache'))
     #suites.addTest(SettingsTest('test_dev_me_alignment'))  # give option to test this alignment
     return suites
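
With testcases left empty, suite() now registers only test_cache. A sketch of running it with the stock unittest runner:

    import unittest

    # Exercises the new LibraryThing caching path end to end.
    unittest.TextTestRunner(verbosity=2).run(suite())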

View File

@@ -85,25 +85,99 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
     else:
         logger.info("%d null seed_isbn: ebook %s", i, ebook)
 
-def cluster_status():
+def cluster_status(max_num=None):
     """Look at the current Work, Edition instances to figure out what needs to be fixed"""
     results = OrderedDict([
         ('number of Works', models.Work.objects.count()),
-        ('number of Editions', models.Edition.objects.count())
+        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
+        ('number of Editions', models.Edition.objects.count()),
+        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
+        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
         ('number of Editions that have both Google Books id and ISBNs',
             models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
         ('number of Editions with Google Books IDs but not ISBNs',
             models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
     ])
-    # What needs to be done to recluster editions?
+    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
+    # 4 classes -- Editions have an ISBN or not & the ISBN is recognized or not by LT:
+    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
+    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether there are any related objects before deleting
     # Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books; are they all singletons?
     # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
     # an ISBN tied to that Google Books ID)
-    return results
+
+    from collections import defaultdict
+    from collections import namedtuple
+    import shutil
+    import time
+
+    # let's form a key to map all the Editions into:
+    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None, else None), ed_id (or None))
+    work_clusters = defaultdict(set)
+    current_map = defaultdict(set)
+
+    backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
+    fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
+
+    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
+
+    shutil.copy(fname, backup)
+    lt = bookdata.LibraryThing(fname)
+    try:
+        input_file = open(fname, "r")
+        success = lt.load()
+        print "success: %s" % (success)
+        input_file.close()
+    except Exception, e:
+        print e
+
+    for (i, (isbn, ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
+                'edition__title', 'edition__work__id', 'edition__work__language'), max_num)):
+        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
+        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
+        print i, isbn, lt_work_id, key
+        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    lt.save()
+
+    # Now add the Editions without any ISBNs
+    print "editions w/o isbn"
+    for (i, (ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
+                'title', 'work__id', 'work__language'), None)):
+        key = (None, lang, None, ed_id)
+        print i, ed_id, ed_title, key
+        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    print "number of clusters", len(work_clusters)
+
+    s = {'work_clusters': work_clusters, 'current_map': current_map, 'results': results}
+
+    print "new clusters that map over more than one existing Work", \
+        [(k, len(set([e.work_id for e in v]))) for (k, v) in s['work_clusters'].items()
+            if len(set([e.work_id for e in v])) != 1]
+
+    m = current_map
+    print "existing Works that contain editions from more than 1 new cluster", \
+        sorted([k for (k, v) in m.items() if len(v) > 1])
+
+    return s
+
+def all_editions():
+    pass
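
The heart of cluster_status() is the clustering key: editions whose ISBNs LibraryThing maps to the same work (in the same language) collapse into one cluster; ISBNs LT does not recognize stay as singletons keyed by ISBN; editions without any ISBN stay as singletons keyed by edition id. A small worked example with made-up ids and ISBNs:

    from collections import defaultdict, namedtuple

    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
    work_clusters = defaultdict(set)

    # key = (lt_work_id, lang, isbn-if-LT-unrecognized, ed_id-if-no-isbn); all values illustrative
    work_clusters[('2199', 'en', None, None)].add(
        EdInfo('9781111111111', 1, 'Hamlet', 10, 'en'))            # LT knows this ISBN
    work_clusters[('2199', 'en', None, None)].add(
        EdInfo('9782222222222', 2, 'Hamlet (reprint)', 11, 'en'))  # same LT work -> same cluster
    work_clusters[(None, 'en', '9783333333333', None)].add(
        EdInfo('9783333333333', 3, 'Obscure Title', 12, 'en'))     # LT-unrecognized ISBN -> singleton
    work_clusters[(None, 'en', None, 4)].add(
        EdInfo(None, 4, 'No-ISBN Edition', 13, 'en'))              # no ISBN -> keyed by edition id

    # clusters spanning more than one existing Work (editions 1 and 2 live under
    # Works 10 and 11 but share one new cluster) are the merge candidates:
    print [(k, len(set(e.work_id for e in v)))
           for (k, v) in work_clusters.items()
           if len(set(e.work_id for e in v)) != 1]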