Now I have booktests to recalculate clusters

pull/1/head
Raymond Yee 2012-02-17 10:30:09 -08:00
parent 09ab830c20
commit 2e079b2c2e
2 changed files with 228 additions and 62 deletions

View File

@@ -11,6 +11,7 @@ import random
 random.seed()
 import sys, os
+import json
 # a kludge to allow for isbn.py to be imported
 # and not just in the context of the regluit Django app
@@ -436,15 +437,14 @@ class FreebaseBooks(object):
         self.freebase.login(username,password)
     def books(self):
         MQL = u"""[{
-    "type": "/book/book",
-    "id": null,
-    "key": [{
-        "namespace": "/wikipedia/en",
-        "value": null,
-        "type": "/type/key"
-    }]
-}]
-""".replace("\n"," ")
+    "type": "/book/book",
+    "id": null,
+    "key": [{
+        "namespace": "/wikipedia/en",
+        "value": null,
+        "type": "/type/key"
+    }]
+}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
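This pattern recurs through all the FreebaseBooks methods touched below: an MQL query is kept as a JSON string template, flattened with .replace("\n"," "), parsed with json.loads into an ordinary Python structure, and streamed through the client's mqlreaditer(). A minimal sketch of the round trip, assuming fb is an authenticated freebase.api client like the one login() sets up (the consumer function is hypothetical):

    import json

    MQL = u"""[{
        "type": "/book/book",
        "id": null
    }]""".replace("\n", " ")    # flatten so the literal survives reformatting

    query = json.loads(MQL)     # now a plain list/dict structure, safe to mutate
    # for r in fb.mqlreaditer(query):   # cursors through all result pages
    #     use(r['id'])                  # hypothetical consumer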
@@ -452,18 +452,17 @@ class FreebaseBooks(object):
     def book_editions(self):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         resp = self.freebase.mqlreaditer(query)
         for r in resp:
@@ -471,18 +470,17 @@ class FreebaseBooks(object):
     def editions_for_book(self, book_id):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["id"] = book_id
         resp = self.freebase.mqlreaditer(query)
@@ -491,18 +489,17 @@ class FreebaseBooks(object):
     def book_edition_by_id(self,id,id_type):
         MQL = u"""[{
-    "type": "/book/book_edition",
-    "id": null,
-    "isbn": [{}],
-    "ISBN": [{}],
-    "LCCN": [{}],
-    "OCLC_number": [{}],
-    "openlibrary_id": [{}],
-    "book": {
-        "id": null,
-        "name": null
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "id": null,
+    "isbn": [{}],
+    "ISBN": [{}],
+    "LCCN": [{}],
+    "OCLC_number": [{}],
+    "openlibrary_id": [{}],
+    "book": {
+        "id": null,
+        "name": null
+    }}]""".replace("\n"," ")
         query = json.loads(MQL)
         if id_type == 'isbn':
             query[0][id_type][0].setdefault('name', id)
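Because json.loads returns a fresh mutable structure, each method parameterizes the parsed copy rather than string-splicing values into the MQL source. A sketch of the idiom with a hypothetical ISBN (the non-isbn branches of book_edition_by_id are cut off by this hunk, so only the shown branch is reproduced):

    query = json.loads(MQL)      # fresh copy; the string template stays pristine
    query[0]['isbn'][0].setdefault('name', '9780446675505')   # hypothetical ISBN-13
    # setdefault only fills the slot if the template did not already carry a value,
    # which is equivalent to plain assignment for these null-valued templates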
@@ -526,18 +523,18 @@ class FreebaseBooks(object):
         elif isbn_val is not None:
             isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
         MQL = """[{
-    "type": "/book/book_edition",
-    "isbn": {
-        "name": null
-    },
-    "book": {
-        "editions": [{
-            "isbn": {
-                "name": null
-            }
-        }]
-    }
-}]""".replace("\n"," ")
+    "type": "/book/book_edition",
+    "isbn": {
+        "name": null
+    },
+    "book": {
+        "editions": [{
+            "isbn": {
+                "name": null
+            }
+        }]
+    }
+}]""".replace("\n"," ")
         query = json.loads(MQL)
         query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
         resp = self.freebase.mqlreaditer(query)
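Both this lookup and the LibraryThing cache added below funnel every ISBN through the same normalization: isbn_mod (regluit's isbn.py, imported via the kludge at the top of the file) converts any ISBN-10 to its 13-digit form, so both spellings of an ISBN land on the same key. A sketch with a hypothetical but checksum-consistent pair:

    isbn10 = '0446675504'                            # hypothetical ISBN-10
    isbn13 = isbn_mod.ISBN(isbn10).to_string('13')   # -> '9780446675505'
    # to_string('13') yields None for an unparseable ISBN, which is why
    # LibraryThing.thingisbn below guards with "if isbn is None: return []"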
@@ -565,7 +562,91 @@ class WorkMapper(object):
         yield work_id
         if not complete_search:
             raise StopIteration()
+
+class LibraryThing(object):
+    """
+    Provide cached access to thingisbn and LT whatwork interface. Allow for a cache file to be loaded and saved
+    """
+    def __init__(self, fname=None):
+        self.__isbn_to_work_id = {}
+        self.fname = fname
+    def __del__(self):
+        self.save()
+    def thingisbn(self, isbn, return_work_id=False):
+        """ if return_work_id is True, we won't try to calculate all the relevant isbns"""
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is None: return []
+        # check to see whether we have isbn already
+        if isbn in self.__isbn_to_work_id:
+            # return all isbns with the work id
+            # print "%s already cached" % (isbn)
+            work_id = self.__isbn_to_work_id.get(isbn)
+            if return_work_id:
+                return work_id
+            if work_id is not None:
+                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
+            else:
+                return []
+        else:
+            # if isbn is not already cached, do look up and cache the results and return the results
+            print "calling thingisbn for %s" % (isbn)
+            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
+            if len(results):
+                # look up the librarything work id
+                work_id = self.whatwork(isbn)
+                if work_id is not None: # which should be the case since results is not zero-length
+                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
+                else:
+                    logger.exception("work_id should not be None for isbn %s", isbn)
+                    return []
+            else:
+                self.__isbn_to_work_id[isbn] = None # mark as not recognized by LT
+                work_id = None
+            if return_work_id:
+                return work_id
+            else:
+                return results
+    def whatwork(self, isbn=None, title=None, author=None):
+        # if isbn is not None and title, author None then look up results, otherwise just pass along to lt_whatwork
+        # first, normalize the isbn
+        isbn = isbn_mod.ISBN(isbn).to_string('13')
+        if isbn is not None and (title is None and author is None):
+            if isbn in self.__isbn_to_work_id:
+                work_id = self.__isbn_to_work_id.get(isbn)
+            else:
+                work_id = lt_whatwork(isbn=isbn)
+                self.__isbn_to_work_id[isbn] = work_id
+            return work_id
+        else:
+            return lt_whatwork(isbn=isbn, title=title, author=author)
+    def load(self):
+        try:
+            f = open(self.fname)
+            input_data = json.load(f)
+            f.close()
+            if isinstance(input_data, dict):
+                self.__isbn_to_work_id = input_data
+                return True
+            else:
+                return False
+        except Exception, e:
+            print e
+    def save(self):
+        if self.fname is not None:
+            f = open(self.fname, "w")
+            json.dump(self.__isbn_to_work_id, f)
+            f.close()
+            return True
+        else:
+            return False
 
 def look_up_my_zotero_books_in_hathi():
     from regluit.experimental.zotero_books import MyZotero
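Taken together, the new class gives thingisbn/whatwork a write-through JSON cache keyed by ISBN-13. A minimal usage sketch (the cache path is hypothetical); note that save() is also attempted from __del__, and that whatwork() normalizes its isbn argument even when only title/author are supplied, so isbn.py has to tolerate a None input there:

    lt = LibraryThing('/tmp/lt_data.json')     # hypothetical cache file
    lt.load()                                  # True if an existing dict was read
    isbns = lt.thingisbn('9780446675505')      # first call hits LT; repeats are cached
    work_id = lt.thingisbn('9780446675505', return_work_id=True)
    lt.save()                                  # persist the isbn -> work_id map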
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
         self.assertEqual(work_id, SURFACING_LT_WORK_ID)
         work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
         self.assertEqual(work_id, '2199')
+    def test_cache(self):
+        lt = LibraryThing()
+        res = lt.thingisbn(SURFACING_ISBN)
+        res2 = lt.thingisbn(SURFACING_ISBN)
+        self.assertEqual(set(res), set(res2))
+        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
+        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
 
 def suite():
@@ -793,7 +885,7 @@ def suite():
     #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
     testcases = []
     suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
-    suites.addTest(LibraryThingTest('test_whatwork'))
+    suites.addTest(LibraryThingTest('test_cache'))
    #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
     return suites
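suite() narrows the run to test_cache while the heavier test cases stay commented out; driving it with the stock unittest text runner would look like:

    if __name__ == '__main__':
        unittest.TextTestRunner(verbosity=2).run(suite())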

View File

@@ -85,25 +85,99 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
     else:
         logger.info("%d null seed_isbn: ebook %s", i, ebook)
 
-def cluster_status():
+def cluster_status(max_num=None):
     """Look at the current Work, Edition instances to figure out what needs to be fixed"""
     results = OrderedDict([
         ('number of Works', models.Work.objects.count()),
-        ('number of Editions', models.Edition.objects.count())
+        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
+        ('number of Editions', models.Edition.objects.count()),
+        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
+        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
+        ('number of Edition that have both Google Books id and ISBNs',
+         models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
+        ('number of Editions with Google Books IDs but not ISBNs',
+         models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
     ])
 
     # What needs to be done to recluster editions?
     # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
     # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
     # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
     # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting
     # Are there Edition without ISBNs? Look up the corresponding ISBNs from Google Books and Are they all singletons?
     # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
     # an ISBN tied to that Google Books ID)
 
+    from collections import defaultdict
+    import shutil
+    import time
+    from collections import namedtuple
 
-    return results
+    # let's form a key to map all the Editions into
+    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )
+    work_clusters = defaultdict(set)
+    current_map = defaultdict(set)
+
+    backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
+    fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
+
+    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
+
+    shutil.copy(fname, backup)
+    lt = bookdata.LibraryThing(fname)
+    try:
+        input_file = open(fname, "r")
+        success = lt.load()
+        print "success: %s" % (success)
+        input_file.close()
+    except Exception, e:
+        print e
+
+    for (i, (isbn, ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
+                   'edition__title', 'edition__work__id', 'edition__work__language'), max_num)):
+        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
+        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
+        print i, isbn, lt_work_id, key
+        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    lt.save()
+
+    # Now add the Editions without any ISBNs
+    print "editions w/o isbn"
+    for (i, (ed_id, ed_title, work_id, lang)) in enumerate(
+            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
+                   'title', 'work__id', 'work__language'), None)):
+        key = (None, lang, None, ed_id)
+        print i, ed_id, ed_title, key
+        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, work_id=work_id, lang=lang))
+        current_map[work_id].add(key)
+
+    print "number of clusters", len(work_clusters)
+
+    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results}
+
+    #
+    print "new clusters that map over more than one existing Work", \
+        [(k, len(set(([e.work_id for e in v])))) for (k,v) in s['work_clusters'].items() if len(set(([e.work_id for e in v]))) <> 1 ]
+
+    m = current_map
+    print "existing Works that contain editions from more than 1 new cluster", \
+        sorted([k for (k,v) in m.items() if len(v) > 1])
+
+    return s
 
 def all_editions():
     pass
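The heart of cluster_status() is the four-way key (lt_work_id, lang, isbn, ed_id): editions whose ISBNs LibraryThing recognizes collapse onto their LT work id; an ISBN LT does not know keeps its edition in a singleton cluster keyed by the ISBN itself; and ISBN-less editions are singletons keyed by edition id. A toy illustration of the grouping, with entirely hypothetical editions and LT answers:

    from collections import defaultdict, namedtuple

    EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'work_id', 'lang'])
    work_clusters = defaultdict(set)

    # hypothetical editions and LT responses
    eds = [
        EdInfo('9780446675505', 1, 'Surfacing', 10, 'en'),
        EdInfo('9780385491051', 2, 'Surfacing', 11, 'en'),
        EdInfo('9781111111111', 3, 'Obscure', 12, 'en'),   # not known to LT
        EdInfo(None, 4, 'No ISBN', 13, 'en'),              # no ISBN at all
    ]
    lt_ids = {'9780446675505': '2830', '9780385491051': '2830', '9781111111111': None}

    for ed in eds:
        lt_work_id = lt_ids.get(ed.isbn)
        if ed.isbn is not None:
            key = (lt_work_id, ed.lang, ed.isbn if lt_work_id is None else None, None)
        else:
            key = (None, ed.lang, None, ed.ed_id)
        work_clusters[key].add(ed)

    # len(work_clusters) == 3: editions 1 and 2 merge under LT work '2830',
    # while the unrecognized ISBN and the ISBN-less edition stay singletons.

The two reports printed at the end of cluster_status() then read this structure both ways: clusters whose members span more than one existing Work are candidates for merging, and existing Works whose editions scatter across more than one cluster are candidates for splitting.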