Merge branch 'production'

pull/1/head
Raymond Yee 2012-02-24 14:56:45 -08:00
commit 538d225ad0
5 changed files with 345 additions and 64 deletions

View File

@@ -636,6 +636,7 @@ def add_missing_isbn_to_editions(max_num=None, confirm=False):
        'no_isbn_found': no_isbn_found,
        'editions_to_merge': editions_to_merge,
        'exceptions': exceptions,
        'google_id_not_found': google_id_not_found,
        'confirm': ok
    }

View File

@@ -0,0 +1,34 @@
"""
Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been
affected.
"""
from django.core.management.base import BaseCommand
from regluit.test import booktests
class Command(BaseCommand):
    help = "Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been affected."
    args = "<do>"

    def handle(self, do, **options):
        try:
            do = str(do)
            if do.lower() == 'true':
                do = True
            else:
                do = False
        except:
            do = False

        print "before..."
        s = booktests.cluster_status()
        print s['results']

        booktests.clean_frankenworks(s, do=do)

        s = booktests.cluster_status()
        print "after cleanup...."
        print "results ", s['results']
        print "scattered clusters ", s['scattered_clusters']
        print "franken works", s['franken_works']
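A minimal invocation sketch: the management command's name comes from the new file's name, which this diff view does not show, so 'clean_frankenworks' below is an assumption; the single argument must be the string 'true' to actually delete and recluster, anything else gives a dry run.

from django.core.management import call_command

# hypothetical command name -- the real name is whatever the new management command file is called
call_command('clean_frankenworks', 'false')   # dry run: report affected works and email addresses only
call_command('clean_frankenworks', 'true')    # delete the Frankenworks and their comments, then recluster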

View File

@@ -11,6 +11,7 @@ import random
random.seed()
import sys, os
import json
# a kludge to allow for isbn.py to be imported
# and not just in the context of the regluit Django app
@@ -436,15 +437,14 @@ class FreebaseBooks(object):
self.freebase.login(username,password)
def books(self):
MQL = u"""[{
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]
""".replace("\n"," ")
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]""".replace("\n"," ")
query = json.loads(MQL)
resp = self.freebase.mqlreaditer(query)
for r in resp:
@@ -452,18 +452,17 @@ class FreebaseBooks(object):
def book_editions(self):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
resp = self.freebase.mqlreaditer(query)
for r in resp:
@@ -471,18 +470,17 @@ class FreebaseBooks(object):
def editions_for_book(self, book_id):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
query[0]["book"]["id"] = book_id
resp = self.freebase.mqlreaditer(query)
@@ -491,18 +489,17 @@ class FreebaseBooks(object):
def book_edition_by_id(self,id,id_type):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
if id_type == 'isbn':
query[0][id_type][0].setdefault('name', id)
@@ -526,18 +523,18 @@ class FreebaseBooks(object):
elif isbn_val is not None:
isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
MQL = """[{
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
query = json.loads(MQL)
query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
resp = self.freebase.mqlreaditer(query)
@@ -565,7 +562,91 @@ class WorkMapper(object):
yield work_id
if not complete_search:
raise StopIteration()
class LibraryThing(object):
    """
    Provide cached access to thingisbn and LT whatwork interface. Allow for a cache file to be loaded and saved
    """
    def __init__(self, fname=None):
        self.__isbn_to_work_id = {}
        self.fname = fname

    def __del__(self):
        self.save()

    def thingisbn(self, isbn, return_work_id=False):
        """ if return_work_id is True, we won't try to calculate all the relevant isbns"""
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is None: return []
        # check to see whether we have isbn already
        if isbn in self.__isbn_to_work_id:
            # return all isbns with the work id
            # print "%s already cached" % (isbn)
            work_id = self.__isbn_to_work_id.get(isbn)
            if return_work_id:
                return work_id
            if work_id is not None:
                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
            else:
                return []
        else:
            # if isbn is not already cached, do look up and cache the results and return the results
            print "calling thingisbn for %s" % (isbn)
            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn (isbn)]
            if len(results):
                # look up the librarything work id
                work_id = self.whatwork(isbn)
                if work_id is not None: # which should be the case since results is not zero-length
                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
                else:
                    logger.exception("work_id should not be None for isbn %s", isbn)
                    return []
            else:
                self.__isbn_to_work_id[isbn] = None # mark as not recognized by LT
                work_id = None
            if return_work_id:
                return work_id
            else:
                return results

    def whatwork(self, isbn=None, title=None, author=None):
        # if isbn is not None and title, author None then look up results, otherwise just pass along to lt_whatwork
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is not None and (title is None and author is None):
            if isbn in self.__isbn_to_work_id:
                work_id = self.__isbn_to_work_id.get(isbn)
            else:
                work_id = lt_whatwork(isbn=isbn)
                self.__isbn_to_work_id[isbn] = work_id
            return work_id
        else:
            return lt_whatwork(isbn=isbn, title=title, author=author)

    def load(self):
        try:
            f = open(self.fname)
            input_data = json.load(f)
            f.close()
            if isinstance(input_data, dict):
                self.__isbn_to_work_id = input_data
                return True
            else:
                return False
        except Exception, e:
            print e

    def save(self):
        if self.fname is not None:
            f = open(self.fname, "w")
            json.dump(self.__isbn_to_work_id, f)
            f.close()
            return True
        else:
            return False
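A brief usage sketch of the caching wrapper above (the cache path is an arbitrary example, and SURFACING_ISBN is the module constant exercised by the tests later in this file):

lt = LibraryThing(fname='/tmp/lt_cache.json')    # hypothetical cache location
lt.load()                                        # prints the error and carries on if no cache file exists yet
sibling_isbns = lt.thingisbn(SURFACING_ISBN)     # first call queries LT; repeat calls hit the cache
lt_work_id = lt.thingisbn(SURFACING_ISBN, return_work_id=True)
lt.save()                                        # persist the isbn -> work_id map to fname as JSON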
def look_up_my_zotero_books_in_hathi():
    from regluit.experimental.zotero_books import MyZotero
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
        self.assertEqual(work_id, SURFACING_LT_WORK_ID)
        work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
        self.assertEqual(work_id, '2199')

    def test_cache(self):
        lt = LibraryThing()
        res = lt.thingisbn(SURFACING_ISBN)
        res2 = lt.thingisbn(SURFACING_ISBN)
        self.assertEqual(set(res), set(res2))
        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
def suite():
@@ -793,7 +885,7 @@ def suite():
    #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
    testcases = []
    suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
    suites.addTest(LibraryThingTest('test_whatwork'))
    suites.addTest(LibraryThingTest('test_cache'))
    #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
    return suites
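For reference, the trimmed-down suite can presumably be run with the stock unittest runner:

import unittest
unittest.TextTestRunner(verbosity=2).run(suite())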

Binary file not shown.

View File

@@ -1,12 +1,15 @@
from regluit.core import librarything, bookloader, models, tasks
from collections import OrderedDict
from itertools import izip, islice
from collections import OrderedDict, defaultdict, namedtuple
from itertools import izip, islice, repeat
import django
from django.db.models import Q, F
from regluit.core import bookloader
from django.contrib.comments.models import Comment
import warnings
import datetime
from regluit import experimental
from regluit.experimental import bookdata
from datetime import datetime
import json
@@ -14,6 +17,20 @@ import json
import logging
logger = logging.getLogger(__name__)
def dictset(itertuple):
    s = defaultdict(set)
    for (k, v) in itertuple:
        s[k].add(v)
    return s

def dictlist(itertuple):
    d = defaultdict(list)
    for (k, v) in itertuple:
        d[k].append(v)
    return d

EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])
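For concreteness, a tiny worked example of the two helpers above (values chosen arbitrarily):

pairs = [('a', 1), ('a', 1), ('b', 2)]
dictset(pairs)    # defaultdict of sets, roughly {'a': set([1]), 'b': set([2])}
dictlist(pairs)   # defaultdict of lists, roughly {'a': [1, 1], 'b': [2]}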
def ry_lt_books():
    """return parsing of rdhyee's LibraryThing collection"""
    lt = librarything.LibraryThing('rdhyee')
@@ -85,26 +102,163 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
else:
logger.info("%d null seed_isbn: ebook %s", i, ebook)
def cluster_status():
def cluster_status(max_num=None):
    """Look at the current Work, Edition instances to figure out what needs to be fixed"""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Editions', models.Edition.objects.count())
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Edition that have both Google Books id and ISBNs',
            models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
            models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # What needs to be done to recluster editions?
    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting

    # Are there Edition without ISBNs? Look up the corresponding ISBNs from Google Books and Are they all singletons?
    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)

    import shutil
    import time
    import operator

    return results

    # let's form a key to map all the Editions into
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])
    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])

    shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)
    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception, e:
        print e

    for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
                'edition__title', 'edition__created', 'edition__work__id',
                'edition__work__created', 'edition__work__language'), max_num)):
        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
            work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                'title', 'created', 'work__id', 'work__created', 'work__language' ), None)):
        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii','ignore'), key
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
            work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)

    # all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"
    franken_works = sorted([k for (k,v) in current_map.items() if len(v) > 1])

    # let's calculate the list of users affected if delete the Frankenworks, the number of works deleted from their wishlist
    # specifically a list of emails to send out
    affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) if len(affected_works) else set()

    affected_emails = [w.user.email for w in affected_wishlists]
    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []

    # calculate the Comments that would have to be deleted too.
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []

    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add, [ list( izip([ed.ed_id for ed in eds], repeat(k))) for (k,eds) in work_clusters.items()]))

    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([ max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
            for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
        for w_id in franken_works]) if len(franken_works) else None

    scattered_clusters = [(k, len(set(([e.work_id for e in v])))) for (k,v) in work_clusters.items() if len(set(([e.work_id for e in v]))) <> 1 ]

    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results, 'franken_works': franken_works,
         'wcp':wcp, 'latest_franken_event': latest_franken_event, 'affected_works':affected_works,
         'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,
         'affected_emails': affected_emails}

    return s
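The core of the recomputation above is the grouping key (lt_work_id, lang, isbn when LT does not know the work, ed_id for ISBN-less editions); a self-contained toy illustration of that keying scheme with made-up IDs and ISBNs, not the production code path:

from collections import defaultdict, namedtuple

Ed = namedtuple('Ed', ['isbn', 'ed_id', 'work_id', 'lang', 'lt_work_id'])
editions = [Ed('9780000000001', 1, 10, 'en', 'lt1'),
            Ed('9780000000002', 2, 10, 'en', 'lt2'),   # same unglue.it Work, two different LT works
            Ed('9780000000003', 3, 11, 'en', None)]    # ISBN unknown to LT -> keyed by its own ISBN

clusters = defaultdict(set)
current_map = defaultdict(set)
for ed in editions:
    key = (ed.lt_work_id, ed.lang, ed.isbn if ed.lt_work_id is None else None, None)
    clusters[key].add(ed)
    current_map[ed.work_id].add(key)

franken_works = sorted([w for (w, keys) in current_map.items() if len(keys) > 1])
print franken_works   # [10]: Work 10 mixes editions from two clusters, so it is a FrankenWork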
def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected
    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])

    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']

    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()

    # delete the Frankenworks
    print "deleting Frankenworks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()

    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related
    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None

    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k,v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])

    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)
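Putting the pieces together, the intended flow (mirrored by the new management command earlier in this commit) appears to be a dry run, a manual review of the report, then a destructive pass; roughly:

s = cluster_status()
print s['results']
print s['affected_emails']       # accounts to notify before anything is deleted
clean_frankenworks(s, do=False)  # dry run: report only
clean_frankenworks(s, do=True)   # delete the affected comments and works, then recluster via bookloader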