Code to repick the seed ISBN, to find ISBNs that are more likely to be found in a wide variety of data sources.

pull/1/head
Raymond Yee 2012-02-27 08:46:34 -08:00
parent f7220d9812
commit 86fb15b8bc
1 changed file with 117 additions and 6 deletions

View File

@ -19,7 +19,7 @@ from urllib import urlencode
from pprint import pprint
from collections import defaultdict, OrderedDict
from itertools import islice, chain, izip
from itertools import islice, chain, izip, repeat
import operator
import time
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
if len(page) == page_size:
yield page
page= []
yield page
if len(page):
yield page
def singleton(cls):
instances = {}
@ -103,6 +104,8 @@ class SeedISBN(Base):
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
results = Column(u'results', MEDIUMTEXT())
seed_isbn = Column(u'seed_isbn', String(length=13))
title = Column(u'title', Text())
title_error = Column(u'title_error', Text())
class GutenbergText(object):
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
'len_all_isbns': len(all_isbns)}
return (candidate_seed_isbn, details)
def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Return the list of ISBNs in the requested language subcluster within the
    largest cluster (by total ISBN count) that has such a language subcluster.

    lang: language code (e.g. 'en') to look up in each cluster dict
    lt_clusters_by_lang: iterable of dicts mapping language -> list of ISBNs

    Returns [] when no cluster contains the requested language.
    """
    # (subcluster for lang, total number of ISBNs in the whole cluster) for
    # each cluster that actually has a subcluster in the requested language
    candidate_subclusters = [(c.get(lang), sum(len(isbns) for isbns in c.values()))
                             for c in lt_clusters_by_lang if c.get(lang) is not None]
    if len(candidate_subclusters):
        # take the language subcluster belonging to the largest cluster overall
        candidate_subcluster = max(candidate_subclusters, key=lambda x: x[1])[0]
    else:
        candidate_subcluster = []
    # BUG FIX: the original returned the undefined name `candidate_seed_isbn`
    # (NameError on every call). Return the computed subcluster instead,
    # mirroring the inline logic in repick_seed_isbn.
    return candidate_subcluster
def report_on_seed_isbn(seed_isbn_result):
"""
return a dictionary interpreting the output of the seed isbn calculation
"""
s = seed_isbn_result
# what proportion of all the ISBNS does the largest cluster make of all the ISBNs
# x is an iterable of cluster lengths
dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
report = OrderedDict([
("seed isbn", s[0]),
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
for c in s[1]['lt_clusters_by_lang']]),
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
if s[0] is not None else None)
if s[0] is not None else None),
("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
])
return report
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
def reports_in_db(max=None):
"""
a generator of all the Gutenberg seed isbn calculations
"""
gluejar_db = GluejarDB()
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
for s in islice(gutenberg_done, max):
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
return json.dumps(obj)
def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    For the seedisbns, calculate the titles by looking each seed ISBN up in
    OpenLibrary.

    max_num: maximum number of rows to process (None = all)
    do: when True, write the looked-up title / title_error back to the db

    Yields (seed_isbn, title) tuples; title is None when the lookup failed.
    """
    db = GluejarDB()
    # only rows whose title and title_error are both still unset
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id). \
        filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()

    # query OpenLibrary in small batches
    page_size = 5
    for page in grouper(islice(titles_to_calc, max_num), page_size):
        query = list(izip([edition.seed_isbn for (edition, lang, gt_title) in page], repeat('isbn')))
        try:
            res = OpenLibrary.read(query)
        except Exception as e:
            # BUG FIX: the original printed the error and then fell through to
            # use `res`, which is undefined on the first page (NameError) and
            # stale on later pages; skip this page instead.
            print(e)
            continue
        for (edition, lang, gt_title) in page:
            title_error = None
            try:
                # dig the title out of the OpenLibrary read response
                title = res.get('isbn:{0}'.format(edition.seed_isbn))['records'].values()[0]['data']['title']
            except Exception as e:
                title = None
                title_error = str(e)
            if do and title is not None:
                edition.title = title
                edition.title_error = title_error
                db.commit_db()
            yield (edition.seed_isbn, title)
def repick_seed_isbn(max_num=None, do=False, print_progress=False):
"""
Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible
"""
gluejar_db = GluejarDB()
gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
# need to join with GutenbergText table to get lang and Gutenberg title
for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
# calculate the dominant cluster
results = json.loads(s.results)
candidate_subclusters = filter(lambda x: x[0] is not None,
[(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
)
# remember that the cluster is the first element in the tuple and a length in the 2nd element
if len(candidate_subclusters):
candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
else:
candidate_subcluster = set([])
# confirm that the current seed isbn is in the candidate subcluster
current_seed_ok = s.seed_isbn in candidate_subcluster
# see whether we can get a seed isbn that, in addition to LibraryThing,
# is recognized by OpenLibrary and Freebase too...2nd priority
# is just OL, 3rd is Freebase and the 4th) just LT
fb_isbns = set(results[1]['fb_isbns'])
ol_isbns = set(results[1]['ol_isbns'])
seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
(candidate_subcluster & fb_isbns) or candidate_subcluster
new_seed_isbn = None
if do and len(seeds):
new_seed_isbn = seeds.pop()
s.seed_isbn = new_seed_isbn
gluejar_db.commit_db()
if print_progress:
print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
class FreebaseClient(object):
def __init__(self, username=None, password=None, main_or_sandbox='main'):
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
#unittest.main()
print list(gutenberg_and_seed_isbn(max=10))
#print list(gutenberg_and_seed_isbn(max=10))
print list(repick_seed_isbn(10))
#suites = suite()
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))