Code to repick the seed isbn to find isbns that are more likely to be found in a wide variety of data sources

2012-02-27 08:46:34 -08:00 · 2012-02-27 08:46:34 -08:00 · 86fb15b8bc
parent f7220d9812
commit 86fb15b8bc
1 changed files with 117 additions and 6 deletions
--- a/experimental/gutenberg/gutenberg.py
+++ b/experimental/gutenberg/gutenberg.py
@ -19,7 +19,7 @@ from urllib import urlencode
 from pprint import pprint
 from collections import defaultdict, OrderedDict
-from itertools import islice, chain, izip
+from itertools import islice, chain, izip, repeat
 import operator
 import time
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
        if len(page) == page_size:
            yield page
            page= []
-    yield page
+    if len(page):
        yield page
 def singleton(cls):
    instances = {}
@ -103,6 +104,8 @@ class SeedISBN(Base):
    id = Column(u'id', Integer(11), primary_key=True, nullable=False)
    results = Column(u'results', MEDIUMTEXT())
    seed_isbn = Column(u'seed_isbn', String(length=13))
    title = Column(u'title', Text())
    title_error = Column(u'title_error', Text())
 class GutenbergText(object):
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
               'len_all_isbns': len(all_isbns)}
    return (candidate_seed_isbn, details)
 def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Return the `lang` subcluster of the largest cluster that has such a language subcluster.

    lt_clusters_by_lang -- an iterable of dicts, one per cluster, each mapping a language key
                           to a list (presumably of ISBNs -- confirm against the caller)
    lang                -- the language key to look for in each cluster

    The size of a cluster is the total number of entries over all of its languages.  Among
    the clusters that have a `lang` entry, the first one of maximal size wins.

    Returns the list stored under `lang` in the winning cluster, or [] if no cluster has a
    `lang` subcluster.
    """
    # pair each matching subcluster with the total size of the cluster it belongs to
    sized_subclusters = [(c[lang], sum(len(isbns) for isbns in c.values()))
                         for c in lt_clusters_by_lang
                         if c.get(lang) is not None]
    if not sized_subclusters:
        return []
    # the subcluster is the first element of the tuple, the cluster size the second
    return max(sized_subclusters, key=lambda x: x[1])[0]
 def report_on_seed_isbn(seed_isbn_result):
    """
    return a dictionary interpreting the output of the seed isbn calculation
    """
    s = seed_isbn_result
    # what proportion of all the ISBNS does the largest cluster make of all the ISBNs
    # x is an iterable of cluster lengths
    dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
    report = OrderedDict([
        ("seed isbn",  s[0]),
        ("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
            for c in s[1]['lt_clusters_by_lang']]),
        ("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
                reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
-                if s[0] is not None else None)
+                if s[0] is not None else None),
        ("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
    ])
    return report
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
 def reports_in_db(max=None):
-    
+    """
    a generator of all the Gutenberg seed isbn calculations 
    """
    gluejar_db = GluejarDB()
    gutenberg_done = gluejar_db.session.query(SeedISBN).all()
    for s in islice(gutenberg_done, max):
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
    return json.dumps(obj)
 def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    For the seed ISBNs that have neither a title nor a title_error recorded, look up the
    title through OpenLibrary.read.

    max_num -- maximum number of rows to process (None means all of them)
    do      -- if True, store each title that is found on its SeedISBN row and commit

    This is a generator: it yields (seed_isbn, title) for every row processed, where title
    is None if no title could be extracted for that ISBN.
    """
    db = GluejarDB()
    # title is Null and title_error is Null
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).  \
        filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()
    # ask OpenLibrary about a handful of ISBNs per request
    page_size = 5
    for page in grouper(islice(titles_to_calc, max_num), page_size):
        query = [(edition.seed_isbn, 'isbn') for (edition, lang, gt_title) in page]
        # a failed read must not leave `res` unbound (first page) or pointing at the
        # response of the previous page, so track the failure explicitly
        res = None
        read_error = None
        try:
            res = OpenLibrary.read(query)
        except Exception as e:
            print(e)
            read_error = str(e)
        for (edition, lang, gt_title) in page:
            title = None
            title_error = read_error
            if read_error is None:
                try:
                    records = res.get('isbn:{0}'.format(edition.seed_isbn))['records']
                    # take the title of the first record returned for this ISBN
                    title = list(records.values())[0]['data']['title']
                except Exception as e:
                    title_error = str(e)
            # NOTE(review): the row is only written when a title was found, so title_error
            # is always stored as None here and failed lookups are retried on the next
            # run -- confirm whether errors were meant to be persisted as well
            if do and title is not None:
                edition.title = title
                edition.title_error = title_error
                db.commit_db()
            yield (edition.seed_isbn, title)
 def repick_seed_isbn(max_num=None, do=False, print_progress=False):
    """
    Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible
    """
    gluejar_db = GluejarDB()
    gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
    # need to join with GutenbergText table to get lang and Gutenberg title
    for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
        # calculate the dominant cluster
        results = json.loads(s.results)
        candidate_subclusters = filter(lambda x: x[0] is not None,
                               [(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
                        )
        # remember that the cluster is the first element in the tuple and a length in the 2nd element
        if len(candidate_subclusters):
            candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
        else:
            candidate_subcluster = set([])
        # confirm that the current seed isbn is in the candidate subcluster
        current_seed_ok = s.seed_isbn in candidate_subcluster
        # see whether we can get a seed isbn that, in addition to LibraryThing,
        # is recognized by OpenLibrary and Freebase too...2nd priority
        # is just OL, 3rd is Freebase and the 4th) just LT
        fb_isbns = set(results[1]['fb_isbns'])
        ol_isbns = set(results[1]['ol_isbns'])
        seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
            (candidate_subcluster & fb_isbns) or candidate_subcluster
        new_seed_isbn = None
        if do and len(seeds):
            new_seed_isbn = seeds.pop()
            s.seed_isbn = new_seed_isbn
            gluejar_db.commit_db()
        if print_progress:
            print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
        yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
 class FreebaseClient(object):
    def __init__(self, username=None, password=None, main_or_sandbox='main'):
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
    #unittest.main()
-    print list(gutenberg_and_seed_isbn(max=10))
+    #print list(gutenberg_and_seed_isbn(max=10))
-            
+     
    print list(repick_seed_isbn(10))      
    #suites = suite()
    #suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))