Code to repick the seed ISBN, to find ISBNs that are more likely to be found in a wide variety of data sources.

pull/1/head
Raymond Yee 2012-02-27 08:46:34 -08:00
parent f7220d9812
commit 86fb15b8bc
1 changed files with 117 additions and 6 deletions

View File

@ -19,7 +19,7 @@ from urllib import urlencode
from pprint import pprint from pprint import pprint
from collections import defaultdict, OrderedDict from collections import defaultdict, OrderedDict
from itertools import islice, chain, izip from itertools import islice, chain, izip, repeat
import operator import operator
import time import time
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
if len(page) == page_size: if len(page) == page_size:
yield page yield page
page= [] page= []
yield page if len(page):
yield page
def singleton(cls): def singleton(cls):
instances = {} instances = {}
@ -103,6 +104,8 @@ class SeedISBN(Base):
id = Column(u'id', Integer(11), primary_key=True, nullable=False) id = Column(u'id', Integer(11), primary_key=True, nullable=False)
results = Column(u'results', MEDIUMTEXT()) results = Column(u'results', MEDIUMTEXT())
seed_isbn = Column(u'seed_isbn', String(length=13)) seed_isbn = Column(u'seed_isbn', String(length=13))
title = Column(u'title', Text())
title_error = Column(u'title_error', Text())
class GutenbergText(object): class GutenbergText(object):
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
'len_all_isbns': len(all_isbns)} 'len_all_isbns': len(all_isbns)}
return (candidate_seed_isbn, details) return (candidate_seed_isbn, details)
def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Return the list of ISBNs for the requested language subcluster within the
    largest LibraryThing cluster that has such a subcluster.

    lang: language code (e.g. 'en') to look up in each cluster
    lt_clusters_by_lang: iterable of dicts mapping language code -> list of ISBNs

    Returns [] when no cluster contains the requested language.
    """
    # Pair each cluster's lang-specific ISBN list with the cluster's *total*
    # size across all languages -- the total size decides which cluster wins.
    candidate_subclusters = [(c.get(lang), sum(len(isbns) for isbns in c.values()))
                             for c in lt_clusters_by_lang
                             if c.get(lang) is not None]
    if len(candidate_subclusters):
        # Largest cluster by total size; element [0] of the pair is the ISBN
        # list for the requested language (mirrors repick_seed_isbn's logic).
        candidate_subcluster = max(candidate_subclusters, key=lambda x: x[1])[0]
    else:
        candidate_subcluster = []
    # bug fix: the original returned the undefined name candidate_seed_isbn,
    # which raised NameError on every call
    return candidate_subcluster
def report_on_seed_isbn(seed_isbn_result): def report_on_seed_isbn(seed_isbn_result):
"""
return a dictionary interpreting the output of the seed isbn calculation
"""
s = seed_isbn_result s = seed_isbn_result
# what proportion of all the ISBNS does the largest cluster make of all the ISBNs
# x is an iterable of cluster lengths
dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
report = OrderedDict([ report = OrderedDict([
("seed isbn", s[0]), ("seed isbn", s[0]),
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])), ("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
for c in s[1]['lt_clusters_by_lang']]), for c in s[1]['lt_clusters_by_lang']]),
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x, ("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \ reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
if s[0] is not None else None) if s[0] is not None else None),
("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
]) ])
return report return report
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
def reports_in_db(max=None): def reports_in_db(max=None):
"""
a generator of all the Gutenberg seed isbn calculations
"""
gluejar_db = GluejarDB() gluejar_db = GluejarDB()
gutenberg_done = gluejar_db.session.query(SeedISBN).all() gutenberg_done = gluejar_db.session.query(SeedISBN).all()
for s in islice(gutenberg_done, max): for s in islice(gutenberg_done, max):
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
return json.dumps(obj) return json.dumps(obj)
def calc_titles_for_seed_isbns(max_num=None, do=False):
"""
For the seedisbns, calculate the titles
"""
db = GluejarDB()
# title is Null and title_error is Null
#titles_to_calc = db.session.query(SeedISBN).filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()
titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id). \
filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()
page_size = 5
for page in grouper(islice(titles_to_calc, max_num), page_size):
query = list(izip([edition.seed_isbn for (edition, lang, gt_title) in page], repeat('isbn')))
try:
res = OpenLibrary.read(query)
except Exception, e:
print e
for (edition, lang, gt_title) in page:
title_error = None
try:
title = res.get('isbn:{0}'.format(edition.seed_isbn))['records'].values()[0]['data']['title']
except Exception, e:
title = None
title_error = str(e)
if do and title is not None:
edition.title = title
edition.title_error = title_error
db.commit_db()
yield (edition.seed_isbn, title)
def repick_seed_isbn(max_num=None, do=False, print_progress=False):
    """
    Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible

    max_num: maximum number of previously computed seed-isbn rows to examine (None = all)
    do: if True, write the newly chosen seed isbn back to the database
    print_progress: if True, print one progress line per row

    Yields (gutenberg_etext_id, seed_isbn, lang, gt_title, seeds,
    current_seed_ok, new_seed_isbn) for each row examined.
    """
    gluejar_db = GluejarDB()
    gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
    # need to join with GutenbergText table to get lang and Gutenberg title
    for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
        # calculate the dominant cluster
        # s.results holds the JSON-serialized output of the original seed-isbn
        # calculation; results[1] is its details dict
        results = json.loads(s.results)
        # pair each cluster's subcluster for this language with the cluster's
        # total ISBN count across all languages
        candidate_subclusters = filter(lambda x: x[0] is not None,
                 [(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
        )
        # remember that the cluster is the first element in the tuple and a length in the 2nd element
        if len(candidate_subclusters):
            candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
        else:
            candidate_subcluster = set([])
        # confirm that the current seed isbn is in the candidate subcluster
        current_seed_ok = s.seed_isbn in candidate_subcluster
        # see whether we can get a seed isbn that, in addition to LibraryThing,
        # is recognized by OpenLibrary and Freebase too...2nd priority
        # is just OL, 3rd is Freebase and the 4th) just LT
        fb_isbns = set(results[1]['fb_isbns'])
        ol_isbns = set(results[1]['ol_isbns'])
        # `or`-chaining returns the first non-empty intersection, giving the
        # LT&FB&OL / LT&OL / LT&FB / LT priority order described above
        seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
                (candidate_subcluster & fb_isbns) or candidate_subcluster
        new_seed_isbn = None
        if do and len(seeds):
            # NOTE(review): set.pop() removes an arbitrary member, so the
            # chosen seed isbn is not deterministic within the winning tier
            new_seed_isbn = seeds.pop()
            s.seed_isbn = new_seed_isbn
            gluejar_db.commit_db()
        if print_progress:
            print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
        yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
class FreebaseClient(object): class FreebaseClient(object):
def __init__(self, username=None, password=None, main_or_sandbox='main'): def __init__(self, username=None, password=None, main_or_sandbox='main'):
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
#unittest.main() #unittest.main()
print list(gutenberg_and_seed_isbn(max=10)) #print list(gutenberg_and_seed_isbn(max=10))
print list(repick_seed_isbn(10))
#suites = suite() #suites = suite()
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__')) #suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))