Code to repick the seed ISBN, to find ISBNs that are more likely to be found in a wide variety of data sources.

pull/1/head
Raymond Yee 2012-02-27 08:46:34 -08:00
parent f7220d9812
commit 86fb15b8bc
1 changed file with 117 additions and 6 deletions

View File

@ -19,7 +19,7 @@ from urllib import urlencode
from pprint import pprint
from collections import defaultdict, OrderedDict
from itertools import islice, chain, izip
from itertools import islice, chain, izip, repeat
import operator
import time
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
if len(page) == page_size:
yield page
page= []
yield page
if len(page):
yield page
def singleton(cls):
instances = {}
@ -103,6 +104,8 @@ class SeedISBN(Base):
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
results = Column(u'results', MEDIUMTEXT())
seed_isbn = Column(u'seed_isbn', String(length=13))
title = Column(u'title', Text())
title_error = Column(u'title_error', Text())
class GutenbergText(object):
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
'len_all_isbns': len(all_isbns)}
return (candidate_seed_isbn, details)
def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Return the list of ISBNs in the requested language subcluster within the
    largest cluster (by total ISBN count) that has such a language subcluster.

    lang: language code (e.g. 'en') to look up in each cluster dict
    lt_clusters_by_lang: iterable of dicts mapping language -> list of ISBNs

    Returns [] when no cluster contains the requested language.
    """
    # (subcluster for lang, total number of ISBNs in the whole cluster) for
    # each cluster that actually has a subcluster in the requested language
    candidate_subclusters = [(c.get(lang), sum(len(isbns) for isbns in c.values()))
                             for c in lt_clusters_by_lang if c.get(lang) is not None]
    if len(candidate_subclusters):
        # take the language subcluster belonging to the largest cluster overall
        candidate_subcluster = max(candidate_subclusters, key=lambda x: x[1])[0]
    else:
        candidate_subcluster = []
    # BUG FIX: the original returned the undefined name `candidate_seed_isbn`
    # (NameError on every call). Return the computed subcluster instead,
    # mirroring the inline logic in repick_seed_isbn.
    return candidate_subcluster
def report_on_seed_isbn(seed_isbn_result):
"""
return a dictionary interpreting the output of the seed isbn calculation
"""
s = seed_isbn_result
# what proportion of all the ISBNS does the largest cluster make of all the ISBNs
# x is an iterable of cluster lengths
dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
report = OrderedDict([
("seed isbn", s[0]),
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
for c in s[1]['lt_clusters_by_lang']]),
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
if s[0] is not None else None)
if s[0] is not None else None),
("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
])
return report
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
def reports_in_db(max=None):
"""
a generator of all the Gutenberg seed isbn calculations
"""
gluejar_db = GluejarDB()
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
for s in islice(gutenberg_done, max):
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
return json.dumps(obj)
def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    For the seedisbns, calculate the titles by looking each seed ISBN up in
    OpenLibrary.

    max_num: maximum number of rows to process (None = all)
    do: when True, write the looked-up title / title_error back to the db

    Yields (seed_isbn, title) tuples; title is None when the lookup failed.
    """
    db = GluejarDB()
    # only rows whose title and title_error are both still unset
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id). \
        filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()

    # query OpenLibrary in small batches
    page_size = 5
    for page in grouper(islice(titles_to_calc, max_num), page_size):
        query = list(izip([edition.seed_isbn for (edition, lang, gt_title) in page], repeat('isbn')))
        try:
            res = OpenLibrary.read(query)
        except Exception as e:
            # BUG FIX: the original printed the error and then fell through to
            # use `res`, which is undefined on the first page (NameError) and
            # stale on later pages; skip this page instead.
            print(e)
            continue
        for (edition, lang, gt_title) in page:
            title_error = None
            try:
                # dig the title out of the OpenLibrary read response
                title = res.get('isbn:{0}'.format(edition.seed_isbn))['records'].values()[0]['data']['title']
            except Exception as e:
                title = None
                title_error = str(e)
            if do and title is not None:
                edition.title = title
                edition.title_error = title_error
                db.commit_db()
            yield (edition.seed_isbn, title)
def repick_seed_isbn(max_num=None, do=False, print_progress=False):
"""
Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible
"""
gluejar_db = GluejarDB()
gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
# need to join with GutenbergText table to get lang and Gutenberg title
for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
# calculate the dominant cluster
results = json.loads(s.results)
candidate_subclusters = filter(lambda x: x[0] is not None,
[(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
)
# remember that the cluster is the first element in the tuple and a length in the 2nd element
if len(candidate_subclusters):
candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
else:
candidate_subcluster = set([])
# confirm that the current seed isbn is in the candidate subcluster
current_seed_ok = s.seed_isbn in candidate_subcluster
# see whether we can get a seed isbn that, in addition to LibraryThing,
# is recognized by OpenLibrary and Freebase too...2nd priority
# is just OL, 3rd is Freebase and the 4th) just LT
fb_isbns = set(results[1]['fb_isbns'])
ol_isbns = set(results[1]['ol_isbns'])
seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
(candidate_subcluster & fb_isbns) or candidate_subcluster
new_seed_isbn = None
if do and len(seeds):
new_seed_isbn = seeds.pop()
s.seed_isbn = new_seed_isbn
gluejar_db.commit_db()
if print_progress:
print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
class FreebaseClient(object):
def __init__(self, username=None, password=None, main_or_sandbox='main'):
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
#unittest.main()
print list(gutenberg_and_seed_isbn(max=10))
#print list(gutenberg_and_seed_isbn(max=10))
print list(repick_seed_isbn(10))
#suites = suite()
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))