Code to repick the seed ISBN, to find ISBNs that are more likely to be found in a wide variety of data sources
parent
f7220d9812
commit
86fb15b8bc
|
@ -19,7 +19,7 @@ from urllib import urlencode
|
|||
from pprint import pprint
|
||||
from collections import defaultdict, OrderedDict
|
||||
|
||||
from itertools import islice, chain, izip
|
||||
from itertools import islice, chain, izip, repeat
|
||||
import operator
|
||||
import time
|
||||
|
||||
|
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
|
|||
if len(page) == page_size:
|
||||
yield page
|
||||
page= []
|
||||
yield page
|
||||
if len(page):
|
||||
yield page
|
||||
|
||||
def singleton(cls):
|
||||
instances = {}
|
||||
|
@ -103,6 +104,8 @@ class SeedISBN(Base):
|
|||
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
|
||||
results = Column(u'results', MEDIUMTEXT())
|
||||
seed_isbn = Column(u'seed_isbn', String(length=13))
|
||||
title = Column(u'title', Text())
|
||||
title_error = Column(u'title_error', Text())
|
||||
|
||||
|
||||
class GutenbergText(object):
|
||||
|
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
|
|||
'len_all_isbns': len(all_isbns)}
|
||||
return (candidate_seed_isbn, details)
|
||||
|
||||
def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Boil the candidate down to the list of ISBNs to pick a seed from.

    lang: language code to look for (e.g. 'en')
    lt_clusters_by_lang: iterable of dicts, one per LibraryThing cluster,
        mapping language code -> list of ISBNs in that language subcluster

    Returns the `lang` subcluster (a list of ISBNs) taken from the largest
    cluster -- measured by total ISBNs across all of its language
    subclusters -- that has a subcluster for `lang`.  Returns an empty list
    when no cluster has the requested language.
    """
    # (subcluster-for-lang, total-cluster-size) for clusters that have `lang`
    candidate_subclusters = [
        (c.get(lang), sum(len(isbns) for isbns in c.values()))
        for c in lt_clusters_by_lang if c.get(lang) is not None
    ]

    if not candidate_subclusters:
        # no cluster has a subcluster in the requested language
        return []

    # BUG FIX: the original returned the undefined name `candidate_seed_isbn`
    # (a guaranteed NameError) and also kept the whole (subcluster, size)
    # tuple from max(); return just the subcluster -- [1] is only the ranking key.
    return max(candidate_subclusters, key=lambda pair: pair[1])[0]
|
||||
|
||||
def report_on_seed_isbn(seed_isbn_result):
|
||||
"""
|
||||
return a dictionary interpreting the output of the seed isbn calculation
|
||||
"""
|
||||
s = seed_isbn_result
|
||||
|
||||
# what proportion of all the ISBNS does the largest cluster make of all the ISBNs
|
||||
# x is an iterable of cluster lengths
|
||||
dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
|
||||
|
||||
report = OrderedDict([
|
||||
("seed isbn", s[0]),
|
||||
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
|
||||
|
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
|
|||
for c in s[1]['lt_clusters_by_lang']]),
|
||||
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
|
||||
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
|
||||
if s[0] is not None else None)
|
||||
if s[0] is not None else None),
|
||||
("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
|
||||
])
|
||||
return report
|
||||
|
||||
|
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
|
|||
|
||||
|
||||
def reports_in_db(max=None):
|
||||
|
||||
"""
|
||||
a generator of all the Gutenberg seed isbn calculations
|
||||
"""
|
||||
gluejar_db = GluejarDB()
|
||||
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
|
||||
for s in islice(gutenberg_done, max):
|
||||
|
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
|
|||
|
||||
return json.dumps(obj)
|
||||
|
||||
def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    Fill in SeedISBN.title by looking the seed ISBNs up in OpenLibrary.

    Generator: yields (seed_isbn, title) for each SeedISBN row that has
    neither a title nor a recorded title_error, querying OpenLibrary in
    pages of 5 ISBNs per request.  When `do` is True, successful lookups
    are written back to the row; `max_num` caps how many rows are processed.
    """
    db = GluejarDB()

    # rows whose title has not yet been calculated and has not errored before
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id). \
        filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()

    page_size = 5  # ISBNs per OpenLibrary.read() batch

    for page in grouper(islice(titles_to_calc, max_num), page_size):
        query = list(izip([edition.seed_isbn for (edition, lang, gt_title) in page], repeat('isbn')))
        try:
            res = OpenLibrary.read(query)
        except Exception as e:
            print(e)
            # BUG FIX: `res` used to be left unbound (NameError on the first
            # page) or stale from the previous page (titles attributed to the
            # wrong editions) after a failed read; fall back to an empty
            # result so every edition in this page records a lookup failure.
            res = {}

        for (edition, lang, gt_title) in page:
            title_error = None
            try:
                title = res.get('isbn:{0}'.format(edition.seed_isbn))['records'].values()[0]['data']['title']
            except Exception as e:
                title = None
                title_error = str(e)
            if do and title is not None:
                edition.title = title
                edition.title_error = title_error
                db.commit_db()  # commit only when we actually changed the row
            yield (edition.seed_isbn, title)
|
||||
|
||||
|
||||
def repick_seed_isbn(max_num=None, do=False, print_progress=False):
|
||||
"""
|
||||
Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible
|
||||
"""
|
||||
gluejar_db = GluejarDB()
|
||||
gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
|
||||
# need to join with GutenbergText table to get lang and Gutenberg title
|
||||
for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
|
||||
# calculate the dominant cluster
|
||||
results = json.loads(s.results)
|
||||
candidate_subclusters = filter(lambda x: x[0] is not None,
|
||||
[(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
|
||||
)
|
||||
|
||||
# remember that the cluster is the first element in the tuple and a length in the 2nd element
|
||||
if len(candidate_subclusters):
|
||||
candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
|
||||
else:
|
||||
candidate_subcluster = set([])
|
||||
|
||||
# confirm that the current seed isbn is in the candidate subcluster
|
||||
current_seed_ok = s.seed_isbn in candidate_subcluster
|
||||
|
||||
# see whether we can get a seed isbn that, in addition to LibraryThing,
|
||||
# is recognized by OpenLibrary and Freebase too...2nd priority
|
||||
# is just OL, 3rd is Freebase and the 4th) just LT
|
||||
fb_isbns = set(results[1]['fb_isbns'])
|
||||
ol_isbns = set(results[1]['ol_isbns'])
|
||||
|
||||
seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
|
||||
(candidate_subcluster & fb_isbns) or candidate_subcluster
|
||||
|
||||
new_seed_isbn = None
|
||||
|
||||
if do and len(seeds):
|
||||
new_seed_isbn = seeds.pop()
|
||||
s.seed_isbn = new_seed_isbn
|
||||
gluejar_db.commit_db()
|
||||
|
||||
if print_progress:
|
||||
print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
|
||||
yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
|
||||
|
||||
|
||||
class FreebaseClient(object):
|
||||
def __init__(self, username=None, password=None, main_or_sandbox='main'):
|
||||
|
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
|
|||
|
||||
#unittest.main()
|
||||
|
||||
print list(gutenberg_and_seed_isbn(max=10))
|
||||
|
||||
#print list(gutenberg_and_seed_isbn(max=10))
|
||||
|
||||
print list(repick_seed_isbn(10))
|
||||
|
||||
#suites = suite()
|
||||
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
|
||||
|
|
Loading…
Reference in New Issue