Code to repick the seed ISBN, preferring ISBNs that are more likely to be found in a wide variety of data sources
parent
f7220d9812
commit
86fb15b8bc
|
@ -19,7 +19,7 @@ from urllib import urlencode
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import defaultdict, OrderedDict
|
||||||
|
|
||||||
from itertools import islice, chain, izip
|
from itertools import islice, chain, izip, repeat
|
||||||
import operator
|
import operator
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -64,7 +64,8 @@ def grouper(iterable, page_size):
|
||||||
if len(page) == page_size:
|
if len(page) == page_size:
|
||||||
yield page
|
yield page
|
||||||
page= []
|
page= []
|
||||||
yield page
|
if len(page):
|
||||||
|
yield page
|
||||||
|
|
||||||
def singleton(cls):
|
def singleton(cls):
|
||||||
instances = {}
|
instances = {}
|
||||||
|
@ -103,6 +104,8 @@ class SeedISBN(Base):
|
||||||
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
|
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
|
||||||
results = Column(u'results', MEDIUMTEXT())
|
results = Column(u'results', MEDIUMTEXT())
|
||||||
seed_isbn = Column(u'seed_isbn', String(length=13))
|
seed_isbn = Column(u'seed_isbn', String(length=13))
|
||||||
|
title = Column(u'title', Text())
|
||||||
|
title_error = Column(u'title_error', Text())
|
||||||
|
|
||||||
|
|
||||||
class GutenbergText(object):
|
class GutenbergText(object):
|
||||||
|
@ -710,8 +713,34 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
|
||||||
'len_all_isbns': len(all_isbns)}
|
'len_all_isbns': len(all_isbns)}
|
||||||
return (candidate_seed_isbn, details)
|
return (candidate_seed_isbn, details)
|
||||||
|
|
||||||
|
def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Return the ISBNs of the `lang` subcluster that belongs to the largest
    cluster having a subcluster for that language.

    lang -- language key used to look up a subcluster in each cluster
    lt_clusters_by_lang -- iterable of dicts, one per cluster, each mapping
        a language key to the collection of ISBNs in that language

    The size of a cluster is the total number of ISBNs over all of its
    language subclusters.  If several clusters tie for the largest size, the
    first one encountered wins.  Returns an empty list when no cluster has a
    subcluster for `lang`.

    Boiling the subcluster down to a single seed ISBN (for instance one that
    also overlaps with Freebase and OpenLibrary) is left to the caller.
    """
    # pair each matching language subcluster with the size of its whole cluster
    candidates = [
        (cluster[lang], sum(len(isbns) for isbns in cluster.values()))
        for cluster in lt_clusters_by_lang
        if cluster.get(lang) is not None
    ]

    if not candidates:
        return []

    # element 0 of the winning pair is the subcluster itself; element 1 is the size
    return max(candidates, key=lambda pair: pair[1])[0]
|
||||||
|
|
||||||
def report_on_seed_isbn(seed_isbn_result):
|
def report_on_seed_isbn(seed_isbn_result):
|
||||||
|
"""
|
||||||
|
return a dictionary interpreting the output of the seed isbn calculation
|
||||||
|
"""
|
||||||
s = seed_isbn_result
|
s = seed_isbn_result
|
||||||
|
|
||||||
|
# proportion of all the ISBNs that fall in the largest cluster
|
||||||
|
# x is an iterable of cluster lengths
|
||||||
|
dominance = lambda x: float(max(x))/float(sum(x)) if len(x) else None
|
||||||
|
|
||||||
report = OrderedDict([
|
report = OrderedDict([
|
||||||
("seed isbn", s[0]),
|
("seed isbn", s[0]),
|
||||||
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
|
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
|
||||||
|
@ -730,7 +759,8 @@ def report_on_seed_isbn(seed_isbn_result):
|
||||||
for c in s[1]['lt_clusters_by_lang']]),
|
for c in s[1]['lt_clusters_by_lang']]),
|
||||||
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
|
("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
|
||||||
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
|
reduce(operator.add , [c.values() for c in s[1]['lt_clusters_by_lang']]))[0]) \
|
||||||
if s[0] is not None else None)
|
if s[0] is not None else None),
|
||||||
|
("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
|
||||||
])
|
])
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
@ -813,7 +843,9 @@ def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3)
|
||||||
|
|
||||||
|
|
||||||
def reports_in_db(max=None):
|
def reports_in_db(max=None):
|
||||||
|
"""
|
||||||
|
a generator of all the Gutenberg seed isbn calculations
|
||||||
|
"""
|
||||||
gluejar_db = GluejarDB()
|
gluejar_db = GluejarDB()
|
||||||
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
|
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
|
||||||
for s in islice(gutenberg_done, max):
|
for s in islice(gutenberg_done, max):
|
||||||
|
@ -874,6 +906,84 @@ def export_to_json(obj, max=None,fname=None):
|
||||||
|
|
||||||
return json.dumps(obj)
|
return json.dumps(obj)
|
||||||
|
|
||||||
|
def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    For the seed ISBNs that have neither a title nor a title error recorded,
    look up a title for each one through OpenLibrary.read.

    max_num -- maximum number of rows to process (None means all of them)
    do -- if True, store every title that was found on its SeedISBN row and
          commit; if False, nothing is written to the database

    Generator yielding (seed_isbn, title) pairs; title is None when no title
    could be extracted for that ISBN.
    """
    db = GluejarDB()

    # rows where title is NULL and title_error is NULL, joined with
    # GutenbergText to pick up the language and the Gutenberg title
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id). \
        filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()

    # number of ISBNs sent in a single OpenLibrary.read call
    page_size = 5

    for page in grouper(islice(titles_to_calc, max_num), page_size):
        # OpenLibrary.read is given (value, 'isbn') pairs
        query = [(edition.seed_isbn, 'isbn') for (edition, lang, gt_title) in page]
        try:
            res = OpenLibrary.read(query)
        except Exception as e:
            print(e)
            # a failed read must not leave `res` unbound (first page) or
            # pointing at the previous page's response
            res = {}

        for (edition, lang, gt_title) in page:
            title_error = None
            try:
                records = res.get('isbn:{0}'.format(edition.seed_isbn))['records']
                # take the title from the first record returned for this ISBN
                title = list(records.values())[0]['data']['title']
            except Exception as e:
                title = None
                title_error = str(e)
            if do and title is not None:
                edition.title = title
                # NOTE(review): title_error is always None on this path, so
                # lookup failures are never persisted -- confirm whether
                # failed rows were meant to be recorded as well
                edition.title_error = title_error
                db.commit_db()
            yield (edition.seed_isbn, title)
|
||||||
|
|
||||||
|
|
||||||
|
def repick_seed_isbn(max_num=None, do=False, print_progress=False):
|
||||||
|
"""
|
||||||
|
Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and Librarything if possible
|
||||||
|
"""
|
||||||
|
gluejar_db = GluejarDB()
|
||||||
|
gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id==GutenbergText.etext_id).all()
|
||||||
|
# need to join with GutenbergText table to get lang and Gutenberg title
|
||||||
|
for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
|
||||||
|
# calculate the dominant cluster
|
||||||
|
results = json.loads(s.results)
|
||||||
|
candidate_subclusters = filter(lambda x: x[0] is not None,
|
||||||
|
[(c.get(lang), len(reduce(operator.add,c.values()))) for c in results[1]['lt_clusters_by_lang']]
|
||||||
|
)
|
||||||
|
|
||||||
|
# remember that the cluster is the first element in the tuple and a length in the 2nd element
|
||||||
|
if len(candidate_subclusters):
|
||||||
|
candidate_subcluster = set(max(candidate_subclusters, key=lambda x:x[1])[0])
|
||||||
|
else:
|
||||||
|
candidate_subcluster = set([])
|
||||||
|
|
||||||
|
# confirm that the current seed isbn is in the candidate subcluster
|
||||||
|
current_seed_ok = s.seed_isbn in candidate_subcluster
|
||||||
|
|
||||||
|
# see whether we can get a seed isbn that, in addition to LibraryThing,
|
||||||
|
# is recognized by OpenLibrary and Freebase too...2nd priority
|
||||||
|
# is just OL, 3rd is Freebase and the 4th) just LT
|
||||||
|
fb_isbns = set(results[1]['fb_isbns'])
|
||||||
|
ol_isbns = set(results[1]['ol_isbns'])
|
||||||
|
|
||||||
|
seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
|
||||||
|
(candidate_subcluster & fb_isbns) or candidate_subcluster
|
||||||
|
|
||||||
|
new_seed_isbn = None
|
||||||
|
|
||||||
|
if do and len(seeds):
|
||||||
|
new_seed_isbn = seeds.pop()
|
||||||
|
s.seed_isbn = new_seed_isbn
|
||||||
|
gluejar_db.commit_db()
|
||||||
|
|
||||||
|
if print_progress:
|
||||||
|
print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
|
||||||
|
yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
|
||||||
|
|
||||||
|
|
||||||
class FreebaseClient(object):
|
class FreebaseClient(object):
|
||||||
def __init__(self, username=None, password=None, main_or_sandbox='main'):
|
def __init__(self, username=None, password=None, main_or_sandbox='main'):
|
||||||
|
@ -1123,8 +1233,9 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
#unittest.main()
|
#unittest.main()
|
||||||
|
|
||||||
print list(gutenberg_and_seed_isbn(max=10))
|
#print list(gutenberg_and_seed_isbn(max=10))
|
||||||
|
|
||||||
|
print list(repick_seed_isbn(10))
|
||||||
|
|
||||||
#suites = suite()
|
#suites = suite()
|
||||||
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
|
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
|
||||||
|
|
Loading…
Reference in New Issue