Code that I'm now running in quasi-production on my laptop to compute the seed isbn. Let's see how it goes
parent
b5c663f82f
commit
cfc3dd3549
|
@ -34,7 +34,7 @@ import freebase
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
GOOGLE_BOOKS_KEY = "AIzaSyCewoH_s2LmrxWD5XNwed3izNnA3dUqMlo"
|
GOOGLE_BOOKS_KEY = "AIzaSyDsrHCUsUFNAf65cFPSF8MZTKj8C9oMuj8"
|
||||||
|
|
||||||
MASHUPBOOK_ISBN_13 = '9781590598580'
|
MASHUPBOOK_ISBN_13 = '9781590598580'
|
||||||
MASHUPBOOK_ISBN_10 = '159059858X'
|
MASHUPBOOK_ISBN_10 = '159059858X'
|
||||||
|
@ -84,7 +84,7 @@ def thingisbn(isbn):
|
||||||
"""given an ISBN return a list of related edition ISBNs, according to
|
"""given an ISBN return a list of related edition ISBNs, according to
|
||||||
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, which come back as isbn_13')
|
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, which come back as isbn_13')
|
||||||
"""
|
"""
|
||||||
logger.info("looking up %s at ThingISBN" , isbn)
|
logger.debug("looking up %s at ThingISBN" , isbn)
|
||||||
url = "http://www.librarything.com/api/thingISBN/%s" % isbn
|
url = "http://www.librarything.com/api/thingISBN/%s" % isbn
|
||||||
xml = requests.get(url, headers={"User-Agent": USER_AGENT}).content
|
xml = requests.get(url, headers={"User-Agent": USER_AGENT}).content
|
||||||
doc = ElementTree.fromstring(xml)
|
doc = ElementTree.fromstring(xml)
|
||||||
|
@ -325,20 +325,21 @@ class OpenLibrary(object):
|
||||||
return None
|
return None
|
||||||
@classmethod
|
@classmethod
|
||||||
def xisbn(cls,isbn_val=None, work_id=None, page_size=5):
|
def xisbn(cls,isbn_val=None, work_id=None, page_size=5):
|
||||||
|
logger.debug("isbn_val, work_id, page_size: %s %s %d", isbn_val, work_id, page_size)
|
||||||
isbns = set()
|
isbns = set()
|
||||||
|
|
||||||
if isbn_val is None and work_id is None:
|
if isbn_val is None and work_id is None:
|
||||||
raise Exception("One of isbn or work_id must be specified")
|
raise Exception("One of isbn or work_id must be specified")
|
||||||
elif isbn_val is not None and work_id is not None:
|
elif isbn_val is not None and work_id is not None:
|
||||||
raise Exception("Only only of isbn or work_id can be specified")
|
raise Exception("Only one of isbn or work_id can be specified")
|
||||||
|
|
||||||
if isbn_val is not None:
|
if isbn_val is not None:
|
||||||
# figure out the work_id and then pass back all the ISBNs from the manifestations of the work
|
# figure out the work_id and then pass back all the ISBNs from the manifestations of the work
|
||||||
try:
|
try:
|
||||||
isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
|
isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
|
||||||
isbns.add(isbn_val)
|
if isbn_val is not None:
|
||||||
yield isbn_val
|
isbns.add(isbn_val)
|
||||||
|
yield isbn_val
|
||||||
|
|
||||||
work_ids = list(cls.works([(isbn_val,'isbn')]))
|
work_ids = list(cls.works([(isbn_val,'isbn')]))
|
||||||
if len(work_ids):
|
if len(work_ids):
|
||||||
|
@ -363,7 +364,7 @@ class OpenLibrary(object):
|
||||||
if isbn:
|
if isbn:
|
||||||
try:
|
try:
|
||||||
isbn = isbn_mod.ISBN(isbn).to_string('13')
|
isbn = isbn_mod.ISBN(isbn).to_string('13')
|
||||||
if isbn not in isbns:
|
if isbn is not None and isbn not in isbns:
|
||||||
isbns.add(isbn)
|
isbns.add(isbn)
|
||||||
yield isbn
|
yield isbn
|
||||||
except isbn_mod.ISBNException:
|
except isbn_mod.ISBNException:
|
||||||
|
|
|
@ -27,12 +27,14 @@ import re
|
||||||
from itertools import islice, izip
|
from itertools import islice, izip
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
import json
|
||||||
|
|
||||||
from google.refine import refine
|
from google.refine import refine
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text, Sequence, Boolean, not_, and_, DateTime
|
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text, Sequence, Boolean, not_, and_, DateTime
|
||||||
|
from sqlalchemy.dialects.mysql import MEDIUMTEXT
|
||||||
from sqlalchemy.orm import mapper, sessionmaker
|
from sqlalchemy.orm import mapper, sessionmaker
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
@ -90,19 +92,18 @@ def get_or_create(session, model, defaults=None, **kwargs):
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
class SeedISBN(object):
|
class SeedISBN(Base):
|
||||||
"""
|
|
||||||
CREATE TABLE `SeedISBN` (
|
__tablename__ = 'SeedISBN'
|
||||||
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
__table_args__ = {'mysql_engine':'InnoDB'}
|
||||||
`gutenberg_etext_id` int(11) DEFAULT NULL,
|
|
||||||
`seed_isbn` char(13) DEFAULT NULL,
|
#column definitions
|
||||||
`results` mediumtext,
|
calculated = Column(u'calculated', DateTime, default=datetime.utcnow())
|
||||||
`calculated` timestamp NULL DEFAULT NULL,
|
error = Column(u'error', Text())
|
||||||
`error` text,
|
gutenberg_etext_id = Column(u'gutenberg_etext_id', Integer(11), index=True)
|
||||||
PRIMARY KEY (`id`)
|
id = Column(u'id', Integer(11), primary_key=True, nullable=False)
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
results = Column(u'results', MEDIUMTEXT())
|
||||||
"""
|
seed_isbn = Column(u'seed_isbn', String(length=13))
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class GutenbergText(object):
|
class GutenbergText(object):
|
||||||
|
@ -201,24 +202,36 @@ class MappingError(Base):
|
||||||
@singleton
|
@singleton
|
||||||
class GluejarDB(object):
|
class GluejarDB(object):
|
||||||
def __init__(self, user="gluejar", pw="gluejar", db="Gluejar", host="127.0.0.1", port=3306):
|
def __init__(self, user="gluejar", pw="gluejar", db="Gluejar", host="127.0.0.1", port=3306):
|
||||||
mysql_connect_path = "mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8" % (user,pw,host,port,db)
|
self.mysql_connect_path = "mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8" % (user,pw,host,port,db)
|
||||||
engine = create_engine(mysql_connect_path, echo=False)
|
self.engine = create_engine(self.mysql_connect_path, echo=False)
|
||||||
|
|
||||||
metadata = MetaData(engine)
|
self.metadata = MetaData(self.engine)
|
||||||
Base.metadata.create_all(engine)
|
Base.metadata.create_all(self.engine)
|
||||||
|
|
||||||
gutenbergtext = Table('GutenbergText', metadata, autoload=True)
|
gutenbergtext = Table('GutenbergText', self.metadata, autoload=True)
|
||||||
mapper(GutenbergText, gutenbergtext)
|
mapper(GutenbergText, gutenbergtext)
|
||||||
|
|
||||||
gutenbergfile = Table('GutenbergFile', metadata, autoload=True)
|
gutenbergfile = Table('GutenbergFile', self.metadata, autoload=True)
|
||||||
mapper(GutenbergFile, gutenbergfile)
|
mapper(GutenbergFile, gutenbergfile)
|
||||||
|
|
||||||
seedisbn = Table('SeedISBN', metadata, autoload=True)
|
#seedisbn = Table('SeedISBN', self.metadata, autoload=True)
|
||||||
mapper(SeedISBN, seedisbn)
|
#mapper(SeedISBN, seedisbn)
|
||||||
|
|
||||||
Session = sessionmaker(bind=engine)
|
Session = sessionmaker(bind=self.engine)
|
||||||
session = Session()
|
session = Session()
|
||||||
self.session = session
|
self.session = session
|
||||||
|
def _reflect(self):
|
||||||
|
for table in self.metadata.tables.values():
|
||||||
|
print """
|
||||||
|
class %s(Base):
|
||||||
|
__table__ = Table(%r, Base.metadata, autoload=True)
|
||||||
|
|
||||||
|
""" % (table.name, table.name)
|
||||||
|
def _sqlautocode(self):
|
||||||
|
"""
|
||||||
|
spit out some code to help us run sqlautocode
|
||||||
|
"""
|
||||||
|
return "sqlautocode -o model.py %s" % (self.mysql_connect_path)
|
||||||
def commit_db(self):
|
def commit_db(self):
|
||||||
self.session.commit()
|
self.session.commit()
|
||||||
def rollback(self):
|
def rollback(self):
|
||||||
|
@ -610,8 +623,8 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
|
||||||
fb = FreebaseBooks()
|
fb = FreebaseBooks()
|
||||||
gb = GoogleBooks(key=GOOGLE_BOOKS_KEY)
|
gb = GoogleBooks(key=GOOGLE_BOOKS_KEY)
|
||||||
|
|
||||||
fb_isbn_set = reduce(operator.or_,[set(fb.xisbn(book_id=freebase_id)) for freebase_id in freebase_ids])
|
fb_isbn_set = reduce(operator.or_,[set(fb.xisbn(book_id=freebase_id)) for freebase_id in freebase_ids]) if len(freebase_ids) else set()
|
||||||
ol_isbn_set = reduce(operator.or_,[set(OpenLibrary.xisbn(work_id=olwk_id)) for olwk_id in olwk_ids])
|
ol_isbn_set = reduce(operator.or_,[set(OpenLibrary.xisbn(work_id=olwk_id)) for olwk_id in olwk_ids]) if len(olwk_ids) else set()
|
||||||
|
|
||||||
#lt_isbn_set = set(map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(SURFACING_ISBN)))
|
#lt_isbn_set = set(map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(SURFACING_ISBN)))
|
||||||
|
|
||||||
|
@ -645,13 +658,13 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
|
||||||
logger.debug("unrecognized by LT %s %d", lt_unrecognized, len(lt_unrecognized))
|
logger.debug("unrecognized by LT %s %d", lt_unrecognized, len(lt_unrecognized))
|
||||||
|
|
||||||
# figure out new ISBNs found by LT
|
# figure out new ISBNs found by LT
|
||||||
new_isbns = (reduce(operator.or_,lt_clusters) | lt_unrecognized) - (fb_isbn_set | ol_isbn_set)
|
new_isbns = ((reduce(operator.or_,lt_clusters) if len(lt_clusters) else set())| lt_unrecognized) - (fb_isbn_set | ol_isbn_set)
|
||||||
logger.debug( "new isbns from LT %s %d", new_isbns, len(new_isbns))
|
logger.debug( "new isbns from LT %s %d", new_isbns, len(new_isbns))
|
||||||
|
|
||||||
gbooks_data = {}
|
gbooks_data = {}
|
||||||
|
|
||||||
# then pass to Google books to get info, including language
|
# then pass to Google books to get info, including language
|
||||||
all_isbns = (reduce(operator.or_,lt_clusters) | lt_unrecognized)
|
all_isbns = ((reduce(operator.or_,lt_clusters) if len(lt_clusters) else set()) | lt_unrecognized)
|
||||||
for (i, isbn) in enumerate(all_isbns):
|
for (i, isbn) in enumerate(all_isbns):
|
||||||
gbooks_data[isbn] = gb.isbn(isbn)
|
gbooks_data[isbn] = gb.isbn(isbn)
|
||||||
logger.debug("%d %s %s", i, isbn, gbooks_data[isbn])
|
logger.debug("%d %s %s", i, isbn, gbooks_data[isbn])
|
||||||
|
@ -703,7 +716,7 @@ def report_on_seed_isbn(seed_isbn_result):
|
||||||
("seed isbn", s[0]),
|
("seed isbn", s[0]),
|
||||||
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
|
("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
|
||||||
("lang", s[1]['lang']),
|
("lang", s[1]['lang']),
|
||||||
("Freebase ids", s[1]['fb_isbns']),
|
("Freebase ids", s[1]['freebase_ids']),
|
||||||
("number of OL ids", len(s[1]['olwk_ids'])),
|
("number of OL ids", len(s[1]['olwk_ids'])),
|
||||||
("total number of ISBNs from pooling FB + OL + LT", s[1]['len_all_isbns']),
|
("total number of ISBNs from pooling FB + OL + LT", s[1]['len_all_isbns']),
|
||||||
("number of FB isbns", len(s[1]['fb_isbns'])),
|
("number of FB isbns", len(s[1]['fb_isbns'])),
|
||||||
|
@ -725,33 +738,93 @@ def surfacing_seed_isbn():
|
||||||
SURFACING_WORK_OLID = 'OL675829W'
|
SURFACING_WORK_OLID = 'OL675829W'
|
||||||
surfacing_fb_id = '/m/05p_vg'
|
surfacing_fb_id = '/m/05p_vg'
|
||||||
book_isbn = '9780446311076'
|
book_isbn = '9780446311076'
|
||||||
return seed_isbn(olwk_ids=(SURFACING_WORK_OLID,), freebase_ids=(surfacing_fb_id,))
|
return seed_isbn(olwk_ids=(SURFACING_WORK_OLID,), freebase_ids=(surfacing_fb_id,), lang='en')
|
||||||
|
|
||||||
def ry_mashups_seed_isbn():
|
def ry_mashups_seed_isbn():
|
||||||
olid = "OL10306321W"
|
olid = "OL10306321W"
|
||||||
fb_id = "/en/pro_web_2_0_mashups_remixing_data_and_web_services"
|
fb_id = "/en/pro_web_2_0_mashups_remixing_data_and_web_services"
|
||||||
return seed_isbn(olwk_ids=(olid,), freebase_ids=(fb_id,))
|
return seed_isbn(olwk_ids=(olid,), freebase_ids=(fb_id,), lang='en')
|
||||||
|
|
||||||
def moby_dick_seed_isbn():
|
def moby_dick_seed_isbn():
|
||||||
return seed_isbn(olwk_ids=('OL102749W',), freebase_ids=('/en/moby-dick',))
|
return seed_isbn(olwk_ids=('OL102749W',), freebase_ids=('/en/moby-dick',), lang='en')
|
||||||
|
|
||||||
|
def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3):
|
||||||
|
|
||||||
def calc_seed_isbns(ids=None, max=None, override=False):
|
|
||||||
# if ids specified, work through them
|
# if ids specified, work through them
|
||||||
# loop through all Gutenberg ids, see whethether the seed_isbn has been calculated -- and if not, do so.
|
# loop through all Gutenberg ids, see whethether the seed_isbn has been calculated -- and if not, do so.
|
||||||
|
|
||||||
# collate all the ol work ids for a given gutenberg id
|
current_error_count = 0
|
||||||
|
|
||||||
gluejar_db = GluejarDB()
|
gluejar_db = GluejarDB()
|
||||||
gutenberg_done = set([gluejar_db.session.query(SeedISBN.gutenberg_etext_id).all()])
|
|
||||||
|
|
||||||
|
# pull out a set of Gutenberg text ids that already in the SeedISBN table so that we have the option of
|
||||||
|
# not recalculating those Gutenberg texts
|
||||||
|
gutenberg_done = set(map(lambda x: x[0], gluejar_db.session.query(SeedISBN.gutenberg_etext_id).all()))
|
||||||
|
logger.debug("gutenberg_done %s", gutenberg_done )
|
||||||
|
|
||||||
|
# collate all the ol work ids and Freebase ids for a given gutenberg id
|
||||||
if ids is None:
|
if ids is None:
|
||||||
gutenberg_with_ol = defaultdict(set)
|
g_ids = set()
|
||||||
for mapping in gutenberg_to_ol_mapping(max=max):
|
ol_ids = defaultdict(set)
|
||||||
logger.debug(mapping)
|
fb_ids = defaultdict(set)
|
||||||
gutenberg_with_ol[mapping["gutenberg_etext_id"]].add(mapping["olid"])
|
lang = {}
|
||||||
ids = gutenberg_
|
for mapping in gutenberg_to_ol_mapping():
|
||||||
|
g_id = mapping["gutenberg_etext_id"]
|
||||||
|
g_ids.add(g_id)
|
||||||
|
ol_ids[g_id].add(mapping["olid"])
|
||||||
|
fb_ids[g_id].add(mapping["freebase_id"])
|
||||||
|
lang[g_id] = mapping["lang"]
|
||||||
|
logger.debug("len(g_ids): %d", len(g_ids))
|
||||||
|
# turn the mapping into a series of tuples that can be fed to seed_isbn
|
||||||
|
if not override:
|
||||||
|
logger.debug("len(g_ids) before subtracting gutenberg_done: %d", len(g_ids))
|
||||||
|
logger.debug("len(gutenberg_done): %d", len(gutenberg_done))
|
||||||
|
g_ids -= gutenberg_done
|
||||||
|
logger.debug("len(g_ids) after subtracting gutenberg_done: %d", len(g_ids))
|
||||||
|
|
||||||
|
ids = [(g_id, tuple(ol_ids[g_id]), tuple(fb_ids[g_id]), lang[g_id]) for g_id in g_ids]
|
||||||
|
logger.debug("len(ids): %d", len(ids))
|
||||||
|
|
||||||
|
for (i, work_id) in enumerate(islice(ids, max)):
|
||||||
|
if current_error_count >= max_consecutive_error:
|
||||||
|
break
|
||||||
|
(g_id, args) = (work_id[0], work_id[1:])
|
||||||
|
logger.info("i, g_id, args: %d %s %s", i, g_id, args)
|
||||||
|
(seed, created) = get_or_create(gluejar_db.session, SeedISBN, gutenberg_etext_id=g_id)
|
||||||
|
try:
|
||||||
|
s = seed_isbn(*args)
|
||||||
|
seed.calculated = datetime.utcnow()
|
||||||
|
seed.seed_isbn = s[0]
|
||||||
|
seed.error = None
|
||||||
|
seed.results = json.dumps(s)
|
||||||
|
current_error_count = 0
|
||||||
|
yield (g_id, s)
|
||||||
|
except Exception, e:
|
||||||
|
current_error_count += 1
|
||||||
|
seed.seed_isbn = None
|
||||||
|
seed.calculated = datetime.utcnow()
|
||||||
|
seed.error = str(e)
|
||||||
|
seed.results = None
|
||||||
|
logger.warning(str(e))
|
||||||
|
yield (g_id, e)
|
||||||
|
finally:
|
||||||
|
gluejar_db.commit_db()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reports_in_db(max=None):
|
||||||
|
|
||||||
|
gluejar_db = GluejarDB()
|
||||||
|
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
|
||||||
|
for s in islice(gutenberg_done, max):
|
||||||
|
yield report_on_seed_isbn(json.loads(s.results))
|
||||||
|
|
||||||
|
def results_in_db(max=None):
|
||||||
|
gluejar_db = GluejarDB()
|
||||||
|
gutenberg_done = gluejar_db.session.query(SeedISBN).all()
|
||||||
|
for s in islice(gutenberg_done, max):
|
||||||
|
yield json.loads(s.results)
|
||||||
|
|
||||||
return gutenberg_with_ol
|
|
||||||
|
|
||||||
class FreebaseClient(object):
|
class FreebaseClient(object):
|
||||||
def __init__(self, username=None, password=None, main_or_sandbox='main'):
|
def __init__(self, username=None, password=None, main_or_sandbox='main'):
|
||||||
|
@ -871,6 +944,9 @@ class DatabaseTest(unittest.TestCase):
|
||||||
|
|
||||||
class ChainTest(unittest.TestCase):
|
class ChainTest(unittest.TestCase):
|
||||||
def test_chain(self):
|
def test_chain(self):
|
||||||
|
"""
|
||||||
|
Make sure that I (RY) understoo that itertools.ichain worked by actually chaining together a series of iterators into 1
|
||||||
|
"""
|
||||||
self.assertTrue(True)
|
self.assertTrue(True)
|
||||||
max = None
|
max = None
|
||||||
sizes = [5, 8, 9]
|
sizes = [5, 8, 9]
|
||||||
|
@ -997,9 +1073,14 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
#unittest.main()
|
#unittest.main()
|
||||||
|
|
||||||
calc_seed_isbns()
|
for (i,s) in enumerate(calc_seed_isbns(max=100)):
|
||||||
|
try:
|
||||||
|
print i, report_on_seed_isbn(s[1])
|
||||||
|
except Exception, e:
|
||||||
|
print i, e
|
||||||
|
|
||||||
suites = suite()
|
|
||||||
|
#suites = suite()
|
||||||
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
|
#suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
|
||||||
#unittest.TextTestRunner().run(suites)
|
#unittest.TextTestRunner().run(suites)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue