Code that I'm now running in quasi-production on my laptop to compute the seed ISBN. Let's see how it goes.

2012-02-10 19:15:35 -08:00 · 2012-02-10 19:15:35 -08:00 · cfc3dd3549
parent b5c663f82f
commit cfc3dd3549
2 changed files with 131 additions and 49 deletions
--- a/experimental/bookdata.py
+++ b/experimental/bookdata.py
@@ -34,7 +34,7 @@ import freebase
 import logging
 logger = logging.getLogger(__name__)

-GOOGLE_BOOKS_KEY = "AIzaSyCewoH_s2LmrxWD5XNwed3izNnA3dUqMlo"
+GOOGLE_BOOKS_KEY = "AIzaSyDsrHCUsUFNAf65cFPSF8MZTKj8C9oMuj8"

 MASHUPBOOK_ISBN_13 = '9781590598580'
 MASHUPBOOK_ISBN_10 = '159059858X'
@@ -84,7 +84,7 @@ def thingisbn(isbn):
    """given an ISBN return a list of related edition ISBNs, according to 
    Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, which come back as isbn_13)
    """
-    logger.info("looking up %s at ThingISBN" , isbn)
+    logger.debug("looking up %s at ThingISBN" , isbn)
    url = "http://www.librarything.com/api/thingISBN/%s" % isbn
    xml = requests.get(url, headers={"User-Agent": USER_AGENT}).content
    doc = ElementTree.fromstring(xml)
@@ -325,20 +325,21 @@ class OpenLibrary(object):
            return None
    @classmethod
    def xisbn(cls,isbn_val=None, work_id=None, page_size=5):
-        
+        logger.debug("isbn_val, work_id, page_size: %s %s %d", isbn_val, work_id, page_size)
        isbns = set()
        
        if isbn_val is None and work_id is None:
            raise Exception("One of isbn or work_id must be specified")
        elif isbn_val is not None and work_id is not None:
-            raise Exception("Only only of isbn or work_id can be specified")
+            raise Exception("Only one of isbn or work_id can be specified")
            
        if isbn_val is not None:
            # figure out the work_id and then pass back all the ISBNs from the manifestations of the work
            try:
                isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
-                isbns.add(isbn_val)
-                yield isbn_val
+                if isbn_val is not None:
+                    isbns.add(isbn_val)
+                    yield isbn_val
                
                work_ids = list(cls.works([(isbn_val,'isbn')]))
                if len(work_ids):
@@ -363,7 +364,7 @@ class OpenLibrary(object):
                if isbn:
                    try:
                        isbn = isbn_mod.ISBN(isbn).to_string('13')
-                        if isbn not in isbns:
+                        if isbn is not None and isbn not in isbns:
                            isbns.add(isbn)
                            yield isbn
                    except isbn_mod.ISBNException:
--- a/experimental/gutenberg/gutenberg.py
+++ b/experimental/gutenberg/gutenberg.py
@@ -27,12 +27,14 @@ import re
 from itertools import islice, izip
 import logging
 import random
+import json

 from google.refine import refine

 from datetime import datetime

 from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text, Sequence, Boolean, not_, and_, DateTime
+from sqlalchemy.dialects.mysql import MEDIUMTEXT
 from sqlalchemy.orm import mapper, sessionmaker
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.declarative import declarative_base
@@ -90,19 +92,18 @@ def get_or_create(session, model, defaults=None, **kwargs):

 Base = declarative_base()

-class SeedISBN(object):
-    """
-        CREATE TABLE `SeedISBN` (
-      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
-      `gutenberg_etext_id` int(11) DEFAULT NULL,
-      `seed_isbn` char(13) DEFAULT NULL,
-      `results` mediumtext,
-      `calculated` timestamp NULL DEFAULT NULL,
-      `error` text,
-      PRIMARY KEY (`id`)
-    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-    """
-    pass
+class SeedISBN(Base):
+    
+    __tablename__ = 'SeedISBN'
+    __table_args__ = {'mysql_engine':'InnoDB'} 
+
+    #column definitions
+    calculated = Column(u'calculated', DateTime, default=datetime.utcnow())
+    error = Column(u'error', Text())
+    gutenberg_etext_id = Column(u'gutenberg_etext_id', Integer(11), index=True)
+    id = Column(u'id', Integer(11), primary_key=True, nullable=False)
+    results = Column(u'results', MEDIUMTEXT())
+    seed_isbn = Column(u'seed_isbn', String(length=13))


 class GutenbergText(object):
@@ -201,24 +202,36 @@ class MappingError(Base):
@singleton
 class GluejarDB(object):
    def __init__(self, user="gluejar", pw="gluejar", db="Gluejar", host="127.0.0.1", port=3306):
-        mysql_connect_path = "mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8" % (user,pw,host,port,db)
-        engine = create_engine(mysql_connect_path, echo=False)
+        self.mysql_connect_path = "mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8" % (user,pw,host,port,db)
+        self.engine = create_engine(self.mysql_connect_path, echo=False)
      
-        metadata = MetaData(engine)
-        Base.metadata.create_all(engine) 
+        self.metadata = MetaData(self.engine)
+        Base.metadata.create_all(self.engine) 
        
-        gutenbergtext = Table('GutenbergText', metadata, autoload=True)
+        gutenbergtext = Table('GutenbergText', self.metadata, autoload=True)
        mapper(GutenbergText, gutenbergtext)

-        gutenbergfile = Table('GutenbergFile', metadata, autoload=True)
+        gutenbergfile = Table('GutenbergFile', self.metadata, autoload=True)
        mapper(GutenbergFile, gutenbergfile)
        
-        seedisbn = Table('SeedISBN', metadata, autoload=True)
-        mapper(SeedISBN, seedisbn)
+        #seedisbn = Table('SeedISBN', self.metadata, autoload=True)
+        #mapper(SeedISBN, seedisbn)
        
-        Session = sessionmaker(bind=engine)
+        Session = sessionmaker(bind=self.engine)
        session = Session()
        self.session = session
+    def _reflect(self):
+        for table in self.metadata.tables.values():
+            print """
+class %s(Base):
+    __table__ = Table(%r, Base.metadata, autoload=True)
+
+""" % (table.name, table.name)
+    def _sqlautocode(self):
+        """
+        spit out some code to help us run sqlautocode
+        """
+        return "sqlautocode -o model.py  %s" % (self.mysql_connect_path)
    def commit_db(self):
        self.session.commit()
    def rollback(self):
@@ -610,8 +623,8 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
    fb = FreebaseBooks()
    gb = GoogleBooks(key=GOOGLE_BOOKS_KEY)
    
-    fb_isbn_set = reduce(operator.or_,[set(fb.xisbn(book_id=freebase_id)) for freebase_id in freebase_ids])
-    ol_isbn_set = reduce(operator.or_,[set(OpenLibrary.xisbn(work_id=olwk_id)) for olwk_id in olwk_ids])
+    fb_isbn_set = reduce(operator.or_,[set(fb.xisbn(book_id=freebase_id)) for freebase_id in freebase_ids]) if len(freebase_ids) else set()
+    ol_isbn_set = reduce(operator.or_,[set(OpenLibrary.xisbn(work_id=olwk_id)) for olwk_id in olwk_ids]) if len(olwk_ids) else set()
    
    #lt_isbn_set = set(map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(SURFACING_ISBN)))
    
@@ -645,13 +658,13 @@ def seed_isbn(olwk_ids, freebase_ids, lang='en'):
    logger.debug("unrecognized by LT %s %d", lt_unrecognized, len(lt_unrecognized))
    
    # figure out new ISBNs found by LT
-    new_isbns = (reduce(operator.or_,lt_clusters) | lt_unrecognized) - (fb_isbn_set | ol_isbn_set)
+    new_isbns = ((reduce(operator.or_,lt_clusters) if len(lt_clusters) else set())| lt_unrecognized) - (fb_isbn_set | ol_isbn_set)
    logger.debug( "new isbns from LT %s %d", new_isbns, len(new_isbns))
        
    gbooks_data = {}
    
    # then pass to Google books to get info, including language
-    all_isbns = (reduce(operator.or_,lt_clusters) | lt_unrecognized)
+    all_isbns = ((reduce(operator.or_,lt_clusters) if len(lt_clusters) else set()) | lt_unrecognized)
    for (i, isbn) in enumerate(all_isbns):
        gbooks_data[isbn] = gb.isbn(isbn)
        logger.debug("%d %s %s", i, isbn, gbooks_data[isbn])
@@ -703,7 +716,7 @@ def report_on_seed_isbn(seed_isbn_result):
        ("seed isbn",  s[0]),
        ("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
        ("lang", s[1]['lang']),
-        ("Freebase ids", s[1]['fb_isbns']),
+        ("Freebase ids", s[1]['freebase_ids']),
        ("number of OL ids",  len(s[1]['olwk_ids'])),
        ("total number of ISBNs from pooling FB + OL + LT", s[1]['len_all_isbns']),
        ("number of FB isbns", len(s[1]['fb_isbns'])),
@@ -725,33 +738,93 @@ def surfacing_seed_isbn():
    SURFACING_WORK_OLID = 'OL675829W'
    surfacing_fb_id = '/m/05p_vg'
    book_isbn = '9780446311076'
-    return seed_isbn(olwk_ids=(SURFACING_WORK_OLID,), freebase_ids=(surfacing_fb_id,))    
+    return seed_isbn(olwk_ids=(SURFACING_WORK_OLID,), freebase_ids=(surfacing_fb_id,), lang='en')    
    
 def ry_mashups_seed_isbn():
    olid = "OL10306321W"
    fb_id = "/en/pro_web_2_0_mashups_remixing_data_and_web_services"
-    return seed_isbn(olwk_ids=(olid,), freebase_ids=(fb_id,))
+    return seed_isbn(olwk_ids=(olid,), freebase_ids=(fb_id,), lang='en')
    
 def moby_dick_seed_isbn():
-    return seed_isbn(olwk_ids=('OL102749W',), freebase_ids=('/en/moby-dick',))
+    return seed_isbn(olwk_ids=('OL102749W',), freebase_ids=('/en/moby-dick',), lang='en')
+
+def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3):

-def calc_seed_isbns(ids=None, max=None, override=False):
    # if ids specified, work through them
    # loop through all Gutenberg ids, see whether the seed_isbn has been calculated -- and if not, do so.

-    # collate all the ol work ids for a given gutenberg id
+    current_error_count = 0
    
    gluejar_db = GluejarDB()
-    gutenberg_done = set([gluejar_db.session.query(SeedISBN.gutenberg_etext_id).all()])
    
+    # pull out a set of Gutenberg text ids that are already in the SeedISBN table so that we have the option of
+    # not recalculating those Gutenberg texts
+    gutenberg_done = set(map(lambda x: x[0], gluejar_db.session.query(SeedISBN.gutenberg_etext_id).all()))
+    logger.debug("gutenberg_done %s", gutenberg_done )
+    
+    # collate all the OL work ids and Freebase ids for a given Gutenberg id
    if ids is None:
-        gutenberg_with_ol = defaultdict(set)
-        for mapping in gutenberg_to_ol_mapping(max=max):
-            logger.debug(mapping)
-            gutenberg_with_ol[mapping["gutenberg_etext_id"]].add(mapping["olid"])
-        ids = gutenberg_
+        g_ids = set()
+        ol_ids = defaultdict(set)
+        fb_ids = defaultdict(set)
+        lang = {}
+        for mapping in gutenberg_to_ol_mapping():
+            g_id = mapping["gutenberg_etext_id"]
+            g_ids.add(g_id)
+            ol_ids[g_id].add(mapping["olid"])
+            fb_ids[g_id].add(mapping["freebase_id"])
+            lang[g_id] = mapping["lang"]
+        logger.debug("len(g_ids): %d", len(g_ids))
+        # turn the mapping into a series of tuples that can be fed to seed_isbn
+        if not override:
+            logger.debug("len(g_ids) before subtracting gutenberg_done: %d", len(g_ids))
+            logger.debug("len(gutenberg_done): %d", len(gutenberg_done))
+            g_ids -= gutenberg_done
+            logger.debug("len(g_ids) after subtracting gutenberg_done: %d", len(g_ids))
+            
+        ids = [(g_id, tuple(ol_ids[g_id]), tuple(fb_ids[g_id]), lang[g_id]) for g_id in g_ids]
+        logger.debug("len(ids): %d", len(ids))
+        
+    for (i, work_id) in enumerate(islice(ids, max)):
+        if current_error_count >= max_consecutive_error:
+            break
+        (g_id, args) = (work_id[0], work_id[1:])
+        logger.info("i, g_id, args: %d %s %s", i, g_id, args)
+        (seed, created) = get_or_create(gluejar_db.session, SeedISBN, gutenberg_etext_id=g_id)
+        try:
+            s = seed_isbn(*args)
+            seed.calculated = datetime.utcnow()
+            seed.seed_isbn = s[0]
+            seed.error = None
+            seed.results = json.dumps(s)
+            current_error_count = 0
+            yield (g_id, s)
+        except Exception, e:
+            current_error_count += 1
+            seed.seed_isbn = None
+            seed.calculated = datetime.utcnow()
+            seed.error = str(e)
+            seed.results = None
+            logger.warning(str(e))
+            yield (g_id, e)
+        finally:
+            gluejar_db.commit_db() 
+
+            
+            
+def reports_in_db(max=None):
+    
+    gluejar_db = GluejarDB()
+    gutenberg_done = gluejar_db.session.query(SeedISBN).all()
+    for s in islice(gutenberg_done, max):
+        yield report_on_seed_isbn(json.loads(s.results))
+        
+def results_in_db(max=None):
+    gluejar_db = GluejarDB()
+    gutenberg_done = gluejar_db.session.query(SeedISBN).all()
+    for s in islice(gutenberg_done, max):
+        yield json.loads(s.results)    
    
-    return gutenberg_with_ol
    
 class FreebaseClient(object):
    def __init__(self, username=None, password=None, main_or_sandbox='main'):
@@ -871,6 +944,9 @@ class DatabaseTest(unittest.TestCase):

 class ChainTest(unittest.TestCase):
    def test_chain(self):
+        """
+        Make sure that I (RY) understood that itertools.chain works by actually chaining together a series of iterators into one
+        """
        self.assertTrue(True)
        max = None
        sizes = [5, 8, 9]
@@ -997,9 +1073,14 @@ if __name__ == '__main__':
    
    #unittest.main()

-    calc_seed_isbns()
+    for (i,s) in enumerate(calc_seed_isbns(max=100)):
+        try:
+            print i, report_on_seed_isbn(s[1])
+        except Exception, e:
+            print i, e
            
-    suites = suite()
+    
+    #suites = suite()
    #suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
    #unittest.TextTestRunner().run(suites)