"""
Module to parse the Project Gutenberg Catalog and map to various work IDs
"""
import unittest
import os
import json
from copy import deepcopy
from freebase.api.mqlkey import quotekey, unquotekey
import freebase
import requests
from lxml import html
import httplib
from urlparse import urljoin
from urllib import urlencode
from pprint import pprint

from collections import defaultdict, OrderedDict
from itertools import islice, chain, izip, repeat
import operator
import time
import re
import logging
import random

from datetime import datetime

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text, Sequence, Boolean, not_, and_, DateTime
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import UniqueConstraint
from sqlalchemy.sql.expression import ClauseElement

from bookdata import WorkMapper, OpenLibrary, FreebaseBooks, GoogleBooks, GOOGLE_BOOKS_KEY, thingisbn

try:
    from regluit.core import isbn as isbn_mod
except:
    import isbn as isbn_mod

logging.basicConfig(filename='gutenberg.log', level=logging.DEBUG)
logger = logging.getLogger(__name__)

def filter_none(d):
    """Return a copy of dict d with the None-valued keys removed."""
    d2 = {}
    for (k, v) in d.iteritems():
        if v is not None:
            d2[k] = v
    return d2

# http://stackoverflow.com/questions/2348317/how-to-write-a-pager-for-python-iterators/2350904#2350904
def grouper(iterable, page_size):
    """Yield successive lists of up to page_size items from iterable."""
    page = []
    for item in iterable:
        page.append(item)
        if len(page) == page_size:
            yield page
            page = []
    if len(page):
        yield page
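
# A minimal usage sketch for grouper (illustrative only; not called anywhere in this module):
def _example_grouper():
    for page in grouper(xrange(5), 2):
        print page  # [0, 1], then [2, 3], then the short final page [4]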

def singleton(cls):
    """Class decorator that makes cls a singleton: repeated calls return one shared instance."""
    instances = {}
    def getinstance():
        if cls not in instances:
            instances[cls] = cls()
        return instances[cls]
    return getinstance
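
# A minimal sketch of what the @singleton decorator (used on GluejarDB below) provides:
# repeated constructor calls hand back the same shared instance.
def _example_singleton():
    """Illustrative only; Counter is a throwaway class invented for this sketch."""
    @singleton
    class Counter(object):
        pass
    print Counter() is Counter()  # True -- both calls return the one shared instance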

# http://stackoverflow.com/a/2587041/7782
def get_or_create(session, model, defaults=None, **kwargs):
    """Return (instance, created): fetch a model instance matching kwargs, or create it."""
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance, False
    else:
        params = dict((k, v) for k, v in kwargs.iteritems() if not isinstance(v, ClauseElement))
        if defaults is None:
            defaults = {}
        params.update(defaults)
        instance = model(**params)
        session.add(instance)
        return instance, True
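
# A minimal usage sketch for get_or_create, mirroring the calls made further down in this
# module (the OpenLibrary work id is just an example value; needs a live Gluejar database):
def _example_get_or_create():
    db = GluejarDB()
    (work, created) = get_or_create(db.session, OpenLibraryWork, id="OL151447W")
    if created:
        db.commit_db()
    print work.id, created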

Base = declarative_base()


class SeedISBN(Base):
    __tablename__ = 'SeedISBN'
    __table_args__ = {'mysql_engine': 'InnoDB'}

    # column definitions
    calculated = Column(u'calculated', DateTime, default=datetime.utcnow)
    error = Column(u'error', Text())
    gutenberg_etext_id = Column(u'gutenberg_etext_id', Integer(11), index=True)
    id = Column(u'id', Integer(11), primary_key=True, nullable=False)
    results = Column(u'results', MEDIUMTEXT())
    seed_isbn = Column(u'seed_isbn', String(length=13))
    title = Column(u'title', Text())
    title_error = Column(u'title_error', Text())

class GutenbergText(object):
    """
    CREATE TABLE `GutenbergText` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `etext_id` int(10) unsigned NOT NULL,
      `title` varchar(1024) DEFAULT NULL,
      `friendly_title` varchar(1024) DEFAULT NULL,
      `lang` char(5) DEFAULT NULL,
      `rights` varchar(512) DEFAULT NULL,
      `created` date DEFAULT NULL,
      `creator` varchar(1024) DEFAULT NULL,
      PRIMARY KEY (`id`),
      KEY `etext_id` (`etext_id`)
    ) ENGINE=MyISAM AUTO_INCREMENT=37874 DEFAULT CHARSET=utf8;
    """
    pass


class GutenbergFile(object):
    """
    CREATE TABLE `GutenbergFile` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `about` varchar(300) NOT NULL DEFAULT '',
      `format` varchar(256) DEFAULT NULL,
      `extent` int(11) unsigned DEFAULT NULL,
      `modified` date DEFAULT NULL,
      `is_format_of` int(11) DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `about_index` (`about`),
      KEY `is_format_of` (`is_format_of`)
    ) ENGINE=MyISAM AUTO_INCREMENT=463211 DEFAULT CHARSET=utf8;
    """
    pass

class WikipediaLink(Base):
    __tablename__ = 'WikipediaLink'
    __table_args__ = (
        UniqueConstraint('gutenberg_etext_id', 'wikipedia_href', name='wikipedia_etext_id'),
        {'mysql_engine': 'MyISAM'}
    )
    id = Column(Integer, primary_key=True)
    gutenberg_etext_id = Column('gutenberg_etext_id', Integer(11))
    wikipedia_href = Column('wikipedia_href', String(255))
    wikipedia_title = Column('wikipedia_title', String(255))


class FreebaseEntity(Base):
    __tablename__ = 'FreebaseEntity'
    __table_args__ = (
        {'mysql_engine': 'MyISAM'}
    )
    id = Column('id', String(255), primary_key=True)
    wikipedia_href = Column('wikipedia_href', String(255))
    is_book_book = Column('is_book_book', Boolean)


class OpenLibraryWork(Base):
    __tablename__ = 'OpenLibraryWork'
    __table_args__ = (
        {'mysql_engine': 'MyISAM'}
    )
    id = Column('id', String(255), primary_key=True)
    title = Column('title', String(512), default=None)


class MappedWork(Base):
    __tablename__ = 'MappedWork'
    __table_args__ = (
        {'mysql_engine': 'MyISAM'}
    )
    id = Column(Integer, primary_key=True)
    olid = Column('olid', String(255))
    freebase_id = Column('freebase_id', String(255))
    gutenberg_etext_id = Column(Integer)


class GutenbergIdMapped(Base):
    __tablename__ = 'GutenbergIdMapped'
    __table_args__ = (
        {'mysql_engine': 'MyISAM'}
    )
    id = Column(Integer, primary_key=True, autoincrement=False)


class MappingError(Base):
    __tablename__ = 'MappingError'
    __table_args__ = (
        {'mysql_engine': 'MyISAM'}
    )
    id = Column('id', Integer, primary_key=True)
    created = Column('created', DateTime, default=datetime.utcnow)
    message = Column('message', String(1000))

@singleton
class GluejarDB(object):
    def __init__(self, user="gluejar", pw="gluejar", db="Gluejar", host="127.0.0.1", port=3306):
        self.mysql_connect_path = "mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8" % (user, pw, host, port, db)
        self.engine = create_engine(self.mysql_connect_path, echo=False)

        self.metadata = MetaData(self.engine)
        Base.metadata.create_all(self.engine)

        gutenbergtext = Table('GutenbergText', self.metadata, autoload=True)
        mapper(GutenbergText, gutenbergtext)

        gutenbergfile = Table('GutenbergFile', self.metadata, autoload=True)
        mapper(GutenbergFile, gutenbergfile)

        #seedisbn = Table('SeedISBN', self.metadata, autoload=True)
        #mapper(SeedISBN, seedisbn)

        Session = sessionmaker(bind=self.engine)
        session = Session()
        self.session = session

    def _reflect(self):
        for table in self.metadata.tables.values():
            print """
class %s(Base):
    __table__ = Table(%r, Base.metadata, autoload=True)
""" % (table.name, table.name)

    def _sqlautocode(self):
        """
        spit out some code to help us run sqlautocode
        """
        return "sqlautocode -o model.py %s" % (self.mysql_connect_path)

    def commit_db(self):
        self.session.commit()

    def rollback(self):
        self.session.rollback()

    def gutenberg_texts(self):
        """generator for all records in the GutenbergText table"""
        items = self.session.query(GutenbergText).all()
        for item in items:
            yield item

    def filtered_wikipedia_links(self):
        """generate wikipedia links that are in the main Wikipedia namespace"""
        # eliminate pages in the TO_FILTER namespaces
        TO_FILTER = ['File:%', 'Portal:%', 'Portal talk:%', "Talk:%",
                     'Template:%', 'Template talk:%', 'User:%', 'User talk:%',
                     'Wikipedia:%', 'Wikipedia talk:%']
        total_filter = and_(*[not_(WikipediaLink.wikipedia_title.like(f)) for f in TO_FILTER])
        items = self.session.query(WikipediaLink).filter(total_filter)
        for item in items:
            yield item

def parse_project_gutenberg_catalog(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog.rdf'):
    # URL = http://www.gutenberg.org/feeds/catalog.rdf.zip
    import re

    def text(node):
        node.normalize()
        return node.childNodes[0].data

    RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    DC_NS = 'http://purl.org/dc/elements/1.1/'
    DCTERMS_NS = 'http://purl.org/dc/terms/'
    PGTERMS_NS = 'http://www.gutenberg.org/rdfterms/'

    from xml.dom.pulldom import START_ELEMENT, parse
    doc = parse(fname)
    for event, node in doc:
        if event == START_ELEMENT and node.localName == "etext":
            doc.expandNode(node)
            # etext_id
            id = node.getAttributeNS(RDF_NS, 'ID')
            try:
                etext_id = int(re.match(r'^etext(\d+)$', id).group(1))
            except:
                etext_id = None
            # title
            try:
                title = text(node.getElementsByTagNameNS(DC_NS, 'title')[0])
                title = title.replace("\n", " ").replace("\r", " ")
            except:
                title = None
            # friendly_title
            try:
                friendly_title = text(node.getElementsByTagNameNS(PGTERMS_NS, 'friendlytitle')[0])
                friendly_title = friendly_title.replace("\n", " ").replace("\r", " ")
            except:
                friendly_title = None
            # lang
            try:
                lang = text(node.getElementsByTagNameNS(DC_NS, 'language')[0].getElementsByTagNameNS(DCTERMS_NS, 'ISO639-2')[0].getElementsByTagNameNS(RDF_NS, 'value')[0])
            except Exception, e:
                logger.debug(e)
                lang = None
            # rights
            try:
                rights_node = node.getElementsByTagNameNS(DC_NS, 'rights')[0]
                rights = rights_node.getAttributeNS(RDF_NS, 'resource')
                if rights == '':
                    rights = text(rights_node)
            except Exception, e:
                logger.debug(e)
                rights = None
            # created
            # <dc:created><dcterms:W3CDTF><rdf:value>2011-11-02</rdf:value></dcterms:W3CDTF></dc:created>
            try:
                created_str = text(node.getElementsByTagNameNS(DC_NS, 'created')[0].getElementsByTagNameNS(DCTERMS_NS, 'W3CDTF')[0].getElementsByTagNameNS(RDF_NS, 'value')[0])
                created = datetime.date(datetime.strptime(created_str, "%Y-%m-%d"))
            except Exception, e:
                logger.debug(e)
                created = None
            # creator
            try:
                creator = text(node.getElementsByTagNameNS(DC_NS, 'creator')[0])
            except Exception, e:
                logger.debug(e)
                creator = None

            yield {'type': 'text', 'etext_id': etext_id, 'title': title, 'friendly_title': friendly_title,
                   'lang': lang, 'rights': rights, 'created': created, 'creator': creator}

        if event == START_ELEMENT and node.localName == "file":
            doc.expandNode(node)
            # about
            try:
                about = node.getAttributeNS(RDF_NS, 'about')
            except Exception, e:
                logger.debug(e)
                about = None
            # isFormatOf
            try:
                is_format_of_raw = node.getElementsByTagNameNS(DCTERMS_NS, 'isFormatOf')[0].getAttributeNS(RDF_NS, 'resource')
                is_format_of = int(re.match(r'#etext(\d+)$', is_format_of_raw).group(1))
            except Exception, e:
                logger.debug(e)
                is_format_of = None
            # format: grab the first one
            try:
                format = text(node.getElementsByTagNameNS(DC_NS, 'format')[0].getElementsByTagNameNS(DCTERMS_NS, 'IMT')[0].getElementsByTagNameNS(RDF_NS, 'value')[0])
            except Exception, e:
                logger.debug(e)
                format = None
            # modified
            try:
                modified_str = text(node.getElementsByTagNameNS(DCTERMS_NS, 'modified')[0].getElementsByTagNameNS(DCTERMS_NS, 'W3CDTF')[0].getElementsByTagNameNS(RDF_NS, 'value')[0])
                modified = datetime.date(datetime.strptime(modified_str, "%Y-%m-%d"))
            except Exception, e:
                logger.info(e)
                modified = None
            # extent
            try:
                extent = int(text(node.getElementsByTagNameNS(DCTERMS_NS, 'extent')[0]))
            except Exception, e:
                logger.info(e)
                extent = None

            yield {'type': 'file', 'about': about, 'is_format_of': is_format_of, 'format': format, 'modified': modified,
                   'extent': extent}

def walk_through_catalog(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog.rdf', max=100000):
    for i, item in enumerate(islice(parse_project_gutenberg_catalog(fname), max)):
        print i, item

def load_texts_to_db(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog_texts.rdf', max=None):
    gluejar_db = GluejarDB()
    for (i, item) in enumerate(islice(parse_project_gutenberg_catalog(fname), max)):
        print i, item
        if item['type'] == 'text':
            try:
                book = gluejar_db.session.query(GutenbergText).filter(GutenbergText.etext_id == item['etext_id']).one()
            except:
                book = GutenbergText()
                book.etext_id = item['etext_id']
                gluejar_db.session.add(book)
            book.title = item['title']
            book.friendly_title = item['friendly_title']
            book.lang = item['lang']
            book.rights = item['rights']
            book.created = item['created']
            book.creator = item['creator']
    gluejar_db.commit_db()

def load_files_to_db(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog_files.rdf', max=100000):
    gluejar_db = GluejarDB()
    for (i, item) in enumerate(islice(parse_project_gutenberg_catalog(fname), max)):
        print i, item
        if item['type'] == 'file':
            # try to write the row; if there's a problem, do a query to update it -- 'about' is unique
            try:
                file = GutenbergFile()
                file.about = item['about']
                gluejar_db.session.add(file)
                gluejar_db.commit_db()
            except IntegrityError, e:
                gluejar_db.session.rollback()
                file = gluejar_db.session.query(GutenbergFile).filter(GutenbergFile.about == item['about']).one()
            file.is_format_of = item['is_format_of']
            file.format = item['format']
            file.modified = item['modified']
            file.extent = item['extent']
            gluejar_db.commit_db()
    gluejar_db.commit_db()

def external_links_in_wikipedia(target, limit=500, offset=0):
    # e.g., http://en.wikipedia.org/w/index.php?title=Special:LinkSearch&target=http%3A%2F%2Fwww.gutenberg.org%2Fetext%2F&limit=500&offset=0
    base_url = "http://en.wikipedia.org/w/index.php"
    params = filter_none({"title": "Special:LinkSearch", "target": target,
                          "limit": limit, "offset": offset})
    url = "%s?%s" % (base_url, urlencode(params))
    # page through all the results
    more_pages = True
    while more_pages:
        r = requests.get(url)
        if r.status_code != httplib.OK:
            raise Exception("Problem with request on %s %s: %s %s" % (base_url, params, r.status_code, r.content))
        etree = html.fromstring(r.content)
        links = etree.xpath("//ol")[0].xpath("li")
        for link in links:
            (target_a, source_a) = link.xpath('a')
            yield {"target": target_a.attrib["href"], "source_href": source_a.attrib["href"], "source_title": source_a.text}
        # is there another page?
        following_page = etree.xpath("//a[@class='mw-nextlink']")
        if len(following_page) > 0:
            url = urljoin(url, following_page[0].attrib["href"])
        else:
            more_pages = False

def load_wikipedia_external_links_into_db(max=None):
    targets = ["http://www.gutenberg.org/etext", "http://www.gutenberg.org/ebook"]
    links = chain(*[external_links_in_wikipedia(target) for target in targets])
    gluejar_db = GluejarDB()
    for (i, link) in enumerate(islice(links, max)):
        link_target = link["target"]
        try:
            etext_id = re.search(r'\/(\d+)$', link_target).group(1)
        except:
            etext_id = None
        print i, link["source_href"], link["source_title"], link_target, etext_id
        if etext_id is not None:
            wl = WikipediaLink()
            wl.gutenberg_etext_id = etext_id
            wl.wikipedia_href = link["source_href"]
            wl.wikipedia_title = link["source_title"]
            gluejar_db.session.add(wl)
            try:
                gluejar_db.commit_db()
            except Exception, e:
                print e
                gluejar_db.rollback()

def map_wikipedia_links_to_freebase_ids(max=None, page_size=5):
    fb = FreebaseClient('rdhyee', 'fbkule!')
    db = GluejarDB()
    wikipedia_ids = list((wl.wikipedia_href for wl in islice(db.filtered_wikipedia_links(), max)))
    for id in wikipedia_ids:
        print id
    resp = fb.wikipedia_href_to_freebase_id(wikipedia_ids, page_size=page_size)
    for (i, r) in enumerate(resp):
        print i, r
        if len(r):  # an actual result
            print r[0]['id'], r[0]['type'], r[0]['key'][0]['value']
            fb_entity = FreebaseEntity()
            fb_entity.id = r[0]['id']
            try:
                db.session.add(fb_entity)
                db.commit_db()
            except IntegrityError, e:
                db.rollback()
                fb_entity = db.session.query(FreebaseEntity).filter(FreebaseEntity.id == r[0]['id']).one()
            fb_entity.wikipedia_href = '/wiki/%s' % (unquotekey(r[0]['key'][0]['value']))
            fb_entity.is_book_book = '/book/book' in r[0]['type']
            db.commit_db()

def map_refine_fb_links_to_openlibrary_work_ids(max=None):
    from google.refine import refine

    db = GluejarDB()
    refine_proj_id = "1884515736058"
    refine_obj = refine.Refine(refine.RefineServer())
    proj = refine_obj.open_project(refine_proj_id)
    cols_to_extract = ['etext_id', 'title', 'name', 'fb_id', 'fb_id_judgement', 'wikipedia_title']
    limit = max if max is not None else 1000000
    response = proj.get_rows(limit=limit)

    # get Gutenberg IDs already done
    done = set([r.id for r in db.session.query(GutenbergIdMapped).all()])

    print "response.total: ", response.total
    for i, row in enumerate(islice(response.rows, max)):
        print i, row.index, row['etext_id'], row['title'], row['name'], row['fb_id'], row['fb_id_judgement'],
        if row['etext_id'] is not None and (int(row['etext_id']) not in done):
            try:
                work_ids = list(WorkMapper.freebase_book_to_openlibrary_work(row['fb_id'], complete_search=True))
                print work_ids
                (fb_item, created) = get_or_create(db.session, FreebaseEntity, id=row['fb_id'])
                for work_id in work_ids:
                    (ol_item, created) = get_or_create(db.session, OpenLibraryWork, id=work_id)
                    (mapping, created) = get_or_create(db.session, MappedWork, olid=work_id, freebase_id=row['fb_id'],
                                                       gutenberg_etext_id=int(row['etext_id']))
                done.add(int(row['etext_id']))
                (done_item, created) = get_or_create(db.session, GutenbergIdMapped, id=int(row['etext_id']))
            except Exception, e:
                message = "Problem with i %d, etext_id %s: %s" % (i, row['etext_id'], e)
                print message
                (error_item, created) = get_or_create(db.session, MappingError, message=message)
        else:
            print "already done"
        db.commit_db()

def compute_ol_title_from_work_id(max=None):
    db = GluejarDB()
    # loop through the OpenLibraryWork rows with null title
    for (i, work) in enumerate(islice(db.session.query(OpenLibraryWork).filter(OpenLibraryWork.title == None), max)):
        print i, work.id,
        try:
            title = OpenLibrary.json_for_olid(work.id)["title"]
            work.title = title
            print title
        except Exception, e:
            message = "Problem with i %d, work.id %s: %s" % (i, work.id, e)
            print message

    db.commit_db()

def export_gutenberg_to_ol_mapping(max=None, fname=None):
    output = list(gutenberg_to_ol_mapping(max=max))
    if fname is not None:
        f = open(fname, "wb")
        f.write(json.dumps(output))
        f.close()
    return output

def gutenberg_to_ol_mapping(max=None):
    SQL = """SELECT mw.gutenberg_etext_id, gt.title as gt_title, mw.olid, olw.title as ol_title, mw.freebase_id, gf.about as 'url', gf.format, gt.rights, gt.lang, DATE_FORMAT(gt.created, "%Y-%m-%d") as 'created'
    FROM MappedWork mw LEFT JOIN GutenbergText gt
    ON mw.gutenberg_etext_id = gt.etext_id LEFT JOIN OpenLibraryWork olw ON olw.id = mw.olid LEFT JOIN GutenbergFile gf ON gf.is_format_of = gt.etext_id
    WHERE gf.format = 'application/epub+zip';"""

    headers = ("gutenberg_etext_id", "gt_title", "olid", "ol_title", "freebase_id", "url", "format", "rights", "lang", "created")
    # getting the right fields?
    # (title, gutenberg_etext_id, ol_work_id, seed_isbn, url, format, license, lang, publication_date)
    db = GluejarDB()
    resp = enumerate(islice(db.session.query(*headers).from_statement(SQL).all(), max))
    # what choice of serialization at this point? JSON for now, but not the best for a large file
    for (i, r) in resp:
        #print r, type(r), dict(izip(headers, r))
        yield dict(izip(headers, r))

def import_gutenberg_json(fname):
    headers = ("gutenberg_etext_id", "gt_title", "olid", "ol_title", "freebase_id", "url", "format", "rights", "lang", "created")

    f = open(fname)
    records = json.load(f)
    for record in records:
        print [record[h] for h in headers]
    return records

def gutenberg_ol_fb_mappings(gutenberg_ids, max=None):
    """For each of the gutenberg_ids, yield the Freebase id and OpenLibrary work id it maps to"""
    db = GluejarDB()
    for (i, g_id) in enumerate(islice(gutenberg_ids, max)):
        mappings = db.session.query(MappedWork).filter_by(gutenberg_etext_id=g_id)
        for mapping in mappings.all():
            yield {'fb': mapping.freebase_id, 'olid': mapping.olid}

def seed_isbn(olwk_ids, freebase_ids, lang='en'):
    random.seed()

    logger.info("seed_isbn input: olwk_ids, freebase_ids, lang: %s %s %s", olwk_ids, freebase_ids, lang)

    lt_clusters = []
    lt_unrecognized = set()
    fb = FreebaseBooks()
    gb = GoogleBooks(key=GOOGLE_BOOKS_KEY)

    fb_isbn_set = reduce(operator.or_, [set(fb.xisbn(book_id=freebase_id)) for freebase_id in freebase_ids]) if len(freebase_ids) else set()
    ol_isbn_set = reduce(operator.or_, [set(OpenLibrary.xisbn(work_id=olwk_id)) for olwk_id in olwk_ids]) if len(olwk_ids) else set()
    #lt_isbn_set = set(map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(SURFACING_ISBN)))

    logger.debug("Freebase set: %d %s", len(fb_isbn_set), fb_isbn_set)
    logger.debug("OpenLibrary set: %d %s", len(ol_isbn_set), ol_isbn_set)
    logger.debug("in both fb and ol: %d %s", len(fb_isbn_set & ol_isbn_set), fb_isbn_set & ol_isbn_set)
    logger.debug("in fb but not ol: %d %s", len(fb_isbn_set - ol_isbn_set), fb_isbn_set - ol_isbn_set)
    logger.debug("in ol but not fb: %d %s", len(ol_isbn_set - fb_isbn_set), ol_isbn_set - fb_isbn_set)

    # loop through the union set and ask thingisbn to cluster
    to_cluster = (fb_isbn_set | ol_isbn_set)
    logger.debug("to cluster: %s, %d", to_cluster, len(to_cluster))
    while len(to_cluster):
        seed = to_cluster.pop()
        cluster = set(filter(None, map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(seed))))
        # is there anything in the cluster?
        if len(cluster) == 0:
            lt_unrecognized.add(seed)
        else:
            # check that the seed is in the cluster
            assert seed in cluster
            lt_clusters.append(cluster)
            to_cluster -= cluster

    # print out the clusters
    logger.debug("clusters")
    for (i, lt_cluster) in enumerate(lt_clusters):
        logger.debug("%d %s %d", i, lt_cluster, len(lt_cluster))
    logger.debug("unrecognized by LT %s %d", lt_unrecognized, len(lt_unrecognized))

    # figure out the new ISBNs found by LT
    new_isbns = ((reduce(operator.or_, lt_clusters) if len(lt_clusters) else set()) | lt_unrecognized) - (fb_isbn_set | ol_isbn_set)
    logger.debug("new isbns from LT %s %d", new_isbns, len(new_isbns))

    gbooks_data = {}
    # then pass the ISBNs to Google Books to get info, including language
    all_isbns = ((reduce(operator.or_, lt_clusters) if len(lt_clusters) else set()) | lt_unrecognized)
    for (i, isbn) in enumerate(all_isbns):
        gbooks_data[isbn] = gb.isbn(isbn)
        logger.debug("%d %s %s", i, isbn, gbooks_data[isbn])

    # subcluster the lt_clusters by language
    lt_clusters_by_lang = []
    for lt_cluster in lt_clusters:
        lang_map = defaultdict(list)
        for id in lt_cluster:
            lang_of_id = gbooks_data.get(id).get('language') if gbooks_data.get(id) is not None else None
            lang_map[lang_of_id].append(id)
        lt_clusters_by_lang.append(lang_map)

    # boil the candidates down to a single ISBN: take a random ISBN from the list of all ISBNs in the requested
    # language subcluster within the largest cluster that has such a language subcluster.
    # Return None if there is no matching sub-language cluster in the largest cluster.
    candidate_subclusters = filter(lambda x: x[0] is not None,
                                   [(c.get(lang), len(reduce(operator.add, c.values()))) for c in lt_clusters_by_lang]
                                   )
    logger.debug("candidate_subclusters: %s", candidate_subclusters)
    if len(candidate_subclusters):
        candidate_seed_isbn = random.sample(
            max(candidate_subclusters, key=lambda x: x[1])[0], 1)[0]
    else:
        candidate_seed_isbn = None

    # return a dict with elements that are easy to turn into json
    logger.info("seed_isbn output: olwk_ids, freebase_ids, lang, candidate_seed: %s %s %s %s", olwk_ids, freebase_ids, lang,
                candidate_seed_isbn)
    details = {'olwk_ids': olwk_ids, 'freebase_ids': freebase_ids, 'lang': lang,
               'candidate_seed_isbn': candidate_seed_isbn,
               'gbooks_data': gbooks_data, 'lt_clusters': map(tuple, lt_clusters),
               'lt_unrecognized': tuple(lt_unrecognized),
               'fb_isbns': tuple(fb_isbn_set),
               'ol_isbns': tuple(ol_isbn_set),
               'lt_clusters_by_lang': lt_clusters_by_lang,
               'len_all_isbns': len(all_isbns)}

    return (candidate_seed_isbn, details)
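
# A minimal sketch (with invented ISBN-like strings) of how seed_isbn picks its candidate from
# lt_clusters_by_lang: the requested-language subcluster of the largest cluster wins.
def _example_candidate_pick():
    toy_clusters_by_lang = [
        {'en': ['9780000000001'], 'fr': ['9780000000002']},                   # cluster of size 2
        {'en': ['9780000000003', '9780000000004'], None: ['9780000000005']},  # cluster of size 3 (largest)
    ]
    candidates = filter(lambda x: x[0] is not None,
                        [(c.get('en'), len(reduce(operator.add, c.values()))) for c in toy_clusters_by_lang])
    # the 'en' subcluster of the largest cluster: ['9780000000003', '9780000000004']
    print max(candidates, key=lambda x: x[1])[0]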

def candidate_subcluster_from_lt_clusters_by_lang(lang, lt_clusters_by_lang):
    """
    Boil the candidates down to a single subcluster of ISBNs: take the list of all ISBNs in the requested
    language subcluster within the largest cluster that has such a language subcluster.
    Return an empty list if there is no matching sub-language cluster.
    Try to find an ISBN that has good overlap with Freebase and OpenLibrary.
    """
    candidate_subclusters = filter(lambda x: x[0] is not None,
                                   [(c.get(lang), len(reduce(operator.add, c.values()))) for c in lt_clusters_by_lang]
                                   )
    if len(candidate_subclusters):
        candidate_subcluster = max(candidate_subclusters, key=lambda x: x[1])[0]
    else:
        candidate_subcluster = []
    return candidate_subcluster

def report_on_seed_isbn(seed_isbn_result):
    """
    return a dictionary interpreting the output of the seed isbn calculation
    """
    s = seed_isbn_result

    # what proportion of all the ISBNs does the largest cluster make up?
    # x is an iterable of cluster lengths
    dominance = lambda x: float(max(x)) / float(sum(x)) if len(x) else None

    report = OrderedDict([
        ("seed isbn", s[0]),
        ("the Google info we have on the seed isbn", s[1]['gbooks_data'].get(s[0])),
        ("lang", s[1]['lang']),
        ("Freebase ids", s[1]['freebase_ids']),
        ("number of OL ids", len(s[1]['olwk_ids'])),
        ("total number of ISBNs from pooling FB + OL + LT", s[1]['len_all_isbns']),
        ("number of FB isbns", len(s[1]['fb_isbns'])),
        ("number of OL isbns", len(s[1]['ol_isbns'])),
        ("number of LT isbns", sum(map(len, s[1]['lt_clusters']))),
        ("number of isbns not recognized by LT", len(s[1]['lt_unrecognized'])),
        ("number of Google Books isbns", len(s[1]['gbooks_data'])),
        ("number of Google Books isbns not recognized", len(filter(lambda x: x is None, s[1]['gbooks_data'].values()))),
        ("size of clusters and their respective subclusters", [(len(reduce(operator.add, c.values())),
                                                                [(lang, len(isbns)) for (lang, isbns) in c.items()])
                                                               for c in s[1]['lt_clusters_by_lang']]),
        ("size of the sub-cluster including the seed isbn", len(filter(lambda x: s[0] in x,
                                                                       reduce(operator.add, [c.values() for c in s[1]['lt_clusters_by_lang']]))[0])
            if s[0] is not None else None),
        ("dominance of largest cluster", dominance([len(cluster) for cluster in s[1]['lt_clusters']]))
    ])
    return report
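
# A quick worked example of the dominance measure used in the report above (cluster sizes
# are invented for illustration):
def _example_dominance():
    dominance = lambda x: float(max(x)) / float(sum(x)) if len(x) else None
    print dominance([8, 1, 1])  # 0.8 -- the largest cluster holds 80% of the pooled ISBNs
    print dominance([])         # None -- no clusters at all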

def surfacing_seed_isbn():
    SURFACING_WORK_OLID = 'OL675829W'
    surfacing_fb_id = '/m/05p_vg'
    book_isbn = '9780446311076'
    return seed_isbn(olwk_ids=(SURFACING_WORK_OLID,), freebase_ids=(surfacing_fb_id,), lang='en')


def ry_mashups_seed_isbn():
    olid = "OL10306321W"
    fb_id = "/en/pro_web_2_0_mashups_remixing_data_and_web_services"
    return seed_isbn(olwk_ids=(olid,), freebase_ids=(fb_id,), lang='en')


def moby_dick_seed_isbn():
    return seed_isbn(olwk_ids=('OL102749W',), freebase_ids=('/en/moby-dick',), lang='en')

def calc_seed_isbns(ids=None, max=None, override=False, max_consecutive_error=3):
    # if ids are specified, work through them
    # otherwise loop through all Gutenberg ids, see whether the seed_isbn has been calculated -- and if not, do so.
    current_error_count = 0

    gluejar_db = GluejarDB()

    # pull out the set of Gutenberg text ids that are already in the SeedISBN table so that we have the option of
    # not recalculating those Gutenberg texts
    gutenberg_done = set(map(lambda x: x[0], gluejar_db.session.query(SeedISBN.gutenberg_etext_id).all()))
    logger.debug("gutenberg_done %s", gutenberg_done)

    # collate all the OL work ids and Freebase ids for a given gutenberg id
    if ids is None:
        g_ids = set()
        ol_ids = defaultdict(set)
        fb_ids = defaultdict(set)
        lang = {}
        for mapping in gutenberg_to_ol_mapping():
            g_id = mapping["gutenberg_etext_id"]
            g_ids.add(g_id)
            ol_ids[g_id].add(mapping["olid"])
            fb_ids[g_id].add(mapping["freebase_id"])
            lang[g_id] = mapping["lang"]
        logger.debug("len(g_ids): %d", len(g_ids))
        # turn the mapping into a series of tuples that can be fed to seed_isbn
        if not override:
            logger.debug("len(g_ids) before subtracting gutenberg_done: %d", len(g_ids))
            logger.debug("len(gutenberg_done): %d", len(gutenberg_done))
            g_ids -= gutenberg_done
            logger.debug("len(g_ids) after subtracting gutenberg_done: %d", len(g_ids))
        ids = [(g_id, tuple(ol_ids[g_id]), tuple(fb_ids[g_id]), lang[g_id]) for g_id in g_ids]
        logger.debug("len(ids): %d", len(ids))

    for (i, work_id) in enumerate(islice(ids, max)):
        if current_error_count >= max_consecutive_error:
            break
        (g_id, args) = (work_id[0], work_id[1:])
        logger.info("i, g_id, args: %d %s %s", i, g_id, args)
        (seed, created) = get_or_create(gluejar_db.session, SeedISBN, gutenberg_etext_id=g_id)
        try:
            s = seed_isbn(*args)
            seed.calculated = datetime.utcnow()
            seed.seed_isbn = s[0]
            seed.error = None
            seed.results = json.dumps(s)
            current_error_count = 0
            yield (g_id, s)
        except Exception, e:
            current_error_count += 1
            seed.seed_isbn = None
            seed.calculated = datetime.utcnow()
            seed.error = str(e)
            seed.results = None
            logger.warning(str(e))
            yield (g_id, e)
        finally:
            gluejar_db.commit_db()

def reports_in_db(max=None):
    """
    a generator of reports on all the Gutenberg seed isbn calculations
    """
    gluejar_db = GluejarDB()
    gutenberg_done = gluejar_db.session.query(SeedISBN).all()
    for s in islice(gutenberg_done, max):
        yield report_on_seed_isbn(json.loads(s.results))


def results_in_db(max=None):
    gluejar_db = GluejarDB()
    gutenberg_done = gluejar_db.session.query(SeedISBN).all()
    for s in islice(gutenberg_done, max):
        yield json.loads(s.results)

def calc_and_report_seed_isbn_calc():
    for (i, s) in enumerate(calc_seed_isbns(max=1000)):
        try:
            print i, report_on_seed_isbn(s[1])
        except Exception, e:
            print i, e


def gutenberg_and_seed_isbn(max=None, include_olid=False):
    SQL = """SELECT mw.gutenberg_etext_id, gt.title as gt_title, mw.olid, olw.title as ol_title, mw.freebase_id, gf.about as 'url', gf.format, gt.rights, gt.lang,
    si.seed_isbn,
    DATE_FORMAT(gt.created, "%Y-%m-%d") as 'created'
    FROM MappedWork mw LEFT JOIN GutenbergText gt
    ON mw.gutenberg_etext_id = gt.etext_id LEFT JOIN OpenLibraryWork olw ON olw.id = mw.olid LEFT JOIN GutenbergFile gf ON gf.is_format_of = gt.etext_id
    LEFT JOIN SeedISBN si ON si.gutenberg_etext_id = gt.etext_id
    WHERE gf.format = 'application/epub+zip';"""

    headers = ("gutenberg_etext_id", "gt_title", "olid", "ol_title", "freebase_id", "url", "format",
               "rights", "lang", "seed_isbn", "created")
    # title, gutenberg_etext_id, ol_work_id, seed_isbn, url, format, license, lang, publication_date
    db = GluejarDB()
    ebook_data = set()
    resp = enumerate(islice(db.session.query(*headers).from_statement(SQL).all(), max))

    # writing None for olid for now
    for (i, r) in resp:
        mapping = dict(izip(headers, r))
        olid = mapping["olid"] if include_olid else None
        ebook_datum = {'title': mapping["gt_title"], 'gutenberg_etext_id': mapping["gutenberg_etext_id"],
                       'ol_work_id': olid, 'seed_isbn': mapping["seed_isbn"],
                       'url': mapping["url"], 'format': mapping["format"],
                       'license': mapping["rights"], 'lang': mapping["lang"],
                       'publication_date': mapping["created"]}
        if tuple(ebook_datum.items()) not in ebook_data:
            ebook_data.add(tuple(ebook_datum.items()))
            yield ebook_datum


def export_to_json(obj, max=None, fname=None):
    if fname is not None:
        f = open(fname, "wb")
        f.write(json.dumps(obj))
        f.close()
    return json.dumps(obj)

def calc_titles_for_seed_isbns(max_num=None, do=False):
    """
    For the seedisbns, calculate the titles
    """
    db = GluejarDB()
    # title is Null and title_error is Null
    #titles_to_calc = db.session.query(SeedISBN).filter(and_(SeedISBN.title==None, SeedISBN.title_error==None)).all()
    titles_to_calc = db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title). \
        join(GutenbergText, SeedISBN.gutenberg_etext_id == GutenbergText.etext_id). \
        filter(and_(SeedISBN.title == None, SeedISBN.title_error == None)).all()

    page_size = 5
    for page in grouper(islice(titles_to_calc, max_num), page_size):
        query = list(izip([edition.seed_isbn for (edition, lang, gt_title) in page], repeat('isbn')))
        try:
            res = OpenLibrary.read(query)
        except Exception, e:
            print e
        for (edition, lang, gt_title) in page:
            title_error = None
            try:
                title = res.get('isbn:{0}'.format(edition.seed_isbn))['records'].values()[0]['data']['title']
            except Exception, e:
                title = None
                title_error = str(e)
            if do and title is not None:
                edition.title = title
                edition.title_error = title_error
                db.commit_db()
            yield (edition.seed_isbn, title)

def repick_seed_isbn(max_num=None, do=False, print_progress=False):
    """
    Let's try to get ISBNs in the cluster that are in OpenLibrary, Freebase, and LibraryThing if possible
    """
    gluejar_db = GluejarDB()
    # need to join with the GutenbergText table to get lang and the Gutenberg title
    gutenberg_done = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id == GutenbergText.etext_id).all()
    for (i, (s, lang, gt_title)) in enumerate(islice(gutenberg_done, max_num)):
        # calculate the dominant cluster
        results = json.loads(s.results)
        candidate_subclusters = filter(lambda x: x[0] is not None,
                                       [(c.get(lang), len(reduce(operator.add, c.values()))) for c in results[1]['lt_clusters_by_lang']]
                                       )
        # remember that the cluster is the first element of the tuple and its length is the 2nd element
        if len(candidate_subclusters):
            candidate_subcluster = set(max(candidate_subclusters, key=lambda x: x[1])[0])
        else:
            candidate_subcluster = set([])
        # confirm that the current seed isbn is in the candidate subcluster
        current_seed_ok = s.seed_isbn in candidate_subcluster
        # see whether we can get a seed isbn that, in addition to LibraryThing,
        # is recognized by OpenLibrary and Freebase too...2nd priority
        # is just OL, 3rd is Freebase, and 4th is just LT
        fb_isbns = set(results[1]['fb_isbns'])
        ol_isbns = set(results[1]['ol_isbns'])
        seeds = (candidate_subcluster & fb_isbns & ol_isbns) or (candidate_subcluster & ol_isbns) or \
                (candidate_subcluster & fb_isbns) or candidate_subcluster
        new_seed_isbn = None
        if do and len(seeds):
            new_seed_isbn = seeds.pop()
            s.seed_isbn = new_seed_isbn
            gluejar_db.commit_db()
        if print_progress:
            print i, s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn
        yield (s.gutenberg_etext_id, s.seed_isbn, lang, gt_title, seeds, current_seed_ok, new_seed_isbn)
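
# A tiny sketch of the priority fallback used in repick_seed_isbn: with sets, `or` returns the
# first non-empty intersection, so the preference order is (LT & FB & OL), then (LT & OL),
# then (LT & FB), then plain LT. The ISBN-like values are invented for the example.
def _example_seed_priority():
    lt = set(['9780000000001', '9780000000002'])
    fb = set(['9780000000002'])
    ol = set([])
    seeds = (lt & fb & ol) or (lt & ol) or (lt & fb) or lt
    print seeds  # set(['9780000000002']) -- the LT & FB intersection wins here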

def compute_similarity_measures_for_seed_isbns(max_num=None):
    """
    Output the current seedisbn calculations with some measures to help spot errors in mapping, including
    the Levenshtein distance/ratio between the Gutenberg title and the title of the edition corresponding to the
    ISBN -- and a dominance factor (the ratio of the size of the largest cluster of ISBNs
    to the number of ISBNs in all the clusters). Idea: editions whose titles have big distances
    and low dominance factors should be looked at more closely.
    """
    from Levenshtein import distance, ratio

    # what proportion of all the ISBNs does the largest cluster make up?
    # x is an iterable of cluster lengths
    dominance = lambda x: float(max(x)) / float(sum(x)) if len(x) else None

    gluejar_db = GluejarDB()
    seed_isbns = gluejar_db.session.query(SeedISBN, GutenbergText.lang, GutenbergText.title).join(GutenbergText, SeedISBN.gutenberg_etext_id == GutenbergText.etext_id).all()
    for (i, (seed_isbn, lang, gt_title)) in enumerate(islice(seed_isbns, max_num)):
        res = json.loads(seed_isbn.results)
        yield OrderedDict([('etext_id', seed_isbn.gutenberg_etext_id),
                           ('seed_isbn_title', seed_isbn.title),
                           ('gt_title', gt_title),
                           ('dominance', dominance([len(cluster) for cluster in res[1]['lt_clusters']])),
                           ('title_l_ratio', ratio(seed_isbn.title, gt_title) if (seed_isbn.title is not None and gt_title is not None) else None)])
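
# A quick illustration of the title similarity measure: Levenshtein.ratio returns a value
# between 0.0 and 1.0 (the titles below are examples, not rows from the database).
def _example_title_ratio():
    from Levenshtein import ratio
    print ratio("Moby Dick", "Moby Dick; Or, The Whale")  # same work, longer title: roughly 0.5
    print ratio("Moby Dick", "Moby Dick")                 # identical strings: 1.0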

def output_to_csv(f, headers, rows, write_header=True, convert_values_to_unicode=True):
    """
    take rows, an iterable of dicts (and the corresponding headers), and output them as a CSV file to f
    """
    from unicode_csv import UnicodeDictWriter
    cw = UnicodeDictWriter(f, headers)
    if write_header:
        cw.writerow(dict([(h, h) for h in headers]))
    for row in rows:
        if convert_values_to_unicode:
            row = dict([(k, unicode(v)) for (k, v) in row.items()])
        cw.writerow(row)
    return f
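
# A minimal usage sketch for output_to_csv (hypothetical rows; in practice the rows come from
# generators such as compute_similarity_measures_for_seed_isbns):
def _example_output_to_csv():
    from StringIO import StringIO
    headers = ('etext_id', 'gt_title')
    rows = [{'etext_id': 2701, 'gt_title': u'Moby Dick; Or, The Whale'}]
    print output_to_csv(StringIO(), headers, rows).getvalue()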

def filtered_gutenberg_and_seed_isbn(min_l_ratio=None, min_dominance=None, max_num=None, include_olid=False):
    # compute the similarity measures and pass through only the Gutenberg records that meet
    # the minimum title ratio and dominance
    measures = compute_similarity_measures_for_seed_isbns()
    measures_map = dict()
    for measure in measures:
        measures_map[measure['etext_id']] = measure

    for item in gutenberg_and_seed_isbn(max=max_num, include_olid=include_olid):
        g_id = item['gutenberg_etext_id']
        accept = True
        if min_dominance is not None and measures_map[g_id]['dominance'] is not None and measures_map[g_id]['dominance'] < min_dominance:
            accept = False
        if min_l_ratio is not None and measures_map[g_id]['title_l_ratio'] is not None and measures_map[g_id]['title_l_ratio'] < min_l_ratio:
            accept = False
        if accept:
            yield item

class FreebaseClient(object):
    def __init__(self, username=None, password=None, main_or_sandbox='main'):
        if main_or_sandbox == 'main':
            self.freebase = freebase
        else:
            self.freebase = freebase.sandbox
        if username is not None and password is not None:
            self.freebase.login(username, password)

    def wikipedia_href_to_freebase_id(self, hrefs, page_size=10, chop_wiki=True):
        MQL = u"""[{
            "type": [],
            "id": null,
            "key": [{
                "namespace": "/wikipedia/en",
                "type": "/type/key",
                "value": null
            }]
        }]
        """.replace("\n", "")
        for (page_num, page) in enumerate(grouper(hrefs, page_size)):
            queries = []
            for (href_num, href) in enumerate(page):
                query = json.loads(MQL)
                if chop_wiki:
                    href = href[6:] if href.startswith('/wiki/') else href
                query[0]['key'][0]['value'] = quotekey(href)
                print "%d, %d %s" % (page_num, href_num, href)
                queries.append(query)
            if len(queries):
                try:
                    resp = self.freebase.mqlreadmulti(queries)
                    #print "fb resp, len(resp): %s %d" % (resp, len(resp))
                    for r in resp:
                        yield r
                except Exception, e:
                    # for now, write out the stuff in the queries and then move on -- better to try on smaller pieces
                    print "Metaweb Error: %s for page %s" % (e, page)

class WikipediaLinksTest(unittest.TestCase):
    def test_external_links(self):
        target = "http://www.gutenberg.org/etext"
        max = 10
        links = []
        for (i, link) in enumerate(islice(external_links_in_wikipedia(target), max)):
            print i, link
            links.append((link["source_href"], link["target"]))
        self.assertEqual(len(links), max)

class DatabaseTest(unittest.TestCase):
    def test_insert_1_wikipedia_link(self):
        gluejar_db = GluejarDB()
        wl = WikipediaLink()
        wl.gutenberg_etext_id = 13920
        wl.wikipedia_href = "/wiki/stuffffdsfsf"
        wl.wikipedia_title = "stuffffdsfsf"
        # add one, read it back, and then delete it
        gluejar_db.session.add(wl)
        gluejar_db.commit_db()
        query = gluejar_db.session.query(WikipediaLink).filter(WikipediaLink.wikipedia_href == "/wiki/stuffffdsfsf")
        obj = query.first()
        self.assertEqual(obj.wikipedia_href, "/wiki/stuffffdsfsf")
        gluejar_db.session.delete(obj)
        gluejar_db.commit_db()

    def test_integrity_constraint_wikipedia_link(self):
        gluejar_db = GluejarDB()
        wl = WikipediaLink()
        wl.gutenberg_etext_id = 13920
        wl.wikipedia_href = "/wiki/stuffffdsfsf"
        wl.wikipedia_title = "stuffffdsfsf"
        wl2 = WikipediaLink()
        wl2.gutenberg_etext_id = 13920
        wl2.wikipedia_href = "/wiki/stuffffdsfsf"
        wl2.wikipedia_title = "stuffffdsfsf2"
        # try to add links with the same value twice
        gluejar_db.session.add(wl)
        gluejar_db.session.add(wl2)
        self.assertRaises(Exception, gluejar_db.commit_db)
        gluejar_db.rollback()
        # delete the first item
        query = gluejar_db.session.query(WikipediaLink).filter(WikipediaLink.wikipedia_href == "/wiki/stuffffdsfsf")
        obj = query.first()
        self.assertEqual(obj.wikipedia_href, "/wiki/stuffffdsfsf")
        gluejar_db.session.delete(obj)
        gluejar_db.commit_db()

    def test_filtered_wikipedia_links(self):
        db = GluejarDB()
        for item in islice(db.filtered_wikipedia_links(), 100):
            print item.wikipedia_title, item.wikipedia_href
        self.assertTrue(True)

    def test_insert_1_fb_ol_link(self):
        db = GluejarDB()
        # in sqlalchemy...is there an equiv to Django get_one_or_new
        # /en/the_hunting_of_the_snark -> OL151447W for etext_id of 12
        (fb_item, created) = get_or_create(db.session, FreebaseEntity, id="/en/the_hunting_of_the_snark")
        (ol_item, created) = get_or_create(db.session, OpenLibraryWork, id="OL151447W")
        (mapping, created) = get_or_create(db.session, MappedWork, olid="OL151447W", freebase_id="/en/the_hunting_of_the_snark", gutenberg_etext_id=12)
        get_or_create(db.session, GutenbergIdMapped, id=12)
        db.commit_db()

    def test_mapping_error(self):
        db = GluejarDB()
        (error_item, created) = get_or_create(db.session, MappingError, message="testing")
        db.commit_db()

class ChainTest(unittest.TestCase):
    def test_chain(self):
        """
        Make sure that I (RY) understood that itertools.chain works by actually chaining together a series of iterators into 1
        """
        self.assertTrue(True)
        max = None
        sizes = [5, 8, 9]
        numbers = chain(*(xrange(size) for size in sizes))
        for (i, num) in enumerate(islice(numbers, max)):
            pass
        self.assertEqual(i + 1, sum(sizes))

class FreebaseTest(unittest.TestCase):
    def test_query(self):
        fb = FreebaseClient()
        resp = list(fb.wikipedia_href_to_freebase_id(['Peter_and_Wendy', 'King_Lear']))
        for r in resp:
            #print r
            #print r[0]['id'], r[0]['type']
            self.assertTrue('/book/book' in r[0]['type'])

    def test_query_and_db_insert(self):
        fb = FreebaseClient()
        db = GluejarDB()
        resp = list(fb.wikipedia_href_to_freebase_id(['Peter_and_Wendy', 'King_Lear', 'Hamlet']))
        for r in resp:
            print r
            print r[0]['id'], r[0]['type'], r[0]['key'][0]['value']
            self.assertTrue('/book/book' in r[0]['type'])
            fb_entity = FreebaseEntity()
            fb_entity.id = r[0]['id']
            try:
                db.session.add(fb_entity)
                db.commit_db()
            except IntegrityError, e:
                db.rollback()
                fb_entity = db.session.query(FreebaseEntity).filter(FreebaseEntity.id == r[0]['id']).one()
            fb_entity.wikipedia_href = '/wiki/%s' % (r[0]['key'][0]['value'])
            fb_entity.is_book_book = '/book/book' in r[0]['type']
            db.commit_db()
        # return True if no crashing
        self.assertTrue(True)

class RefineTest(unittest.TestCase):
    def setUp(self):
        from google.refine import refine
        self.refine_obj = refine.Refine(refine.RefineServer())

    def test_project_listing(self):
        # https://raw.github.com/PaulMakepeace/refine-client-py/master/refine.py
        projects = self.refine_obj.list_projects().items()

        def date_to_epoch(json_dt):
            "Convert a JSON date time into seconds-since-epoch."
            return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

        projects.sort(key=lambda v: date_to_epoch(v[1]['modified']), reverse=True)
        for project_id, project_info in projects:
            print('{0:>14}: {1}'.format(project_id, project_info['name']))
            id = int(project_id)  # check to see whether there will be a non-int

    def test_project_name(self):
        id = "1884515736058"
        print self.refine_obj.get_project_name(id)

    def test_columns(self):
        id = "1884515736058"
        proj = self.refine_obj.open_project(id)
        models = proj.get_models()
        cols = proj.columns
        pprint(models)
        print models.keys()
        print cols

    def test_iterate_rows(self):
        id = "1884515736058"
        proj = self.refine_obj.open_project(id)
        cols_to_extract = ['etext_id', 'title', 'name', 'fb_id', 'fb_id_judgement', 'wikipedia_title']
        response = proj.get_rows(limit=10)
        print "response.total: ", response.total
        for i, row in enumerate(islice(response.rows, 10)):
            print i, row.flagged, row.starred, row.index,
            print i, [row[c] for c in cols_to_extract]

class FreebaseToOpenLibraryMappingTest(unittest.TestCase):
    def setUp(self):
        pass

    def test_OpenLib_setup(self):
        pass


class ISBNSeedTest(unittest.TestCase):
    def test_isbnseed(self):
        gutenberg_ids = ['2701']
        for (g_id, val) in izip(gutenberg_ids, gutenberg_ol_fb_mappings(gutenberg_ids)):
            print g_id, val

def suite():
    testcases = []
    suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
    suites.addTest(ISBNSeedTest('test_isbnseed'))
    #suites.addTest(SettingsTest('test_dev_me_alignment'))  # give option to test this alignment
    return suites


if __name__ == '__main__':
    # walk through and parse the catalogs
    #walk_through_catalog(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog_texts.rdf', max=100)
    #walk_through_catalog(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog_files.rdf', max=1000)

    #load_texts_to_db(max=10)
    #load_files_to_db(max=None)
    #load_wikipedia_external_links_into_db(None)
    #map_wikipedia_links_to_freebase_ids(None, page_size=10)

    # in between: here we have to do some manual work in Google Refine
    #map_refine_fb_links_to_openlibrary_work_ids(max=None)
    #compute_ol_title_from_work_id(max=1000)

    #export_gutenberg_to_ol_mapping(fname="gutenberg_openlibrary.json")
    #import_gutenberg_json(fname="gutenberg_openlibrary.json")

    #print surfacing_seed_isbn()

    #unittest.main()

    #print list(gutenberg_and_seed_isbn(max=10))
    #print list(repick_seed_isbn(10))

    # output a filtered gutenberg list
    # 0.56 and 0.7 I got by eye-balling the results in Google Refine
    y = list(filtered_gutenberg_and_seed_isbn(min_l_ratio=0.56, min_dominance=0.7))
    export_to_json(y, fname="g_seed_isbn.json")

    #suites = suite()
    #suites = unittest.defaultTestLoader.loadTestsFromModule(__import__('__main__'))
    #unittest.TextTestRunner().run(suites)