Some code for OpenLibrary, Freebase, HathiTrust to explore the nature of the data available in those sources

2011-11-06 07:55:07 -05:00 · 2011-11-06 07:55:07 -05:00 · 68b4da17d1
parent 705876dd8b
commit 68b4da17d1
4 changed files with 289 additions and 0 deletions
--- a/experimental/init.py
+++ b/experimental/init.py
--- a/experimental/bookdata.py
+++ b/experimental/bookdata.py
@ -0,0 +1,284 @@
+import requests
+import urllib
+import httplib
+from regluit.experimental.zotero_books import Zotero2, MyZotero
+import json
+from pprint import pprint
+from itertools import islice
+import re
+
+import freebase
+
+import logging
+logger = logging.getLogger(__name__)
+
+MASHUPBOOK_ISBN_13 = '9781590598580'
+MASHUPBOOK_ISBN_10 = '159059858X'
+RY_OLID = 'OL4264806A'
+
+SURFACING_WORK_OLID = 'OL675829W'
+SURFACING_EDITION_OLID = 'OL8075248M'
+
+class OpenLibraryException(Exception):
+    pass
+
+class HathiException(Exception):
+    pass
+
+def filter_none(d):
+    d2 = {}
+    for (k,v) in d.iteritems():
+        if v is not None:
+            d2[k] = v
+    return d2
+
+def to_list(s):
+    """if input is not a list return a list with s"""
+    if not isinstance(s,list):
+        return [s]
+    else:
+        return s
+
+def hathi_bib(id, id_type='isbn', detail_level='brief'):
+    url = "http://catalog.hathitrust.org/api/volumes/brief/%s/%s.json"  % (id_type, id)
+    r = requests.get(url)
+    if r.status_code == httplib.OK:
+        return r.content
+    else:
+        raise Exception("Hathi Bib API response: %s " % (httplib.responses[r.status_code]) )
+        
+# http://openlibrary.org/developers/api
+
+class OpenLibrary(object):
+    @classmethod
+    def books(cls, id, id_type="isbn", format="json", callback=None, jscmd=None):
+        # http://openlibrary.org/dev/docs/api/books
+        # bibkeys: one of isbn, oclc, lccn, olid
+        # format: one of json, javascript
+        # jscmd: one of viewapi, data, details (deprecated in favor of data)
+        base_url = "http://openlibrary.org/api/books"
+        params = filter_none({'bibkeys':'%s:%s' % (id_type,id),
+                              'format':format,
+                              'callback':callback,
+                              'jscmd':jscmd})
+        
+        r = requests.get(base_url,params=params)
+        if r.status_code == httplib.OK:
+            return json.loads(r.content)
+        else:
+            raise OpenLibraryException("OpenLibrary API response: %s " % (httplib.responses[r.status_code]) )
+            
+    @classmethod
+    def covers(cls, id, id_type='isbn', size='S'):
+        # http://openlibrary.org/dev/docs/api/covers
+        # id_type: one of 'id' (internal cover ID), olid (Open Library ID), isbn, oclc, lccn, goodreads, librarything
+        # size:  one of s, m, l
+        # http://covers.openlibrary.org/b/$key/$value-$size.jpg
+
+        if id_type.lower() not in ['id','isbn','oclc','lccn','goodreads','librarything']:
+            raise OpenLibraryException("%s is an incorrect id_type for covers" % (id_type))
+        if size.upper() not in ['S','M','L']:
+            raise OpenLibraryException("%s is an incorrect size for covers" % (size))
+    
+        return "http://covers.openlibrary.org/b/%s/%s-%s.jpg" % (id_type.lower(),id,size.upper())
+    
+    @classmethod
+    def author_photos(cls, olid, size='S'):
+        #http://covers.openlibrary.org/a/$key/$value-$size.jpg
+        if size.upper() not in ['S','M','L']:
+            raise OpenLibraryException("%s is an incorrect size for author" % (size))
+        return "http://covers.openlibrary.org/a/olid/%s-%s.jpg" % (olid,size.upper())
+        
+    @classmethod
+    def read(cls, queries):
+        # http://openlibrary.org/dev/docs/api/read -- most of its value to be used in browser JS?
+        # can do single or multiple requests
+        # example of a single request:
+        # http://openlibrary.org/api/volumes/brief/isbn/0596156715.json
+        
+        # multiple
+        # http://openlibrary.org/api/volumes/brief/json/id:1;lccn:50006784|olid:OL6179000M;lccn:55011330
+        # the multiple format works, I think, for single requests
+        #url = "http://openlibrary.org/api/volumes/brief/%s/%s.json" % (id_type, id)
+        
+        query_string = "|".join([ ";".join([ "%s:%s" % (id_type,id) for (id, id_type) in to_list(query)])
+                         for query in to_list(queries)])
+        if query_string:
+            url = "http://openlibrary.org/api/volumes/brief/json/%s" % (query_string)
+            r = requests.get(url)  
+            if r.status_code == httplib.OK:
+                return json.loads(r.content)
+            else:
+                raise OpenLibraryException("OpenLibrary API response: %s " % (httplib.responses[r.status_code]) )
+        else:
+            return None
+    
+    @classmethod
+    def lists(cls):
+        # http://openlibrary.org/dev/docs/api/lists
+        raise NotImplementedError
+
+    @classmethod
+    def query_iter(cls,**kwargs):
+        # use limit for page size and offset as the starting point
+        kwargs2 = kwargs.copy()
+        kwargs2.setdefault('offset', 0)
+        
+        more_items = True
+        while more_items:
+            items = cls.query(**kwargs2)
+            for item in items:
+                yield item
+            if len(items):
+                kwargs2['offset'] += len(items)
+            else:
+                more_items = False
+
+    @classmethod
+    def query(cls, **kwargs):
+        # limit and offset are special
+        base_url = "http://openlibrary.org/query.json"
+        r = requests.get(base_url, params=kwargs)
+        if r.status_code == httplib.OK:
+            return json.loads(r.content)
+        else:
+            raise OpenLibraryException("OpenLibrary API response: %s " % (httplib.responses[r.status_code]) )
+
+    @classmethod
+    def editions(cls, work_olid):
+        # http://openlibrary.org/query.json?type=/type/edition&works=/works/OL675829W&limit=5000
+        for item in cls.query_iter(type='/type/edition',works='/works/%s'%(work_olid), limit=10):
+            try:
+                yield re.match(r'^/books/(.*)$',item['key']).group(1)
+            except Exception, e:
+                raise OpenLibraryException("problem in editions: %s " % e)
+                
+    @classmethod
+    def works(cls, id, id_type='isbn'):
+        # http://openlibrary.org/api/volumes/brief/olid/OL8075248M.json
+        # can there be more than 1 work for a given edition?
+        # will return a list of works
+        # there is a bug in OL in which we have workless editions
+        response = cls.read((id,id_type))
+        # resp['olid:OL8075248M']['records']['/books/OL8075248M']["details"]["details"]["works"][0]["key"]
+        # resp.values()[0]['records'].values()[0]["details"]["details"]["works"][0]["key"]
+        try:
+            works_key = response.values()[0]['records'].values()[0]["details"]["details"]["works"]
+            if (len(response.values()) == 1 and len(response.values()[0]['records'].values()) == 1):
+                return [re.match(r'^/works/(.*)$',work_key["key"]).group(1) for work_key in works_key]
+            else:
+                raise OpenLibraryException("Assumption of 1 key in response invalid in OpenLibrary.works")
+        except Exception, e:
+            raise OpenLibraryException("problem in works: %s " % e)
+
+def parse_project_gutenberg_catalog(fname='/Users/raymondyee/D/Document/Gluejar/gutenberg/catalog.rdf'):
+    #URL = http://www.gutenberg.org/feeds/catalog.rdf.zip
+    import re
+    
+    def text(node):
+        node.normalize()
+        return node.childNodes[0].data
+        
+    RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+    DC_NS = 'http://purl.org/dc/elements/1.1/'
+    
+    from xml.dom.pulldom import START_ELEMENT, parse
+    doc = parse(fname)
+    for event, node in doc:
+        if event == START_ELEMENT and node.localName == "etext":
+            doc.expandNode(node)
+            id = node.getAttributeNS(RDF_NS,'ID')
+            try:
+                title = text(node.getElementsByTagNameNS(DC_NS,'title')[0])
+                title = title.replace("\n"," ").replace("\r"," ")
+            except:
+                title = None
+            yield id, title
+    
+# what books in Freebase have Wikipedia pages?
+
+class FreebaseBooks(object):
+    def __init__(self, username=None, password=None, main_or_sandbox='main'):
+        if main_or_sandbox == 'main':
+            self.freebase = freebase
+        else:
+            self.freebase = freebase.sandbox
+        if username is not None and password is not None:
+            self.freebase.login(username,password)
+    def books(self):
+        MQL = u"""[{
+  "type": "/book/book",
+  "id":   null,
+  "key": [{
+    "namespace": "/wikipedia/en",
+    "value":     null,
+    "type":      "/type/key"
+  }]
+}]
+""".replace("\n"," ")
+        query = json.loads(MQL)
+        resp = self.freebase.mqlreaditer(query)
+        for r in resp:
+            yield r
+
+    def book_editions(self):
+        MQL = u"""[{
+  "type":        "/book/book_edition",
+  "id":          null,
+  "ISBN":        [{}],
+  "LCCN":        [{}],
+  "OCLC_number": [{}],
+  "book": {
+    "id":   null,
+    "name": null
+  }
+}]""".replace("\n"," ")
+        query = json.loads(MQL)
+        resp = self.freebase.mqlreaditer(query)
+        for r in resp:
+            yield r
+
+def look_up_my_zotero_books_in_hathi():
+    zot = MyZotero()
+    for (i,b) in enumerate(zot.get_books(20)):
+        try:
+            print b, hathi_bib(b['isbn'])
+        except Exception, e:
+            print e
+
+def ol_practice():
+    print OpenLibrary.books(MASHUPBOOK_ISBN_10)
+    pprint (OpenLibrary.books(MASHUPBOOK_ISBN_13, jscmd='data'))
+    print OpenLibrary.books('0385472579', jscmd='data')
+    print (OpenLibrary.covers(MASHUPBOOK_ISBN_10, size='M'))
+    print (OpenLibrary.author_photos(RY_OLID,'S'))
+    # can we status of a pd book oclc:03544699 The Log of a Cowboy - Andy Adams, 1903
+    # http://openlibrary.org/books/OL7173600M/The_log_of_a_cowboy -- not working?
+    print OpenLibrary.books('OL7173600M' 'olid', jscmd='data')
+    # http://openlibrary.org/books/OL6542070M/The_Montessori_method works
+    print OpenLibrary.books('1181252','oclc',jscmd='data')
+    print OpenLibrary.read([(MASHUPBOOK_ISBN_10,'isbn'),('1181252','oclc')])
+    # let's bring up the editions for Surfacing
+    
+    for (i,ed) in enumerate(islice(OpenLibrary.editions(SURFACING_WORK_OLID),100)):
+        print i, ed
+        
+    # let's get the Work ID for one of the editions
+    pprint(OpenLibrary.works(SURFACING_EDITION_OLID,id_type='olid'))
+
+def freebase_test():
+    
+    fb = FreebaseBooks()
+    for (i,book) in enumerate(islice(fb.books(),200)):
+        print i, book
+    for (i,edition) in enumerate(islice(fb.book_editions(),200)):
+        print i, edition
+
+if __name__ == '__main__':
+    #look_up_my_zotero_books_in_hathi()
+    #ol_practice()
+    #print len(list(islice(parse_project_gutenberg_catalog(),100000)))
+    freebase_test()
+    
+    
+
--- a/requirements.pip
+++ b/requirements.pip
@ -15,3 +15,4 @@ redis
 oauth2
 mechanize
 pyzotero
+freebase
--- a/settings/dev.py
+++ b/settings/dev.py
@ -87,3 +87,7 @@ INSTALLED_APPS += ("djkombu",)
 # Goodreads API
 GOODREADS_API_KEY = ''
 GOODREADS_API_SECRET = ''
+
+# Freebase credentials
+FREEBASE_USERNAME = ''
+FREEBASE_PASSWORD = ''