Experimental scripts that try to match metadata from OAI-PMH feeds (The Online Books Page) to Google Books

pull/1/head
Ed Summers 2011-12-04 21:45:53 -05:00
parent 1d1a8e16ed
commit 30e6dc38cd
3 changed files with 121 additions and 0 deletions

83
experimental/matcher.py Executable file
View File

@ -0,0 +1,83 @@
#!/usr/bin/env python
import re
import json
import time
import urllib
import requests
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
def _write_log_line(log, value):
    """Write ``value`` followed by a newline to the file-like ``log``.

    Works on both Python 2 and Python 3: text-mode files want the native
    ``str`` type, so Python 3 ``bytes`` are decoded and Python 2 ``unicode``
    is encoded (both as UTF-8) before writing.
    """
    if isinstance(value, bytes) and not isinstance(value, str):
        # Python 3 bytes, e.g. a raw HTTP response body.
        value = value.decode('utf-8', 'replace')
    elif not isinstance(value, str):
        # Python 2 unicode.
        value = value.encode('utf-8')
    log.write(value + "\n")


def google_search(title, authors, no_matches):
    """Query the Google Books volumes API and look for a matching volume.

    The query string is the title followed by every author (passed through
    ``norm_author``), separated by spaces.

    Args:
        title: title of the work to search for.
        authors: list of author name strings.
        no_matches: writable file-like object that receives diagnostics for
            searches that did not produce a match.

    Returns:
        The ``id`` of the first returned item whose normalized title equals
        the normalized ``title`` and whose authors satisfy ``authors_equal``.
        Otherwise one of the status strings ``"missing totalItems"``,
        ``"no search results"`` or ``"no match"``.
    """
    # NOTE(review): a fixed X-Forwarded-For address is sent with every
    # request -- presumably for the API's per-user accounting; confirm.
    headers = {'X-Forwarded-For': '69.243.24.29'}
    # the title and author are intentionally not fielded
    query = ' '.join([title] + [norm_author(author) for author in authors])
    params = {
        'q': query,
        # SECURITY: this API key is committed in source. Consider loading it
        # from configuration/environment and rotating the exposed key.
        'key': 'AIzaSyBE36z7o6NUafIWcLEB8yk2I47-8_5y1_0',
    }
    r = requests.get('https://www.googleapis.com/books/v1/volumes',
                     params=params, headers=headers)
    results = json.loads(r.content)

    if 'totalItems' not in results:
        # Typically an error payload; keep it for later inspection.
        _write_log_line(no_matches, "missing totalItems for %s" % r.url)
        _write_log_line(no_matches, r.content)
        return "missing totalItems"
    if results['totalItems'] == 0:
        return "no search results"

    wanted_title = norm_title(title)
    # 'items' is read defensively: a response without it is treated as
    # containing nothing to compare instead of raising KeyError.
    for item in results.get('items', []):
        info = item.get('volumeInfo', {})
        g_title = info.get('title', '') + ' ' + info.get('subtitle', '')
        g_authors = info.get('authors', [])
        if norm_title(g_title) == wanted_title and \
                authors_equal(g_authors, authors):
            return item['id']

    # Nothing matched: record what was searched for and what came back.
    _write_log_line(no_matches, "%s\t%s\t%s" % (title, authors, r.url))
    _write_log_line(no_matches, r.content)
    _write_log_line(no_matches, "")
    return "no match"
def norm_title(t):
    """Reduce a title to a short key used for fuzzy title comparison.

    The title is lower-cased, anything before its first ASCII letter is
    dropped (so a leading quote or space cannot hide a leading article),
    one leading English article ("the", "a", "an") is removed, every
    remaining character that is not an ASCII letter is removed, and the
    result is truncated to 15 characters.

    Args:
        t: the title string.

    Returns:
        A string of at most 15 lower-case ASCII letters.
    """
    t = t.lower()
    # Skip leading whitespace/punctuation so the anchored article pattern
    # below sees the first real word ('"The Raven"' -> 'the raven"').
    t = re.sub(r'^[^a-z]+', '', t)
    t = re.sub(r'^(?:the|an|a) ', '', t)
    t = re.sub(r'[^a-z]', '', t)
    return t[:15]
def authors_equal(a, b):
    """Tell whether two author lists should be treated as the same.

    Two empty lists are equal; an empty list never equals a non-empty one.
    Otherwise the lists are equal when at least one name, after
    ``extra_norm_author`` normalization, appears in both.
    """
    size_a, size_b = len(a), len(b)
    if size_a == 0 or size_b == 0:
        # Equal only when both sides are empty.
        return size_a == size_b
    keys_a = set(extra_norm_author(name) for name in a)
    keys_b = set(extra_norm_author(name) for name in b)
    return not keys_a.isdisjoint(keys_b)
def extra_norm_author(a):
    """Return an aggressive comparison key for an author name.

    The name is first passed through ``norm_author``, then lower-cased and
    stripped of every character other than the letters a-z.
    """
    lowered = norm_author(a).lower()
    return re.sub('[^a-z]', '', lowered)
def norm_author(a):
    """Normalize a catalog-style author heading into a plain name.

    The heading is split on commas. If there is more than one part and the
    last part contains two consecutive digits (e.g. life dates such as
    "1835-1910") that part is dropped. The first part is then moved to the
    end ("Twain, Mark" -> "Mark Twain"), parenthesized text is removed, and
    every character other than ASCII letters, spaces and hyphens is deleted.
    Whitespace is collapsed last so that removed characters cannot leave
    double or leading/trailing spaces behind.

    Args:
        a: the author string.

    Returns:
        The normalized name.
    """
    parts = [p.strip() for p in a.split(',')]
    if len(parts) > 1 and re.search(r'\d\d', parts[-1]):
        parts.pop()
    # Rotate "Surname, Forenames" into "Forenames Surname".
    parts.append(parts.pop(0))
    name = ' '.join(parts)
    name = re.sub(r'\(.+?\)', '', name)
    name = re.sub(r'[^A-Za-z \-]', '', name)
    # Collapse after stripping: deletions above can create runs of spaces.
    name = re.sub(r' +', ' ', name).strip()
    return name
if __name__ == "__main__":
    # Script entry point: write matches to results.txt and diagnostics to
    # no-matches.log. The context managers make sure both files are flushed
    # and closed even if the run aborts with an exception.
    # NOTE(review): lookup(results, no_matches) is not defined in the visible
    # part of this script -- confirm where it comes from.
    with open("results.txt", "w") as results:
        with open("no-matches.log", "w") as no_matches:
            lookup(results, no_matches)

38
experimental/oai2json.py Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python
import re
import sys
import json
import urllib
import requests
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
def lookup(oai_url, set_name):
    """Harvest Dublin Core records from an OAI-PMH feed as JSON lines.

    Every record that carries metadata is written to standard output as one
    JSON object per line with the keys ``id``, ``title``, ``authors``,
    ``subjects`` and ``urls``. Records without metadata are skipped.

    Args:
        oai_url: base URL of the OAI-PMH endpoint.
        set_name: name of the set to harvest, or None for no set restriction.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(oai_url, registry)

    list_args = {'metadataPrefix': 'oai_dc'}
    if set_name is not None:
        # Only send a set argument when one was actually requested.
        list_args['set'] = set_name

    for header, metadata, _about in client.listRecords(**list_args):
        if not metadata:
            continue
        titles = metadata.getField('title')
        doc = {
            "id": header.identifier(),
            # A record without any title yields null instead of IndexError.
            "title": titles[0] if titles else None,
            "authors": metadata.getField('creator'),
            "subjects": metadata.getField('subject'),
            "urls": metadata.getField('identifier'),
        }
        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write(json.dumps(doc) + "\n")
if __name__ == "__main__":
    # Usage: oai2json.py OAI_URL [SET_NAME]
    if len(sys.argv) < 2:
        # Exit with a readable message rather than an IndexError traceback.
        sys.exit("usage: %s OAI_URL [SET_NAME]" % sys.argv[0])
    oai_url = sys.argv[1]
    # The set name is optional; None means no set restriction.
    set_name = sys.argv[2] if len(sys.argv) > 2 else None
    lookup(oai_url, set_name)

BIN
experimental/obp.json.gz Normal file

Binary file not shown.