experimental scripts to try to match metadata in oai-pmh feeds (online books page) to googlebooks
parent
1d1a8e16ed
commit
30e6dc38cd
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
from oaipmh.client import Client
|
||||
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
|
||||
|
||||
def google_search(title, authors, no_matches):
    """Look a book up in the Google Books volumes API and return its id.

    The query string is the title followed by every author name after it
    has been passed through norm_author(); neither part is restricted to a
    particular search field.

    title      -- title string to search for
    authors    -- iterable of author name strings (may be empty)
    no_matches -- writable file-like object that receives diagnostics for
                  responses lacking totalItems and for searches whose
                  results contain no acceptable match

    Returns the ``id`` of the first result whose normalized title equals
    the normalized ``title`` and whose authors overlap ``authors``.
    Otherwise returns one of the status strings "missing totalItems",
    "no search results" or "no match".
    """
    # NOTE(review): the forwarded-for address and the API key below are
    # hard-coded; the key is a credential and should be moved out of the
    # source (environment variable or config file) and rotated.
    headers = {'X-Forwarded-For': '69.243.24.29'}
    # the title and author are intentionally not fielded
    params = {
        'q': title,
        'key': 'AIzaSyBE36z7o6NUafIWcLEB8yk2I47-8_5y1_0'
    }
    for author in authors:
        params['q'] += ' ' + norm_author(author)
    r = requests.get('https://www.googleapis.com/books/v1/volumes',
                     params=params, headers=headers)
    results = json.loads(r.content)

    if 'totalItems' not in results:
        # not a normal search response; keep the raw body for inspection
        print >> no_matches, "missing totalItems for %s" % r.url
        print >> no_matches, r.content
        return "missing totalItems"
    if results['totalItems'] == 0:
        return "no search results"

    # Guard against a response that reports hits but carries no "items"
    # list; it then falls through to the "no match" logging below.
    for item in results.get('items', []):
        info = item['volumeInfo']
        g_title = info.get('title', '') + ' ' + info.get('subtitle', '')
        g_authors = info.get('authors', [])
        if norm_title(g_title) == norm_title(title) and \
                authors_equal(g_authors, authors):
            return item['id']

    # nothing acceptable: record the query and the full response
    msg = "%s\t%s\t%s" % (title, authors, r.url)
    print >> no_matches, msg.encode('utf-8')
    print >> no_matches, r.content
    print >> no_matches, ""
    return "no match"
|
||||
|
||||
def norm_title(t):
    """Reduce a title to a short comparison key.

    The title is lower-cased, one leading article ("the", "a" or "an"
    followed by a space) is dropped, every character that is not an ASCII
    letter is removed, and the first 15 remaining characters are returned.
    """
    without_article = re.sub('^((the)|(a)|(an)) ', '', t.lower())
    letters_only = re.sub('[^A-Za-z]', '', without_article)
    return letters_only[:15]
|
||||
|
||||
def authors_equal(a, b):
    """Report whether two author lists share at least one author.

    Two empty lists are considered equal; an empty list never equals a
    non-empty one.  Otherwise every name on both sides is run through
    extra_norm_author() and the lists match when the normalized sets
    intersect.

    NOTE(review): a name that normalizes to the empty string (no ASCII
    letters survive) will match any other such name on the opposite side.
    """
    sizes = (len(a), len(b))
    if sizes == (0, 0):
        return True
    if 0 in sizes:
        return False

    left = set(extra_norm_author(name) for name in a)
    right = set(extra_norm_author(name) for name in b)
    return bool(left & right)
|
||||
|
||||
def extra_norm_author(a):
    """Return an aggressive comparison key for an author name.

    Applies norm_author(), lower-cases the result and then drops every
    character that is not a lowercase ASCII letter (spaces and hyphens
    included).
    """
    return re.sub('[^a-z]', '', norm_author(a).lower())
|
||||
|
||||
def norm_author(a):
    """Normalize a catalog-style author heading to plain "First Last" text.

    Steps:
      * split on commas and trim each part;
      * when there is more than one part and the last part contains two
        consecutive digits (e.g. life dates), drop that last part;
      * move the first part (the surname in "Last, First" headings) to
        the end;
      * remove parenthesized text;
      * remove every character other than ASCII letters, spaces and
        hyphens;
      * collapse runs of spaces and trim the ends.

    Whitespace is tidied after the character filtering so that removed
    punctuation or parentheticals cannot leave double or trailing spaces.
    """
    parts = [p.strip() for p in a.split(',')]
    if len(parts) > 1 and re.search(r'\d\d', parts[-1]):
        parts.pop()
    # "Last, First" -> "First Last"; a no-op for a single part
    parts.append(parts.pop(0))

    a = ' '.join(parts)
    a = re.sub(r'\(.+?\)', '', a)
    a = re.sub(r'[^A-Za-z \-]', '', a)
    # only spaces remain as whitespace here, so split() collapses and trims
    return ' '.join(a.split())
|
||||
|
||||
if __name__ == "__main__":
    # NOTE(review): ``lookup`` is neither defined nor imported in the
    # visible part of this file -- confirm where it is meant to come from.
    # The context manager guarantees both output files are flushed and
    # closed even if the lookup raises.
    with open("results.txt", "w") as results, \
            open("no-matches.log", "w") as no_matches:
        lookup(results, no_matches)
|
|
@ -0,0 +1,38 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
from oaipmh.client import Client
|
||||
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
|
||||
|
||||
def lookup(oai_url, set_name):
    """Harvest oai_dc records from an OAI-PMH feed and print each as JSON.

    oai_url  -- base URL of the OAI-PMH repository
    set_name -- name of the set to harvest, or None to send no set argument

    One JSON object per record is written to standard output with the keys
    "id" (header identifier), "title" (first title field, or null when the
    record has none), "authors" (creator fields), "subjects" (subject
    fields) and "urls" (identifier fields).  Entries without a metadata
    record are skipped.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(oai_url, registry)

    # Only pass "set" when one was requested, so that a missing set name
    # is not forwarded to the request as a literal None value.
    list_args = {'metadataPrefix': 'oai_dc'}
    if set_name is not None:
        list_args['set'] = set_name

    for header, record, _ in client.listRecords(**list_args):
        if not record:
            # no metadata for this entry (presumably a deleted record)
            continue
        titles = record.getField('title')
        entry = {
            "id": header.identifier(),
            # a record without any title must not abort the whole harvest
            "title": titles[0] if titles else None,
            "authors": record.getField('creator'),
            "subjects": record.getField('subject'),
            "urls": record.getField('identifier'),
        }
        print(json.dumps(entry))
|
||||
|
||||
if __name__ == "__main__":
    # Command line: OAI_URL is required, SET_NAME is optional.
    if len(sys.argv) < 2:
        # exit with a usage message instead of an IndexError traceback
        sys.exit("usage: %s OAI_URL [SET_NAME]" % sys.argv[0])
    oai_url = sys.argv[1]
    set_name = sys.argv[2] if len(sys.argv) > 2 else None
    lookup(oai_url, set_name)
|
Binary file not shown.
Loading…
Reference in New Issue