# regluit/experimental/matcher.py

#!/usr/bin/env python

import fileinput
import json
import os
import re
import sys

import requests

# Prefer the API key from Django settings.  If the import or the attribute
# lookup fails for any reason, fall back to the GOOGLE_BOOKS_API_KEY
# environment variable (empty string when unset) so the script can still run
# on its own.
try:
    from django.conf import settings
    GOOGLE_BOOKS_API_KEY = settings.GOOGLE_BOOKS_API_KEY
except Exception:
    # `Exception` rather than a bare `except` so that SystemExit and
    # KeyboardInterrupt are not swallowed.
    GOOGLE_BOOKS_API_KEY = os.environ.get('GOOGLE_BOOKS_API_KEY', '')

def match():
    """Print a Google Books lookup result for every JSON record on input.

    Records are read one per line through ``fileinput`` (files named on the
    command line, or standard input when none are given).  Each non-blank
    line must be a JSON object with ``authors`` and ``title`` keys.  The value
    returned by ``google_search`` for the record is printed to standard
    output; that function's diagnostics are sent to standard error so the
    two streams do not mix.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``authors`` or ``title``.
    """
    for line in fileinput.input():
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        j = json.loads(line)
        authors = j['authors']
        title = j['title']
        # NOTE(review): this used to call `googlebooks_id`, which is not
        # defined in this module; `google_search` is the only lookup
        # function here -- confirm it is the intended one.
        print(google_search(title, authors, sys.stderr))

def google_search(title, authors, no_matches):
    """Search Google Books for a volume matching ``title`` and ``authors``.

    The query is the title followed by each author passed through
    ``norm_author``, all as one unfielded string.  A returned item counts as
    a match when its title plus subtitle agrees with ``title`` under
    ``norm_title`` and its authors agree with ``authors`` under
    ``authors_equal``.

    Args:
        title: title to search for.
        authors: iterable of author name strings.
        no_matches: target of ``print >>`` for diagnostics (the request URL
            and raw response body) when the response is malformed or no
            item matches.

    Returns:
        The ``id`` of the first matching item, or one of the status strings
        ``"missing totalItems"``, ``"no search results"`` or ``"no match"``.
    """
    headers = {'X-Forwarded-For': '69.243.24.29'}
    # the title and author are intentionally not fielded
    query = ' '.join([title] + [norm_author(author) for author in authors])
    params = {
        'q': query,
        'key': GOOGLE_BOOKS_API_KEY
    }
    r = requests.get('https://www.googleapis.com/books/v1/volumes',
            params=params, headers=headers)
    results = json.loads(r.content)
    if 'totalItems' not in results:
        print >> no_matches, "missing totalItems for %s" % r.url
        print >> no_matches, r.content
        return "missing totalItems"
    if results['totalItems'] == 0:
        return "no search results"
    # Use .get so a response that reports hits but carries no 'items' list
    # is logged as "no match" below instead of raising KeyError.
    for item in results.get('items', []):
        info = item['volumeInfo']
        g_title = info.get('title', '') + ' ' + info.get('subtitle', '')
        g_authors = info.get('authors', [])
        if norm_title(g_title) == norm_title(title) and \
                authors_equal(g_authors, authors):
            return item['id']

    # Nothing matched: record what was asked and what came back.
    msg = "%s\t%s\t%s" % (title, authors, r.url)
    print >> no_matches, msg.encode('utf-8')
    print >> no_matches, r.content
    print >> no_matches, ""
    return "no match"

def norm_title(t):
    """Reduce a title to a short key used for comparing titles.

    The title is lower-cased, a single leading "the ", "a " or "an " is
    dropped, every character that is not an ASCII letter is removed, and
    at most the first 15 remaining characters are returned.
    """
    key = t.lower()
    # Applied in order: strip the leading article, then drop non-letters.
    for pattern in ('^((the)|(a)|(an)) ', '[^A-Za-z]'):
        key = re.sub(pattern, '', key)
    return key[:15]

def authors_equal(a, b):
    """Tell whether two author lists share at least one author.

    Two empty lists are considered equal; an empty list never equals a
    non-empty one.  Otherwise every name is reduced with
    ``extra_norm_author`` and the lists match when the reduced names
    overlap.
    """
    count_a, count_b = len(a), len(b)
    if count_a == 0 or count_b == 0:
        # Both empty -> True, exactly one empty -> False.
        return count_a == count_b

    keys_a = set(extra_norm_author(name) for name in a)
    keys_b = set(extra_norm_author(name) for name in b)

    return not keys_a.isdisjoint(keys_b)

def extra_norm_author(a):
    """Return a compact comparison key for an author name.

    The name is first passed through ``norm_author``, then lower-cased,
    and finally everything except the letters a-z is removed (so spaces
    and hyphens are dropped as well).
    """
    return re.sub('[^a-z]', '', norm_author(a).lower())

def norm_author(a):
    """Normalize an author name, e.g. "Twain, Mark, 1835-1910" -> "Mark Twain".

    Steps:
      * split on commas and trim each part;
      * when there are several parts and the last one contains two
        consecutive digits (such as a date range), drop it;
      * move the first part to the end, so "Last, First" becomes
        "First Last";
      * remove parenthesized text;
      * remove every character other than ASCII letters, spaces and
        hyphens;
      * collapse runs of spaces and trim the ends.

    Args:
        a: author name string.

    Returns:
        The normalized name; may be empty if nothing survives.
    """
    parts = [p.strip() for p in a.split(',')]
    if len(parts) > 1 and re.search(r'\d\d', parts[-1]):
        parts.pop()
    # str.split always yields at least one part, so pop(0) cannot fail.
    parts.append(parts.pop(0))
    a = ' '.join(parts)
    a = re.sub(r'\(.+?\)', '', a)
    a = re.sub(r'[^A-Za-z \-]', '', a)
    # Collapse spaces only after the removals above, since deleting a
    # character that sat between two spaces creates a new run of spaces.
    a = re.sub(r'  +', ' ', a).strip()
    return a

if __name__ == "__main__":
    # Run the line-by-line matcher.  The previously referenced
    # googlebooks_ids() is not defined in this module; match() is the
    # module's only zero-argument driver.
    match()