Merge pull request #1 from NYPL/SFR-479-add-api

SFR 479 Add API
add-new-regnum-params
Mike Benowitz 2019-07-05 14:04:43 -04:00 committed by GitHub
commit 4afecd043b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 919 additions and 5 deletions

View File

@ -9,6 +9,8 @@ The two projects that this data is drawn from are hosted on GitHub, more informa
- [Catalog of Copyright Entries](https://github.com/NYPL/catalog_of_copyright_entries_project)
- [Catalog of Copyright Entries Renewals](https://github.com/NYPL/cce-renewals)
This repository also includes an API for querying the created database using ElasticSearch as a search layer and `Flask` as the application framework. See below for more detail on running and using the search API.
## Database
This script generates and updates a Postgresql database and ElasticSearch search layer. It scans the above repository for updated source files and inserts/updates the appropriate records, ensuring that the search layer is kept up to date following these changes.
@ -61,4 +63,43 @@ Several command-line arguments control the options for the execution of this scr
- `--REINITIALIZE` Will drop the existing database and rebuild it from scratch. WARNING THIS WILL DELETE ALL CURRENT DATA
- `-t` or `--time` The time ago in seconds to check for updated records in the GitHub repositories. This allows for only updating changed records
- `-y` or `--year` A specific year to load from either the entries or renewals
- `-x` or `--exclude` Set to exclude either the entries (with `cce`) or the renewals (with `ccr`) from the current execution. Useful when used in conjunction with the `year` parameter to control what records are updated.
- `-x` or `--exclude` Set to exclude either the entries (with `cce`) or the renewals (with `ccr`) from the current execution. Useful when used in conjunction with the `year` parameter to control what records are updated.
## API
This is a basic API that allows for a limited set of queries to be executed against the database. It allows for lookups by fulltext search, registration/renewal numbers and internal UUID numbers (to retrieve specific records). The returned objects show relationships between registrations and renewals and can optionally return the source data from which each record was created.
### Running the API
To start the API, ensure that you've updated your `virtualenv` with the most recent version of the requirements file and run the following commands:
1) `export FLASK_ENV=development`
2) `export FLASK_APP=api/app.py`
3) `python -m flask run`
This will start the Flask application at `localhost:5000`. Accessing that page will redirect you to a SwaggerDocs page that describes the available endpoints, their parameters, and their response objects, and gives users a chance to experiment with the endpoints.
### Using the API
The API provides 5 endpoints for retrieving registration and renewal records. These are split between `Search` and `Lookup` endpoints.
#### Search
The search endpoints return zero or more objects depending on what search terms are used. All three search endpoints share 3 query parameters:
- `page`: The page of results to return. Defaults to 0
- `per_page`: The number of results to return per page. Defaults to 10
- `source`: A flag to set the return of the source XML/CSV data. Defaults to `false`
The individual endpoints are:
- `/search/fulltext?query=<query string>`: A full text query
- `/search/registration/<regnum>`: A search for a specific registration number
- `/search/renewal/<rennum>`: A search for a specific renewal number
#### Lookup
The lookup endpoints return data for a specific Registration or Renewal record. These do not accept additional parameters, but always return the `source` data for the record. These endpoints use the internally generated `UUID` numbers to ensure a globally unique lookup value:
- `/registration/<uuid>`: Returns a single Registration record
- `/renewal/<uuid>`: Returns a single Renewal record

41
api/app.py Normal file
View File

@ -0,0 +1,41 @@
import os
import yaml
from flask import Flask, jsonify
from flasgger import Swagger
from api.prints.swagger.swag import SwaggerDoc
from api.db import db
from api.elastic import elastic
from api.prints import base, search, uuid
def loadConfig():
    """Export every setting found in ``config.yaml`` as an environment variable.

    The file is read as a mapping of section names to mappings of setting
    names to values. Section names themselves are not exported; only the
    key/value pairs inside each section are written to ``os.environ``.

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
    """
    with open('config.yaml', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)

    # An empty file (or an empty section) is parsed as None rather than {}.
    for sectionDict in (config or {}).values():
        for key, value in (sectionDict or {}).items():
            # os.environ only accepts strings, while YAML parses scalars such
            # as port numbers into int; convert so those do not raise TypeError.
            os.environ[str(key)] = str(value)
def create_app():
    """Application factory for the search API.

    Loads the YAML configuration into the environment, registers the base,
    search and uuid blueprints, derives the database and ElasticSearch
    connection strings from environment variables, initialises both
    extensions and attaches the Swagger documentation.

    Returns:
        The configured Flask application.
    """
    loadConfig()
    flaskApp = Flask(__name__)

    for blueprint in (base.bp, search.search, uuid.uuid):
        flaskApp.register_blueprint(blueprint)

    dbSettings = [
        os.environ[name]
        for name in ('DB_USER', 'DB_PSWD', 'DB_HOST', 'DB_PORT', 'DB_NAME')
    ]
    esSettings = [os.environ[name] for name in ('ES_HOST', 'ES_PORT')]

    flaskApp.config.update({
        'SQLALCHEMY_DATABASE_URI':
            'postgresql://{}:{}@{}:{}/{}'.format(*dbSettings),
        'ELASTICSEARCH_INDEX_URI': '{}:{}'.format(*esSettings),
        'SWAGGER': {'title': 'CCE Search'},
    })

    db.init_app(flaskApp)
    elastic.init_app(flaskApp)

    # Swagger registers itself on the app; the instance is not needed here.
    Swagger(flaskApp, template=SwaggerDoc().getDocs())

    return flaskApp

27
api/db.py Normal file
View File

@ -0,0 +1,27 @@
from flask_sqlalchemy import SQLAlchemy
from model.cce import CCE
from model.registration import Registration
from model.renewal import Renewal, RENEWAL_REG
from model.volume import Volume
db = SQLAlchemy()
class QueryManager():
    """Runs the UUID-based lookups used by the API against a database session."""

    def __init__(self, session):
        """Store the session that every query is issued through."""
        self.session = session

    def registrationQuery(self, uuid):
        """Return the single CCE entry with this UUID.

        Registrations and their renewals are outer-joined to the entry.
        ``.one()`` raises if the query does not yield exactly one entry.
        """
        entries = self.session.query(CCE)
        joined = entries.outerjoin(Registration, RENEWAL_REG, Renewal)
        return joined.filter(CCE.uuid == uuid).one()

    def renewalQuery(self, uuid):
        """Return the single Renewal with this UUID.

        Registrations and their CCE entries are outer-joined to the renewal.
        ``.one()`` raises if the query does not yield exactly one renewal.
        """
        renewals = self.session.query(Renewal)
        joined = renewals.outerjoin(RENEWAL_REG, Registration, CCE)
        return joined.filter(Renewal.uuid == uuid).one()

    def orphanRenewalQuery(self, uuid):
        """Return the single Renewal with this UUID without joining anything."""
        matching = self.session.query(Renewal).filter(Renewal.uuid == uuid)
        return matching.one()

40
api/elastic.py Normal file
View File

@ -0,0 +1,40 @@
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
class Elastic():
    """Wrapper around the ElasticSearch client used by the search endpoints.

    The client is created lazily through ``init_app`` so that a module-level
    instance can exist before the Flask application is configured.
    """

    def __init__(self):
        # Set by init_app(); None until the application has been configured.
        self.client = None

    def init_app(self, app):
        """Create the client from the app's ``ELASTICSEARCH_INDEX_URI`` setting."""
        self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])

    def create_search(self, index):
        """Return a ``Search`` object bound to the client and the given index."""
        return Search(using=self.client, index=index)

    def query_regnum(self, regnum, page=0, perPage=10):
        """Run a nested term query on ``registrations.regnum`` in the ``cce`` index.

        Returns the executed response for the requested page of results.
        """
        startPos, endPos = Elastic.getFromSize(page, perPage)
        search = self.create_search('cce')
        nestedQ = Q('term', registrations__regnum=regnum)
        nestedSearch = search.query(
            'nested', path='registrations', query=nestedQ
        )[startPos:endPos]
        return nestedSearch.execute()

    def query_rennum(self, rennum, page=0, perPage=10):
        """Run a term query on ``rennum`` in the ``ccr`` index.

        Returns the executed response for the requested page of results.
        """
        startPos, endPos = Elastic.getFromSize(page, perPage)
        search = self.create_search('ccr')
        renewalSearch = search.query('term', rennum=rennum)[startPos:endPos]
        return renewalSearch.execute()

    def query_fulltext(self, queryText, page=0, perPage=10):
        """Run a ``query_string`` query across both the ``cce`` and ``ccr`` indexes.

        Returns the executed response for the requested page of results.
        """
        startPos, endPos = Elastic.getFromSize(page, perPage)
        search = self.create_search('cce,ccr')
        textSearch = search.query(
            'query_string', query=queryText
        )[startPos:endPos]
        return textSearch.execute()

    @staticmethod
    def getFromSize(page, perPage):
        """Translate a zero-based page number into slice bounds.

        Negative values are clamped to zero so the resulting bounds are never
        negative.

        Returns:
            A ``(startPos, endPos)`` tuple where ``endPos - startPos`` equals
            the (clamped) page size.
        """
        page = max(page, 0)
        perPage = max(perPage, 0)
        startPos = page * perPage
        endPos = startPos + perPage
        return startPos, endPos
elastic = Elastic()

13
api/prints/base.py Normal file
View File

@ -0,0 +1,13 @@
from flask import (
Blueprint, request, session, url_for, redirect, current_app, jsonify
)
from model.cce import CCE
bp = Blueprint('base', __name__, url_prefix='/')
APP_VERSION = 'v0.1'
@bp.route('/')
def query():
    """Send requests for the site root to the ``flasgger.apidocs`` endpoint."""
    docsUrl = url_for('flasgger.apidocs')
    return redirect(docsUrl)

110
api/prints/search.py Normal file
View File

@ -0,0 +1,110 @@
from flask import (
Blueprint, request, session, url_for, redirect, current_app, jsonify
)
from sqlalchemy.orm.exc import NoResultFound
from api.db import db, QueryManager
from api.elastic import elastic
from api.response import MultiResponse
search = Blueprint('search', __name__, url_prefix='/search')
@search.route('/fulltext', methods=['GET'])
def fullTextQuery():
    """Full-text search across registration (``cce``) and renewal records.

    Query parameters:
        query: text passed to the ElasticSearch full-text query
            (defaults to an empty string).
        source: ``true`` (or ``1``) to include source data in each result.
        page / per_page: paging controls, parsed by
            ``MultiResponse.parsePaging``.

    Each search hit is re-loaded from the database by UUID and serialised
    into the response. Returns the JSON body produced by ``MultiResponse``.
    """
    queryText = request.args.get('query', '')
    # Query-string values arrive as text, so the literal "false" would be
    # truthy if used as-is; only an explicit true-like value enables sources.
    sourceReturn = request.args.get('source', '').lower() in ('true', '1')
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_fulltext(
        queryText, page=page, perPage=perPage
    )
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        queryText,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
            continue

        # Only the lookup can legitimately miss; fall back to the un-joined
        # renewal query, then serialise once for both outcomes.
        try:
            dbRenewal = qManager.renewalQuery(entry.uuid)
        except NoResultFound:
            dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
        textResponse.addResult(MultiResponse.parseRenewal(
            dbRenewal,
            source=sourceReturn
        ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))
@search.route('/registration/<regnum>', methods=['GET'])
def regQuery(regnum):
    """Search for entries carrying the given registration number.

    Query parameters:
        source: ``true`` (or ``1``) to include source data in each result.
        page / per_page: paging controls, parsed by
            ``MultiResponse.parsePaging``.

    Each search hit is re-loaded from the database by UUID and serialised
    as a registration entry. Returns the JSON body produced by
    ``MultiResponse``.
    """
    page, perPage = MultiResponse.parsePaging(request.args)
    # Query-string values arrive as text, so the literal "false" would be
    # truthy if used as-is; only an explicit true-like value enables sources.
    sourceReturn = request.args.get('source', '').lower() in ('true', '1')

    matchingDocs = elastic.query_regnum(regnum, page=page, perPage=perPage)
    regResponse = MultiResponse(
        'number',
        matchingDocs.hits.total,
        request.base_url,
        regnum,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        dbEntry = qManager.registrationQuery(entry.uuid)
        regResponse.addResult(MultiResponse.parseEntry(
            dbEntry,
            xml=sourceReturn
        ))

    regResponse.createDataBlock()
    return jsonify(regResponse.createResponse(200))
@search.route('/renewal/<rennum>', methods=['GET'])
def renQuery(rennum):
    """Search for renewals carrying the given renewal number.

    Query parameters:
        source: ``true`` (or ``1``) to include source data in each result.
        page / per_page: paging controls, parsed by
            ``MultiResponse.parsePaging``.

    Each search hit is re-loaded from the database by UUID and expanded by
    ``parseRetRenewal`` into one or more result objects. Returns the JSON
    body produced by ``MultiResponse``.
    """
    page, perPage = MultiResponse.parsePaging(request.args)
    # Query-string values arrive as text, so the literal "false" would be
    # truthy if used as-is; only an explicit true-like value enables sources.
    sourceReturn = request.args.get('source', '').lower() in ('true', '1')

    matchingDocs = elastic.query_rennum(rennum, page=page, perPage=perPage)
    renResponse = MultiResponse(
        'number',
        matchingDocs.hits.total,
        request.base_url,
        rennum,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        dbRenewal = qManager.renewalQuery(entry.uuid)
        renResponse.extendResults(parseRetRenewal(
            dbRenewal,
            source=sourceReturn
        ))

    renResponse.createDataBlock()
    return jsonify(renResponse.createResponse(200))
def parseRetRenewal(dbRenewal, source=False):
    """Serialise a renewal as the list of records it should be returned as.

    A renewal with no linked registrations is returned as a single renewal
    object; otherwise one registration entry is returned per linked
    registration.

    Args:
        dbRenewal: renewal record exposing a ``registrations`` collection.
        source: when true, include source data in the serialised records.
            The renewal search endpoint calls this function with
            ``source=...``, so the keyword must be accepted here.

    Returns:
        A list of serialised renewal or registration dicts.
    """
    if not dbRenewal.registrations:
        return [MultiResponse.parseRenewal(dbRenewal, source=source)]

    return [
        MultiResponse.parseEntry(reg.cce, xml=source)
        for reg in dbRenewal.registrations
    ]

411
api/prints/swagger/swag.py Normal file
View File

@ -0,0 +1,411 @@
class SwaggerDoc():
    """Builds the Swagger 2.0 template describing the search and lookup API."""

    def __init__(self):
        pass

    @staticmethod
    def _pathParam(name, description):
        """Describe a required string parameter taken from the URL path."""
        # Swagger 2.0 only permits "schema" on body parameters; every other
        # parameter location has to declare its "type" directly.
        return {
            "name": name,
            "in": "path",
            "type": "string",
            "required": True,
            "description": description
        }

    @staticmethod
    def _sharedSearchParams():
        """Return new copies of the query parameters shared by all searches."""
        return [
            {
                "name": "source",
                "in": "query",
                "type": "boolean",
                "required": False,
                "default": False,
                "description": "Return source XML/CSV data"
            }, {
                # Paging values are whole numbers, hence "integer".
                "name": "page",
                "in": "query",
                "type": "integer",
                "required": False,
                "default": 0
            }, {
                "name": "per_page",
                "in": "query",
                "type": "integer",
                "required": False,
                "default": 10
            }
        ]

    @staticmethod
    def _getOperation(tag, summary, description, parameters,
                      responseDescription, responseModel):
        """Build a path item holding one GET operation with a 200 response."""
        return {
            "get": {
                "tags": [tag],
                "summary": summary,
                "description": description,
                "parameters": parameters,
                "responses": {
                    200: {
                        "description": responseDescription,
                        "schema": {
                            "$ref": "#/definitions/" + responseModel
                        }
                    }
                }
            }
        }

    def _searchPath(self, description, leadingParam):
        """Build a search path item: its own parameter plus the shared ones."""
        return self._getOperation(
            "Search",
            "Returns a set of registration and renewal objects",
            description,
            [leadingParam] + self._sharedSearchParams(),
            "A list of copyright registrations and renewals",
            "MultiResponse"
        )

    def _lookupPath(self, summary, description, responseDescription):
        """Build a lookup path item keyed by a UUID path parameter."""
        return self._getOperation(
            "Lookup",
            summary,
            description,
            [self._pathParam("uuid", "Standard UUID")],
            responseDescription,
            "SingleResponse"
        )

    def getDocs(self):
        """Return the Swagger template as a newly built dict on every call."""
        recordChoice = [
            {"$ref": "#/definitions/Registration"},
            {"$ref": "#/definitions/Renewal"}
        ]
        return {
            "swagger": "2.0",
            "info": {
                "title": "CCE Search",
                "description": "API for searching Copyright Registrations and Renewals",
                "contact": {
                    "responsibleOrganization": "NYPL",
                    "responsibleDeveloper": "Michael Benowitz",
                    "email": "michaelbenowitz@nypl.org",
                    "url": "www.nypl.org",
                },
                "version": "v0.1"
            },
            "basePath": "/",  # base path for blueprint registration
            "schemes": [
                "http",
                "https"
            ],
            "paths": {
                "/search/fulltext": self._searchPath(
                    "Accepts a search_query string with full boolean logic to fuzzy search across both registration and renewal records",
                    {
                        "name": "query",
                        "in": "query",
                        "type": "string",
                        "required": True,
                        "default": "*"
                    }
                ),
                "/search/registration/{regnum}": self._searchPath(
                    "Accepts a copyright registration number and returns all matching records",
                    self._pathParam(
                        "regnum", "Standard copyright registration number"
                    )
                ),
                "/search/renewal/{rennum}": self._searchPath(
                    "Accepts a copyright renewal number and returns all matching records",
                    self._pathParam(
                        "rennum", "Standard copyright renewal number"
                    )
                ),
                "/registration/{uuid}": self._lookupPath(
                    "Return a specific Registration record by UUID",
                    "Accepts a UUID and returns a registration record",
                    "A single Registration record"
                ),
                "/renewal/{uuid}": self._lookupPath(
                    "Return a specific Renewal record by UUID",
                    "Accepts a UUID and returns either an orphan renewal record or the parent registration with associated renewals",
                    "A single Renewal or Registration record"
                )
            },
            "definitions": {
                "SingleResponse": {
                    "type": "object",
                    "properties": {
                        "status": {"type": "integer"},
                        "data": {
                            "type": "object",
                            "anyOf": list(recordChoice)
                        }
                    }
                },
                "MultiResponse": {
                    "type": "object",
                    "properties": {
                        "total": {"type": "integer"},
                        "query": {
                            "type": "object",
                            "$ref": "#/definitions/Query"
                        },
                        "paging": {
                            "type": "object",
                            "$ref": "#/definitions/Paging"
                        },
                        "results": {
                            "type": "array",
                            "items": {"anyOf": list(recordChoice)}
                        }
                    }
                },
                "Query": {
                    "type": "object",
                    "properties": {
                        "endpoint": {"type": "string"},
                        "term": {"type": "string"}
                    }
                },
                "Paging": {
                    "type": "object",
                    "properties": {
                        "first": {"type": "string"},
                        "previous": {"type": "string"},
                        "next": {"type": "string"},
                        "last": {"type": "string"}
                    }
                },
                "Registration": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "copies": {"type": "string"},
                        "copy_date": {"type": "string"},
                        "description": {"type": "string"},
                        "authors": {
                            "type": "array",
                            "items": {"$ref": "#/definitions/Agent"}
                        },
                        "publishers": {
                            "type": "array",
                            "items": {"$ref": "#/definitions/Agent"}
                        },
                        "registrations": {
                            "type": "array",
                            "items": {
                                "$ref": "#/definitions/RegRegistration"
                            }
                        },
                        "renewals": {
                            "type": "array",
                            "items": {"$ref": "#/definitions/Renewal"}
                        },
                        "source": {
                            "type": "object",
                            "properties": {
                                "page": {"type": "integer"},
                                "page_position": {"type": "integer"},
                                "part": {"type": "string"},
                                "series": {"type": "string"},
                                "url": {"type": "string"},
                                "year": {"type": "integer"}
                            }
                        }
                    }
                },
                "Agent": {
                    "type": "string"
                },
                "RegRegistration": {
                    "type": "object",
                    "properties": {
                        "number": {"type": "string"},
                        "date": {"type": "string"}
                    }
                },
                "Renewal": {
                    "type": "object",
                    "properties": {
                        "type": {"type": "string"},
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "new_matter": {"type": "string"},
                        "renewal_num": {"type": "string"},
                        "renewal_date": {"type": "string"},
                        "notes": {"type": "string"},
                        "volume": {"type": "string"},
                        "part": {"type": "string"},
                        "number": {"type": "string"},
                        "page": {"type": "string"},
                        "claimants": {
                            "type": "array",
                            "items": {"$ref": "#/definitions/Claimant"}
                        }
                    }
                },
                "Claimant": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "type": {"type": "string"}
                    }
                }
            }
        }

47
api/prints/uuid.py Normal file
View File

@ -0,0 +1,47 @@
from flask import (
    Blueprint, request, session, url_for, redirect, current_app, jsonify
)
from sqlalchemy.orm.exc import NoResultFound

from api.db import db
from api.elastic import elastic
from api.response import SingleResponse
from model.cce import CCE
from model.registration import Registration
from model.renewal import Renewal, RENEWAL_REG
from model.volume import Volume
uuid = Blueprint('uuid', __name__, url_prefix='/')
@uuid.route('/registration/<uuid>', methods=['GET'])
def regQuery(uuid):
    """Return the registration entry identified by ``uuid``, with source XML.

    Responds with HTTP 404 and a ``{'status', 'message'}`` body when no
    entry has that UUID; otherwise with the serialised entry.
    """
    try:
        dbEntry = db.session.query(CCE)\
            .outerjoin(Registration, RENEWAL_REG, Renewal)\
            .filter(CCE.uuid == uuid).one()
    except NoResultFound:
        # .one() raises when nothing matches; an unknown identifier is a
        # client error, not a server failure.
        return jsonify({
            'status': 404,
            'message': 'No registration found for UUID {}'.format(uuid)
        }), 404

    regRecord = SingleResponse('uuid', request.base_url)
    regRecord.result = SingleResponse.parseEntry(dbEntry, xml=True)
    regRecord.createDataBlock()
    return jsonify(regRecord.createResponse(200))
@uuid.route('/renewal/<uuid>', methods=['GET'])
def renQuery(uuid):
    """Return the record(s) for the renewal identified by ``uuid``.

    The renewal is expanded by ``parseRetRenewal``, so the response data is
    a list. Responds with HTTP 404 and a ``{'status', 'message'}`` body when
    no renewal has that UUID.
    """
    try:
        dbRenewal = db.session.query(Renewal)\
            .outerjoin(RENEWAL_REG, Registration, CCE)\
            .filter(Renewal.uuid == uuid).one()
    except NoResultFound:
        # .one() raises when nothing matches; an unknown identifier is a
        # client error, not a server failure.
        return jsonify({
            'status': 404,
            'message': 'No renewal found for UUID {}'.format(uuid)
        }), 404

    renRecord = SingleResponse('uuid', request.base_url)
    renRecord.result = parseRetRenewal(dbRenewal)
    renRecord.createDataBlock()
    return jsonify(renRecord.createResponse(200))
def parseRetRenewal(dbRenewal):
    """Serialise a renewal, with source data, as a list of records.

    When the renewal has linked registrations, one registration entry is
    produced per link; otherwise the renewal itself is the only element.
    """
    if dbRenewal.registrations:
        return [
            SingleResponse.parseEntry(linkedReg.cce, xml=True)
            for linkedReg in dbRenewal.registrations
        ]

    orphan = SingleResponse.parseRenewal(dbRenewal, source=True)
    return [orphan]

173
api/response.py Normal file
View File

@ -0,0 +1,173 @@
class Response():
    """Base API response: query metadata plus record serialisers.

    Subclasses fill ``self.data`` before ``createResponse`` is called.
    """

    def __init__(self, queryType, endpoint):
        self.type = queryType
        self.endpoint = endpoint
        # Payload returned for successful responses; set by subclasses.
        self.data = None

    def createResponse(self, status, err=None):
        """Build the response body for the given status code.

        Args:
            status: status code to report; 200 selects the data body.
            err: optional error object for non-200 responses.

        Returns:
            ``{'status', 'data'}`` for 200, otherwise ``{'status', 'message'}``
            where the message is ``err.message`` if present, else ``str(err)``,
            or None when no error was supplied.
        """
        if status != 200:
            # Built-in exceptions have no ``message`` attribute on Python 3
            # and ``err`` may be omitted, so never dereference it blindly.
            message = getattr(err, 'message', None)
            if message is None and err is not None:
                message = str(err)
            return {
                'status': status,
                'message': message
            }

        return {
            'status': status,
            'data': self.data
        }

    @classmethod
    def parseEntry(cls, dbEntry, xml=False):
        """Serialise a registration entry, including its renewals, into a dict.

        Args:
            dbEntry: entry record to serialise.
            xml: when true, add the entry's first XML source under ``xml``
                (None if the entry has no XML source) and include source
                data on each nested renewal.
        """
        response = {
            'uuid': dbEntry.uuid,
            'title': dbEntry.title,
            'copies': dbEntry.copies,
            'description': dbEntry.description,
            'pub_date': dbEntry.pub_date_text,
            'copy_date': dbEntry.copy_date_text,
            'registrations': [
                {'number': r.regnum, 'date': r.reg_date_text}
                for r in dbEntry.registrations
            ],
            'authors': [a.name for a in dbEntry.authors],
            'publishers': [p.name for p in dbEntry.publishers],
            'source': {
                'url': dbEntry.volume.source,
                'series': dbEntry.volume.series,
                'year': dbEntry.volume.year,
                'part': dbEntry.volume.part,
                'page': dbEntry.page,
                'page_position': dbEntry.page_position
            }
        }
        # Flatten the renewals of every registration into one list.
        response['renewals'] = [
            cls.parseRenewal(ren, source=xml)
            for reg in dbEntry.registrations
            for ren in reg.renewals
        ]
        if xml:
            # Guard against entries without any XML source instead of
            # failing with IndexError.
            xmlSources = dbEntry.xml_sources
            response['xml'] = xmlSources[0].xml_source if xmlSources else None

        return response

    @classmethod
    def parseRenewal(cls, dbRenewal, source=False):
        """Serialise a renewal record into a dict.

        Args:
            dbRenewal: renewal record to serialise.
            source: when true, add the record's ``source`` value.
        """
        renewal = {
            'type': 'renewal',
            'uuid': dbRenewal.uuid,
            'title': dbRenewal.title,
            'author': dbRenewal.author,
            'claimants': [
                {'name': c.name, 'type': c.claimant_type}
                for c in dbRenewal.claimants
            ],
            'new_matter': dbRenewal.new_matter,
            'renewal_num': dbRenewal.renewal_num,
            'renewal_date': dbRenewal.renewal_date_text,
            'notes': dbRenewal.notes,
            'volume': dbRenewal.volume,
            'part': dbRenewal.part,
            'number': dbRenewal.number,
            'page': dbRenewal.page
        }
        if source:
            renewal['source'] = dbRenewal.source

        return renewal
class SingleResponse(Response):
    """Response whose data block is exactly one stored result."""

    def __init__(self, queryType, endpoint):
        super().__init__(queryType, endpoint)
        # Assigned by the caller before createDataBlock() is invoked.
        self.result = None

    def createDataBlock(self):
        """Publish the stored result as the response data."""
        payload = self.result
        self.data = payload
class MultiResponse(Response):
    """Response for search endpoints: a page of results plus paging links."""

    def __init__(self, queryType, total, endpoint, query, page, perPage):
        super().__init__(queryType, endpoint)
        self.total = total
        self.query = query
        self.page = page
        self.perPage = perPage
        self.results = []

    def addResult(self, result):
        """Append one serialised record to the result list."""
        self.results.append(result)

    def extendResults(self, results):
        """Append several serialised records to the result list."""
        self.results.extend(results)

    def createDataBlock(self):
        """Assemble total, query echo, paging links and results into ``data``."""
        self.data = {
            'total': self.total,
            'query': self.createQuery(),
            'paging': self.createPaging(),
            'results': self.results
        }

    def createQuery(self):
        """Echo the endpoint and search term that produced this response."""
        return {
            'endpoint': self.endpoint,
            'term': self.query
        }

    def createPaging(self):
        """Build the ``first``/``previous``/``next``/``last`` page links.

        A link is None when it does not apply: ``first`` and ``previous`` on
        page 0, ``next`` when no further results exist, and ``last`` when
        everything fits on one page or the current page is out of range.
        """
        # Imported locally so the method carries its own dependency.
        from urllib.parse import urlencode

        # Full-text searches carry their term in the query string; the other
        # searches carry it in the path, which is already part of endpoint.
        baseParams = {}
        if self.type == 'text':
            baseParams['query'] = self.query

        def pageUrl(pageNum):
            # urlencode escapes the search term and always yields a valid
            # "?a=b&c=d" query string, including for path-based endpoints.
            params = dict(baseParams, page=pageNum, per_page=self.perPage)
            return '{}?{}'.format(self.endpoint, urlencode(params))

        # Zero-based index of the page holding the final result; floor
        # division on (total - 1) also covers a partially filled last page.
        safePerPage = max(self.perPage, 1)
        lastPage = max((self.total - 1) // safePerPage, 0)

        hasPrevious = self.page > 0
        hasNext = (self.page + 1) * self.perPage < self.total
        hasLast = (
            self.page * self.perPage < self.total and
            self.total > self.perPage
        )

        return {
            'first': pageUrl(0) if hasPrevious else None,
            'previous': pageUrl(self.page - 1) if hasPrevious else None,
            'next': pageUrl(self.page + 1) if hasNext else None,
            'last': pageUrl(lastPage) if hasLast else None
        }

    @staticmethod
    def parsePaging(reqArgs):
        """Read ``page`` and ``per_page`` from the request arguments.

        Missing or non-integer values fall back to the defaults (page 0,
        10 per page); ``page`` is clamped to >= 0 and ``per_page`` to >= 1.

        Returns:
            A ``(page, perPage)`` tuple of ints.
        """
        def readInt(name, default, minimum):
            try:
                value = int(reqArgs.get(name, default))
            except (TypeError, ValueError):
                return default
            return max(value, minimum)

        page = readInt('page', 0, 0)
        perPage = readInt('per_page', 10, 1)
        return page, perPage

View File

@ -20,7 +20,8 @@ from model.registration import Registration as dbRegistration
from model.elastic import (
CCE,
Registration,
Renewal
Renewal,
Claimant
)
@ -143,9 +144,11 @@ class ESRen():
self.renewal = Renewal(meta={'id': self.dbRen.renewal_num})
def indexRen(self):
self.renewal.uuid = self.dbRen.uuid
self.renewal.rennum = self.dbRen.renewal_num
self.renewal.rendate = self.dbRen.renewal_date
self.renewal.title = self.dbRen.title
self.renewal.claimants = [
c.name for c in self.dbRen.claimants
Claimant(name=c.name, claim_type=c.claimant_type)
for c in self.dbRen.claimants
]

View File

@ -31,11 +31,18 @@ class Registration(BaseInner):
regdate = Date()
class Claimant(BaseInner):
    """Inner document holding the name and claim type of one claimant."""
    # Text field with an additional ``keyword`` sub-field.
    name = Text(fields={'keyword': Keyword()})
    # Claim type, stored as a keyword field.
    claim_type = Keyword()
class Renewal(BaseDoc):
uuid = Keyword(store=True)
rennum = Keyword()
rendate = Date()
title = Text(fields={'keyword': Keyword()})
claimants = Text(multi=True)
claimants = Nested(Claimant)
class Index:
with open('config.yaml', 'r') as yamlFile:

View File

@ -16,7 +16,6 @@ from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.orm.exc import NoResultFound
from model.core import Base, Core
from model.xml import XML
from model.author import Author
from model.publisher import Publisher

View File

@ -13,6 +13,7 @@ from sqlalchemy import (
)
from model.core import Base, Core
from model.errorCCE import ErrorCCE
@compiles(String, 'postgresql')
def compile_xml(type_, compiler, **kw):

View File

@ -1,4 +1,5 @@
elasticsearch-dsl>=6.0.0,<7.0.0
flask-sqlalchemy
lccnorm
lxml
psycopg2-binary