Merge pull request #3 from NYPL/api-accuracy-updates

API accuracy updates
select-best-date
Mike Benowitz 2019-08-05 15:31:52 -04:00 committed by GitHub
commit 1c0b8a3bf9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 909 additions and 401 deletions

7
.coveragerc Normal file
View File

@ -0,0 +1,7 @@
[run]
omit = tests/*, */__init__.py
[report]
exclude_lines =
if __name__ == '__main__':

View File

@ -5,405 +5,440 @@ class SwaggerDoc():
def getDocs(self):
return {
"swagger": "2.0",
"info": {
"title": "CCE Search",
"description": "API for searching Copyright Registrations and Renewals",
"contact": {
"responsibleOrganization": "NYPL",
"responsibleDeveloper": "Michael Benowitz",
"email": "michaelbenowitz@nypl.org",
"url": "www.nypl.org",
'swagger': '2.0',
'info': {
'title': 'CCE Search',
'description': 'API for searching Copyright Registrations and Renewals',
'contact': {
'responsibleOrganization': 'NYPL',
'responsibleDeveloper': 'Michael Benowitz',
'email': 'michaelbenowitz@nypl.org',
'url': 'www.nypl.org',
},
"version": "v0.1"
'version': 'v0.1'
},
"basePath": "/", # base bash for blueprint registration
"schemes": [
"http",
"https"
'basePath': '/', # base bash for blueprint registration
'schemes': [
'http',
'https'
],
"paths": {
"/search/fulltext": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a search_query string with full boolean logic to fuzzy search across both registration and renewal records",
"parameters": [
'paths': {
'/search/fulltext': {
'get': {
'tags': ['Search'],
'summary': 'Returns a set of registration and renewal objects',
'description': 'Accepts a search_query string with full boolean logic to fuzzy search across both registration and renewal records',
'parameters': [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
'name': 'query',
'in': 'query',
'type': 'string',
'required': True,
'default': '*'
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
'name': 'source',
'in': 'query',
'type': 'boolean',
'required': False,
'default': False,
'description': 'Return source XML/CSV data'
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
'name': 'page',
'in': 'query',
'type': 'number',
'required': False,
'default': 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
'name': 'per_page',
'in': 'query',
'type': 'number',
'required': False,
'default': 10
}
],
"responses": {
'responses': {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
'description': 'A list of copyright registrations and renewals',
'schema': {
'$ref': '#/definitions/MultiResponse'
}
}
}
}
},
"/search/registration/{regnum}": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a copyright registration number and returns all matching records",
"parameters": [
'/search/registration/{regnum}': {
'get': {
'tags': ['Search'],
'summary': 'Returns a set of registration and renewal objects',
'description': 'Accepts a copyright registration number and returns all matching records',
'parameters': [
{
"name": "regnum",
"in": "path",
"required": True,
"schema": {
"type": "string"
'name': 'regnum',
'in': 'path',
'required': True,
'schema': {
'type': 'string'
},
"description": "Standard copyright registration number"
'description': 'Standard copyright registration number'
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
'name': 'source',
'in': 'query',
'type': 'boolean',
'required': False,
'default': False,
'description': 'Return source XML/CSV data'
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
'name': 'page',
'in': 'query',
'type': 'number',
'required': False,
'default': 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
'name': 'per_page',
'in': 'query',
'type': 'number',
'required': False,
'default': 10
}
],
"responses": {
'responses': {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
'description': 'A list of copyright registrations and renewals',
'schema': {
'$ref': '#/definitions/MultiResponse'
}
}
}
}
},
"/search/renewal/{rennum}": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a copyright renewal number and returns all matching records",
"parameters": [
'/search/renewal/{rennum}': {
'get': {
'tags': ['Search'],
'summary': 'Returns a set of registration and renewal objects',
'description': 'Accepts a copyright renewal number and returns all matching records',
'parameters': [
{
"name": "rennum",
"in": "path",
"required": True,
"schema": {
"type": "string"
'name': 'rennum',
'in': 'path',
'required': True,
'schema': {
'type': 'string'
},
"description": "Standard copyright renewal number"
'description': 'Standard copyright renewal number'
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
'name': 'source',
'in': 'query',
'type': 'boolean',
'required': False,
'default': False,
'description': 'Return source XML/CSV data'
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
'name': 'page',
'in': 'query',
'type': 'number',
'required': False,
'default': 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
'name': 'per_page',
'in': 'query',
'type': 'number',
'required': False,
'default': 10
}
],
"responses": {
'responses': {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
'description': 'A list of copyright registrations and renewals',
'schema': {
'$ref': '#/definitions/MultiResponse'
}
}
}
}
},
"/registration/{uuid}": {
"get": {
"tags": ["Lookup"],
"summary": "Return a specific Registration record by UUID",
"description": "Accepts a UUID and returns a registration record",
"parameters": [{
"name": "uuid",
"in": "path",
"required": True,
"schema": {
"type": "string"
'/registration/{uuid}': {
'get': {
'tags': ['Lookup'],
'summary': 'Return a specific Registration record by UUID',
'description': 'Accepts a UUID and returns a registration record',
'parameters': [{
'name': 'uuid',
'in': 'path',
'required': True,
'schema': {
'type': 'string'
},
"description": "Standard UUID"
'description': 'Standard UUID'
}],
"responses": {
'responses': {
200: {
"description": "A single Registration record",
"schema": {
"$ref": "#/definitions/SingleResponse"
'description': 'A single Registration record',
'schema': {
'$ref': '#/definitions/SingleResponse'
}
},
404: {
'description': 'A message noting that the UUID could not be found',
'schema': {
'$ref': '#/definitions/ErrorResponse'
}
},
500: {
'description': 'Generic internal error message',
'schema': {
'$ref': '#/definitions/ErrorResponse'
}
}
}
}
},
"/renewal/{uuid}": {
"get": {
"tags": ["Lookup"],
"summary": "Return a specific Renewal record by UUID",
"description": "Accepts a UUID and returns either an orphan renewal record or the parent registration with associated renewals",
"parameters": [{
"name": "uuid",
"in": "path",
"required": True,
"schema": {
"type": "string"
'/renewal/{uuid}': {
'get': {
'tags': ['Lookup'],
'summary': 'Return a specific Renewal record by UUID',
'description': 'Accepts a UUID and returns either an orphan renewal record or the parent registration with associated renewals',
'parameters': [{
'name': 'uuid',
'in': 'path',
'required': True,
'schema': {
'type': 'string'
},
"description": "Standard UUID"
'description': 'Standard UUID'
}],
"responses": {
'responses': {
200: {
"description": "A single Renewal or Registration record",
"schema": {
"$ref": "#/definitions/SingleResponse"
'description': 'A single Renewal or Registration record',
'schema': {
'$ref': '#/definitions/SingleResponse'
}
},
404: {
'description': 'A message noting that the UUID could not be found',
'schema': {
'$ref': '#/definitions/ErrorResponse'
}
},
500: {
'description': 'Generic internal error message',
'schema': {
'$ref': '#/definitions/ErrorResponse'
}
}
}
}
}
},
"definitions": {
"SingleResponse": {
"type": "object",
"properties": {
"status": {
"type": "integer"
'definitions': {
'ErrorResponse': {
'type': 'object',
'properties': {
'status': {
'type': 'integer'
},
"data": {
"type": "object",
"anyOf": [
{"$ref": "#/definitions/Registration"},
{"$ref": "#/definitions/Renewal"}
'message': {
'type': 'string'
}
}
},
'SingleResponse': {
'type': 'object',
'properties': {
'status': {
'type': 'integer'
},
'data': {
'type': 'object',
'anyOf': [
{'$ref': '#/definitions/Registration'},
{'$ref': '#/definitions/Renewal'}
]
}
}
},
"MultiResponse": {
"type": "object",
"properties": {
"total": {
"type": "integer",
'MultiResponse': {
'type': 'object',
'properties': {
'total': {
'type': 'integer',
},
"query": {
"type": "object",
"$ref": "#/definitions/Query"
'query': {
'type': 'object',
'$ref': '#/definitions/Query'
},
"paging": {
"type": "object",
"$ref": "#/definitions/Paging"
'paging': {
'type': 'object',
'$ref': '#/definitions/Paging'
},
"results": {
"type": "array",
"items": {
"anyOf": [
{"$ref": "#/definitions/Registration"},
{"$ref": "#/definitions/Renewal"}
'results': {
'type': 'array',
'items': {
'anyOf': [
{'$ref': '#/definitions/Registration'},
{'$ref': '#/definitions/Renewal'}
]
}
}
}
},
"Query": {
"type": "object",
"properties": {
"endpoint": {
"type": "string"
'Query': {
'type': 'object',
'properties': {
'endpoint': {
'type': 'string'
},
"term": {
"type": "string"
'term': {
'type': 'string'
}
}
},
"Paging": {
"type": "object",
"properties": {
"first": {
"type": "string"
'Paging': {
'type': 'object',
'properties': {
'first': {
'type': 'string'
},
"previous": {
"type": "string"
'previous': {
'type': 'string'
},
"next": {
"type": "string"
'next': {
'type': 'string'
},
"last": {
"type": "string"
'last': {
'type': 'string'
}
}
},
"Registration": {
"type": "object",
"properties": {
"title": {
"type": "string"
'Registration': {
'type': 'object',
'properties': {
'title': {
'type': 'string'
},
"copies": {
"type": "string"
'copies': {
'type': 'string'
},
"copy_date": {
"type": "string"
'copy_date': {
'type': 'string'
},
"description": {
"type": "string"
'description': {
'type': 'string'
},
"authors": {
"type": "array",
"items": {
"$ref": "#/definitions/Agent"
'authors': {
'type': 'array',
'items': {
'$ref': '#/definitions/Agent'
}
},
"publishers": {
"type": "array",
"items": {
"$ref": "#/definitions/Agent"
'publishers': {
'type': 'array',
'items': {
'$ref': '#/definitions/Agent'
}
},
"registrations": {
"type": "array",
"items": {
"$ref": "#/definitions/RegRegistration"
'registrations': {
'type': 'array',
'items': {
'$ref': '#/definitions/RegRegistration'
}
},
"renewals":{
"type": "array",
"items": {
"$ref": "#/definitions/Renewal"
'renewals':{
'type': 'array',
'items': {
'$ref': '#/definitions/Renewal'
}
},
"source": {
"type": "object",
"properties": {
"page": {
"type": "integer"
'source': {
'type': 'object',
'properties': {
'page': {
'type': 'integer'
},
"page_position": {
"type": "integer"
'page_position': {
'type': 'integer'
},
"part": {
"type": "string"
'part': {
'type': 'string'
},
"series": {
"type": "string"
'series': {
'type': 'string'
},
"url": {
"type": "string"
'url': {
'type': 'string'
},
"year": {
"type": "integer"
'year': {
'type': 'integer'
}
}
}
}
},
"Agent": {
"type": "string"
'Agent': {
'type': 'string'
},
"RegRegistration": {
"type": "object",
"properties": {
"number": {
"type": "string"
'RegRegistration': {
'type': 'object',
'properties': {
'number': {
'type': 'string'
},
"date": {
"type": "string"
'date': {
'type': 'string'
}
}
},
"Renewal": {
"type": "object",
"properties": {
"type": {
"type": "string"
'Renewal': {
'type': 'object',
'properties': {
'type': {
'type': 'string'
},
"title": {
"type": "string"
'title': {
'type': 'string'
},
"author": {
"type": "string"
'author': {
'type': 'string'
},
"new_matter": {
"type": "string"
'new_matter': {
'type': 'string'
},
"renewal_num": {
"type": "string"
'renewal_num': {
'type': 'string'
},
"renewal_date": {
"type": "string"
'renewal_date': {
'type': 'string'
},
"notes": {
"type": "string"
'notes': {
'type': 'string'
},
"volume": {
"type": "string"
'volume': {
'type': 'string'
},
"part": {
"type": "string"
'part': {
'type': 'string'
},
"number": {
"type": "string"
'number': {
'type': 'string'
},
"page": {
"type": "string"
'page': {
'type': 'string'
},
"claimants": {
"type": "array",
"items": {
"$ref": "#/definitions/Claimant"
'claimants': {
'type': 'array',
'items': {
'$ref': '#/definitions/Claimant'
}
}
}
},
"Claimant": {
"type": "object",
"properties": {
"name": {
"type": "string"
'Claimant': {
'type': 'object',
'properties': {
'name': {
'type': 'string'
},
"type": {
"type": "string"
'type': {
'type': 'string'
}
}
}

View File

@ -1,40 +1,57 @@
from flask import (
Blueprint, request, session, url_for, redirect, current_app, jsonify
)
from flask import Blueprint, request, jsonify
from sqlalchemy.exc import DataError
from sqlalchemy.orm.exc import NoResultFound
from api.db import db
from api.elastic import elastic
from api.response import SingleResponse
from model.cce import CCE
from model.registration import Registration
from model.renewal import Renewal, RENEWAL_REG
from model.volume import Volume
from helpers.errors import LookupError
uuid = Blueprint('uuid', __name__, url_prefix='/')
@uuid.route('/registration/<uuid>', methods=['GET'])
def regQuery(uuid):
dbEntry = db.session.query(CCE)\
.outerjoin(Registration, RENEWAL_REG, Renewal)\
.filter(CCE.uuid == uuid).one()
err = None
regRecord = SingleResponse('uuid', request.base_url)
regRecord.result = SingleResponse.parseEntry(dbEntry, xml=True)
regRecord.createDataBlock()
return jsonify(regRecord.createResponse(200))
try:
dbEntry = db.session.query(CCE)\
.outerjoin(Registration, RENEWAL_REG, Renewal)\
.filter(CCE.uuid == uuid).one()
regRecord.result = SingleResponse.parseEntry(dbEntry, xml=True)
regRecord.createDataBlock()
status = 200
except NoResultFound:
status = 404
err = LookupError('Unable to locate UUID {} in database'.format(uuid))
except DataError:
status = 500
err = LookupError('Malformed UUID {} received'.format(uuid))
return jsonify(regRecord.createResponse(status, err=err))
@uuid.route('/renewal/<uuid>', methods=['GET'])
def renQuery(uuid):
dbRenewal = db.session.query(Renewal)\
.outerjoin(RENEWAL_REG, Registration, CCE)\
.filter(Renewal.uuid == uuid).one()
err = None
renRecord = SingleResponse('uuid', request.base_url)
renRecord.result = parseRetRenewal(dbRenewal)
renRecord.createDataBlock()
return jsonify(renRecord.createResponse(200))
try:
dbRenewal = db.session.query(Renewal)\
.outerjoin(RENEWAL_REG, Registration, CCE)\
.filter(Renewal.uuid == uuid).one()
renRecord.result = parseRetRenewal(dbRenewal)
renRecord.createDataBlock()
status = 200
except NoResultFound:
status = 404
err = LookupError('Unable to locate UUID {} in database'.format(uuid))
except DataError:
status = 500
err = LookupError('Malformed UUID {} received'.format(uuid))
return jsonify(renRecord.createResponse(status, err=err))
def parseRetRenewal(dbRenewal):

4
dev-requirements.txt Normal file
View File

@ -0,0 +1,4 @@
flake8
pytest
pytest-cov
pytest-mock

View File

@ -1,12 +1,8 @@
import os
from datetime import datetime
from elasticsearch.helpers import bulk, BulkIndexError, streaming_bulk
from elasticsearch.helpers import BulkIndexError, streaming_bulk
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import (
ConnectionError,
TransportError,
ConflictError
)
from elasticsearch.exceptions import ConnectionError
from elasticsearch_dsl import connections
from elasticsearch_dsl.wrappers import Range
@ -16,7 +12,7 @@ from sqlalchemy.dialects import postgresql
from model.cce import CCE as dbCCE
from model.renewal import Renewal as dbRenewal
from model.registration import Registration as dbRegistration
from model.registration import Registration as dbRegistration # noqa: F401
from model.elastic import (
CCE,
Registration,
@ -29,11 +25,13 @@ class ESIndexer():
def __init__(self, manager, loadFromTime):
self.cce_index = os.environ['ES_CCE_INDEX']
self.ccr_index = os.environ['ES_CCR_INDEX']
self.client = None
self.client = self.createElasticConnection()
self.session = manager.session
self.loadFromTime = loadFromTime if loadFromTime else datetime.strptime('1970-01-01', '%Y-%m-%d')
if loadFromTime:
self.loadFromTime = loadFromTime
else:
self.loadFromTime = datetime.strptime('1970-01-01', '%Y-%m-%d')
self.createElasticConnection()
self.createIndex()
configure_mappers()
@ -43,21 +41,23 @@ class ESIndexer():
port = os.environ['ES_PORT']
timeout = int(os.environ['ES_TIMEOUT'])
try:
self.client = Elasticsearch(
client = Elasticsearch(
hosts=[{'host': host, 'port': port}],
timeout=timeout
)
except ConnectionError as err:
print('Failed to connect to ElasticSearch instance')
raise err
connections.connections._conns['default'] = self.client
connections.connections._conns['default'] = client
return client
def createIndex(self):
if self.client.indices.exists(index=self.cce_index) is False:
CCE.init()
if self.client.indices.exists(index=self.ccr_index) is False:
Renewal.init()
def indexRecords(self, recType='cce'):
"""Process the current batch of updating records. This utilizes the
elasticsearch-py bulk helper to import records in chunks of the
@ -68,14 +68,16 @@ class ESIndexer():
success, failure = 0, 0
errors = []
try:
for status, work in streaming_bulk(self.client, self.process(recType)):
print(status, work)
for status, work in streaming_bulk(
self.client,
self.process(recType)
):
if not status:
errors.append(work)
failure += 1
else:
success += 1
print('Success {} | Failure: {}'.format(success, failure))
except BulkIndexError as err:
print('One or more records in the chunk failed to import')
@ -91,7 +93,8 @@ class ESIndexer():
for ccr in self.retrieveRenewals():
esRen = ESRen(ccr)
esRen.indexRen()
if esRen.renewal.rennum == '': continue
if esRen.renewal.rennum == '':
continue
yield esRen.renewal.to_dict(True)
def retrieveEntries(self):
@ -99,7 +102,7 @@ class ESIndexer():
.filter(dbCCE.date_modified > self.loadFromTime)
for cce in retQuery.all():
yield cce
def retrieveRenewals(self):
renQuery = self.session.query(dbRenewal)\
.filter(dbRenewal.date_modified > self.loadFromTime)
@ -110,21 +113,18 @@ class ESIndexer():
class ESDoc():
def __init__(self, cce):
self.dbRec = cce
self.entry = None
self.initEntry()
self.entry = self.initEntry()
def initEntry(self):
print('Creating ES record for {}'.format(self.dbRec))
self.entry = CCE(meta={'id': self.dbRec.uuid})
return CCE(meta={'id': self.dbRec.uuid, 'index': 'cce'})
def indexEntry(self):
self.entry.uuid = self.dbRec.uuid
self.entry.title = self.dbRec.title
self.entry.authors = [ a.name for a in self.dbRec.authors ]
self.entry.publishers = [ p.name for p in self.dbRec.publishers ]
self.entry.lccns = [ l.lccn for l in self.dbRec.lccns ]
self.entry.authors = [a.name for a in self.dbRec.authors]
self.entry.publishers = [p.name for p in self.dbRec.publishers]
self.entry.lccns = [l.lccn for l in self.dbRec.lccns]
self.entry.registrations = [
Registration(regnum=r.regnum, regdate=r.reg_date)
for r in self.dbRec.registrations
@ -134,14 +134,11 @@ class ESDoc():
class ESRen():
def __init__(self, ccr):
self.dbRen = ccr
self.renewal = None
self.initRenewal()
self.renewal = self.initRenewal()
def initRenewal(self):
print('Creating ES record for {}'.format(self.dbRen))
self.renewal = Renewal(meta={'id': self.dbRen.renewal_num})
return Renewal(meta={'id': self.dbRen.renewal_num, 'index': 'ccr'})
def indexRen(self):
self.renewal.uuid = self.dbRen.uuid

11
helpers/config.py Normal file
View File

@ -0,0 +1,11 @@
import os
import yaml
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
for key, value in sectionDict.items():
os.environ[key] = value

View File

@ -4,4 +4,11 @@ class DataError(Exception):
self.message = message
for key, value in kwargs.items():
setattr(self, key, value)
setattr(self, key, value)
class LookupError(Exception):
def __init__(self, message, **kwargs):
self.message = message
for key, value in kwargs.items():
setattr(self, key, value)

44
main.py
View File

@ -1,7 +1,11 @@
import argparse
from datetime import datetime, timedelta
import os
import yaml
from sessionManager import SessionManager
from builder import CCEReader
from renBuilder import CCRReader
from esIndexer import ESIndexer
from helpers.config import loadConfig
def main(secondsAgo=None, year=None, exclude=None, reinit=False):
@ -19,24 +23,25 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
loadCCE(manager, loadFromTime, year)
if exclude != 'ccr':
loadCCR(manager, loadFromTime, year)
indexUpdates(manager, loadFromTime)
manager.closeConnection()
def loadCCE(manager, loadFromTime, selectedYear):
cceReader = CCEReader(manager)
cceReader.loadYears(selectedYear)
cceReader.getYearFiles(loadFromTime)
cceReader.importYearData()
def loadCCR(manager, loadFromTime, selectedYear):
ccrReader = CCRReader(manager)
ccrReader.loadYears(selectedYear, loadFromTime)
ccrReader.importYears()
def indexUpdates(manager, loadFromTime):
esIndexer = ESIndexer(manager, loadFromTime)
esIndexer.indexRecords(recType='cce')
@ -48,28 +53,16 @@ def parseArgs():
description='Load CCE XML and CCR TSV into PostgresQL'
)
parser.add_argument('-t', '--time', type=int, required=False,
help='Time ago in seconds to check for file updates'
)
help='Time ago in seconds to check for file updates')
parser.add_argument('-y', '--year', type=str, required=False,
help='Specific year to load CCE entries and/or renewals from'
)
help='Specific year to load CCE entries and/or renewals from')
parser.add_argument('-x', '--exclude', type=str, required=False,
choices=['cce', 'ccr'],
help='Specify to exclude either entries or renewals from this run'
)
choices=['cce', 'ccr'],
help='Specify to exclude either entries or renewals from this run')
parser.add_argument('--REINITIALIZE', action='store_true')
return parser.parse_args()
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
for key, value in sectionDict.items():
os.environ[key] = value
if __name__ == '__main__':
args = parseArgs()
try:
@ -77,14 +70,9 @@ if __name__ == '__main__':
except FileNotFoundError:
pass
from sessionManager import SessionManager
from builder import CCEReader, CCEFile
from renBuilder import CCRReader, CCRFile
from esIndexer import ESIndexer
main(
secondsAgo=args.time,
year=args.year,
exclude=args.exclude,
reinit=args.REINITIALIZE
)
)

View File

@ -1,18 +1,11 @@
from sqlalchemy import (
Column,
Date,
ForeignKey,
Integer,
String,
Unicode,
Boolean
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.orm.exc import NoResultFound
from model.core import Base, Core
@ -25,4 +18,4 @@ class Author(Core, Base):
cce_id = Column(Integer, ForeignKey('cce.id'), index=True)
def __repr__(self):
return '<Author(name={}, primary={})>'.format(self.name, self.primary)
return '<Author(name={}, primary={})>'.format(self.name, self.primary)

View File

@ -1,19 +1,15 @@
from lxml import etree
import uuid
from sqlalchemy import (
Column,
Date,
ForeignKey,
Integer,
String,
Unicode,
Boolean
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.orm import relationship
from model.core import Base, Core
@ -43,24 +39,35 @@ class CCE(Core, Base):
volume_id = Column(Integer, ForeignKey('volume.id'))
registrations = relationship('Registration', backref='cce')
registrations = relationship(
'Registration',
backref='cce',
cascade='all, delete-orphan'
)
lccns = relationship('LCCN', backref='cce', cascade='all, delete-orphan')
authors = relationship('Author', backref='cce', cascade='all, delete-orphan')
publishers = relationship('Publisher', backref='cce', cascade='all, delete-orphan')
authors = relationship('Author',
backref='cce', cascade='all, delete-orphan')
publishers = relationship('Publisher',
backref='cce', cascade='all, delete-orphan')
def __repr__(self):
return '<CCE(regnums={}, uuid={}, title={})>'.format(self.registrations, self.uuid, self.title)
return '<CCE(regnums={}, uuid={}, title={})>'.format(
self.registrations,
self.uuid,
self.title
)
def addRelationships(self, volume, xml, lccn=[], authors=[], publishers=[], registrations=[]):
def addRelationships(self, volume, xml,
lccn=[], authors=[], publishers=[], registrations=[]):
self.volume = volume
self.addLCCN(lccn)
self.addAuthor(authors)
self.addPublisher(publishers)
self.addRegistration(registrations)
self.addXML(xml)
def addLCCN(self, lccns):
self.lccns = [ LCCN(lccn=lccn) for lccn in lccns ]
self.lccns = [LCCN(lccn=lccn) for lccn in lccns]
def addXML(self, xml):
xmlString = etree.tostring(xml, encoding='utf-8').decode()
@ -72,7 +79,7 @@ class CCE(Core, Base):
print('No author name! for {}'.format(self.uuid))
continue
self.authors.append(Author(name=auth[0], primary=auth[1]))
def addPublisher(self, publishers):
for pub in publishers:
if pub[0] is None:
@ -80,7 +87,7 @@ class CCE(Core, Base):
continue
claimant = True if pub[1] == 'yes' else False
self.publishers.append(Publisher(name=pub[0], claimant=claimant))
def addRegistration(self, registrations):
self.registrations = [
Registration(
@ -91,16 +98,18 @@ class CCE(Core, Base):
)
for reg in registrations
]
def updateRelationships(self, xml, lccn=[], authors=[], publishers=[], registrations=[]):
def updateRelationships(self, xml,
lccn=[], authors=[],
publishers=[], registrations=[]):
self.addXML(xml)
self.updateLCCN(lccn)
self.updateAuthors(authors)
self.updatePublishers(publishers)
self.updateRegistrations(registrations)
def updateLCCN(self, lccns):
currentLCCNs = [ l.lccn for l in self.lccns ]
currentLCCNs = [l.lccn for l in self.lccns]
if lccns != currentLCCNs:
self.lccns = [
l for l in self.lccns
@ -108,9 +117,9 @@ class CCE(Core, Base):
]
for new in list(set(lccns) - set(currentLCCNs)):
self.lccns.append(LCCN(lccn=new))
def updateAuthors(self, authors):
currentAuthors = [ (a.name, a.primary) for a in self.authors ]
currentAuthors = [(a.name, a.primary) for a in self.authors]
newAuthors = filter(lambda x: x[0] is None, authors)
if newAuthors != currentAuthors:
self.authors = [
@ -119,9 +128,9 @@ class CCE(Core, Base):
]
for new in list(set(newAuthors) - set(currentAuthors)):
self.authors.append(Author(name=new[0], primary=new[1]))
def updatePublishers(self, publishers):
currentPublishers = [ (a.name, a.claimant) for a in self.publishers ]
currentPublishers = [(a.name, a.claimant) for a in self.publishers]
newPublishers = [
(p[0], True if p[1] == 'yes' else False)
for p in filter(lambda x: x[0] is None, publishers)
@ -133,15 +142,15 @@ class CCE(Core, Base):
]
for new in list(set(newPublishers) - set(currentPublishers)):
self.publishers.append(Publisher(name=new[0], claimant=new[1]))
def updateRegistrations(self, registrations):
existingRegs = [
self.updateReg(r, registrations) for r in self.registrations
if r.regnum in [ n['regnum'] for n in registrations ]
if r.regnum in [n['regnum'] for n in registrations]
]
newRegs = [
r for r in registrations
if r['regnum'] not in [ n.regnum for n in existingRegs ]
if r['regnum'] not in [n.regnum for n in existingRegs]
]
self.registrations = existingRegs + [
Registration(
@ -152,16 +161,18 @@ class CCE(Core, Base):
)
for reg in newRegs
]
def updateReg(self, reg, registrations):
newReg = CCE.getReg(reg.regnum, registrations)
if newReg: reg.update(newReg)
if newReg:
reg.update(newReg)
return reg
def setParentCCE(self, parentID):
self.parent_cce_id = parentID
@staticmethod
def getReg(regnum, newRegs):
for new in newRegs:
if regnum == new['regnum']: return new
if regnum == new['regnum']:
return new

View File

@ -1,7 +1,5 @@
import os
import yaml
from elasticsearch_dsl import (
Index,
Document,
Keyword,
Text,
@ -45,7 +43,7 @@ class Renewal(BaseDoc):
claimants = Nested(Claimant)
class Index:
name = os.environ['ES_CCR_INDEX']
name = os.environ.get('ES_CCR_INDEX', None)
class CCE(BaseDoc):
@ -58,4 +56,4 @@ class CCE(BaseDoc):
registrations = Nested(Registration)
class Index:
name = os.environ['ES_CCE_INDEX']
name = os.environ.get('ES_CCE_INDEX', None)

View File

@ -14,9 +14,11 @@ from sqlalchemy import (
from model.core import Base, Core
@compiles(String, 'postgresql')
def compile_xml(type_, compiler, **kw):
return "XML"
return 'XML'
ENTRY_XML = Table(
'entry_xml',
@ -32,19 +34,24 @@ ERROR_XML = Table(
Column('xml_id', Integer, ForeignKey('xml.id'), index=True)
)
class XML(Core, Base):
__tablename__ = 'xml'
id = Column(Integer, primary_key=True)
xml_source = Column(String)
entry = relationship(
'CCE',
secondary=ENTRY_XML,
backref='xml_sources'
backref='xml_sources',
single_parent=True,
cascade='all, delete-orphan'
)
error_entry = relationship(
'ErrorCCE',
secondary=ERROR_XML,
backref='xml_sources'
backref='xml_sources',
single_parent=True,
cascade='all, delete-orphan'
)

0
tests/__init__.py Normal file
View File

View File

256
tests/test_es_indexer.py Normal file
View File

@ -0,0 +1,256 @@
from datetime import datetime
from unittest.mock import MagicMock, call
import pytest
from elasticsearch.helpers import BulkIndexError
from elasticsearch.exceptions import ConnectionError
from esIndexer import ESIndexer, ESDoc, ESRen
class TestIndexer(object):
@pytest.fixture
def setEnvVars(self, mocker):
mocker.patch.dict('os.environ', {
'ES_CCE_INDEX': 'test_cce',
'ES_CCR_INDEX': 'test_ccr',
'ES_HOST': 'test',
'ES_PORT': '9999',
'ES_TIMEOUT': '0'
})
@pytest.fixture
def testIndexer(self, mocker, setEnvVars):
mocker.patch('esIndexer.configure_mappers')
mocker.patch('esIndexer.ESIndexer.createElasticConnection')
mocker.patch('esIndexer.ESIndexer.createIndex')
mockManager = MagicMock()
mockManager.session = 'session'
return ESIndexer(mockManager, 10)
def test_indexerInit(self, mocker, testIndexer):
assert testIndexer.cce_index == 'test_cce'
assert testIndexer.ccr_index == 'test_ccr'
assert testIndexer.session == 'session'
def test_indexerInit_no_time(self, mocker, setEnvVars):
mockConfig = mocker.patch('esIndexer.configure_mappers')
mockConn = mocker.patch('esIndexer.ESIndexer.createElasticConnection')
mockCreate = mocker.patch('esIndexer.ESIndexer.createIndex')
mockManager = MagicMock()
mockManager.session = 'session'
testIndexer = ESIndexer(mockManager, None)
assert testIndexer.cce_index == 'test_cce'
assert testIndexer.ccr_index == 'test_ccr'
assert testIndexer.session == 'session'
assert testIndexer.loadFromTime == datetime(1970, 1, 1)
assert mockConn.called
assert mockCreate.called
assert mockConfig.called
def test_elastic_connection(self, mocker, setEnvVars):
mockConfig = mocker.patch('esIndexer.configure_mappers')
mockCreate = mocker.patch('esIndexer.ESIndexer.createIndex')
mockManager = MagicMock()
mockManager.session = 'session'
mockElastic = mocker.patch('esIndexer.Elasticsearch')
mockElastic.return_value = 'test_client'
mocker.patch('esIndexer.connections')
testIndexer = ESIndexer(mockManager, 10)
assert testIndexer.cce_index == 'test_cce'
assert testIndexer.ccr_index == 'test_ccr'
assert testIndexer.session == 'session'
assert testIndexer.client == 'test_client'
assert mockCreate.called
assert mockConfig.called
assert mockElastic.called_once_with(
hosts=[{'host': 'test', 'port': '9999'}],
timeout='0'
)
def test_elastic_conn_err(self, mocker, setEnvVars):
mocker.patch('esIndexer.configure_mappers')
mocker.patch('esIndexer.ESIndexer.createIndex')
mockManager = MagicMock()
mockManager.session = 'session'
mockElastic = mocker.patch('esIndexer.Elasticsearch')
mockElastic.side_effect = ConnectionError
mocker.patch('esIndexer.connections')
with pytest.raises(ConnectionError):
ESIndexer(mockManager, 10)
def test_index_create(self, mocker, setEnvVars):
mockConfig = mocker.patch('esIndexer.configure_mappers')
mockClient = MagicMock()
mockClient.indices.exists.side_effect = [False, False]
mockConn = mocker.patch('esIndexer.ESIndexer.createElasticConnection')
mockConn.return_value = mockClient
mockCCE = mocker.patch('esIndexer.CCE')
mockCCR = mocker.patch('esIndexer.Renewal')
mockManager = MagicMock()
mockManager.session = 'session'
testIndexer = ESIndexer(mockManager, 10)
assert testIndexer.cce_index == 'test_cce'
assert testIndexer.ccr_index == 'test_ccr'
assert testIndexer.session == 'session'
assert mockConn.called
assert mockConfig.called
assert mockCCE.init.called
assert mockCCR.init.called
def test_bulk_index_success(self, mocker, testIndexer):
mockProcess = mocker.patch('esIndexer.ESIndexer.process')
mockProcess.return_value = ['test1', 'test2', 'test3']
mockStreaming = mocker.patch('esIndexer.streaming_bulk')
mockStreaming.return_value = [
(True, 'test1'),
(False, 'test2'),
(True, 'test3')
]
testIndexer.indexRecords()
assert mockProcess.called
assert mockStreaming.called
def test_bulk_index_failure(self, mocker, testIndexer):
mockProcess = mocker.patch('esIndexer.ESIndexer.process')
mockProcess.return_value = ['test1', 'test2', 'test3']
mockStreaming = mocker.patch('esIndexer.streaming_bulk')
mockStreaming.side_effect = BulkIndexError
with pytest.raises(BulkIndexError):
testIndexer.indexRecords()
def test_process_cce(self, mocker, testIndexer):
mockRetrieve = mocker.patch('esIndexer.ESIndexer.retrieveEntries')
mockRetrieve.return_value = ['test1', 'test2', 'test3']
mockDoc = mocker.patch('esIndexer.ESDoc')
mockDoc().entry.to_dict.side_effect = ['test1', 'test2', 'test3']
processed = [p for p in testIndexer.process('cce')]
assert processed[0] == 'test1'
def test_process_ccr(self, mocker, testIndexer):
mockRens = []
for i in range(1, 4):
tmpRen = MagicMock()
tmpRen.renewal.rennum = '' if i == 1 else i
tmpRen.renewal.to_dict.return_value = 'test{}'.format(str(i))
mockRens.append(tmpRen)
mockRetrieve = mocker.patch('esIndexer.ESIndexer.retrieveRenewals')
mockRetrieve.return_value = ['test1', 'test2', 'test3']
mockDoc = mocker.patch('esIndexer.ESRen')
mockDoc.side_effect = mockRens
processed = [p for p in testIndexer.process('ccr')]
assert processed[0] == 'test2'
def test_retrieveEntries(self, mocker, testIndexer):
mockSession = MagicMock()
mockAll = MagicMock()
mockAll.all.return_value = ['cce1', 'cce2', 'cce3']
mockSession.query().filter.return_value = mockAll
testIndexer.session = mockSession
entries = [e for e in testIndexer.retrieveEntries()]
assert entries[1] == 'cce2'
def test_retrieveRenewals(self, mocker, testIndexer):
mockSession = MagicMock()
mockAll = MagicMock()
mockAll.all.return_value = ['ccr1', 'ccr2', 'ccr3']
mockSession.query().filter.return_value = mockAll
testIndexer.session = mockSession
renewals = [r for r in testIndexer.retrieveRenewals()]
assert renewals[2] == 'ccr3'
class TestESDoc(object):
def test_ESDocInit(self, mocker):
mockInit = mocker.patch('esIndexer.ESDoc.initEntry')
mockInit.return_value = 'testCCE'
testDoc = ESDoc('testRec')
assert testDoc.dbRec == 'testRec'
assert testDoc.entry == 'testCCE'
def test_ESDocCreateEntry(self):
mockRec = MagicMock()
mockRec.uuid = 'testUUID'
testDoc = ESDoc(mockRec)
assert testDoc.entry.meta.id == 'testUUID'
def test_esDoc_index(self, mocker):
mockEntry = MagicMock()
mockInit = mocker.patch('esIndexer.ESDoc.initEntry')
mockInit.return_value = mockEntry
mockDB = MagicMock()
mockDB.uuid = 'testUUID'
mockDB.title = 'Test Title'
mockReg = MagicMock()
mockReg.regnum = 'T0000'
mockReg.reg_date = '1999-12-31'
mockDB.registrations = [mockReg]
testDoc = ESDoc(mockDB)
testDoc.indexEntry()
assert testDoc.entry.uuid == 'testUUID'
assert testDoc.entry.registrations[0].regnum == 'T0000'
class TestESDen(object):
def test_ESRenInit(self, mocker):
mockInit = mocker.patch('esIndexer.ESRen.initRenewal')
mockInit.return_value = 'testCCR'
testDoc = ESRen('testRen')
assert testDoc.dbRen == 'testRen'
assert testDoc.renewal == 'testCCR'
def test_ESRenCreateRenewal(self):
mockRec = MagicMock()
mockRec.renewal_num = 'testRennum'
testRen = ESRen(mockRec)
assert testRen.renewal.meta.id == 'testRennum'
def test_esRen_index(self, mocker):
mockRenewal = MagicMock()
mockInit = mocker.patch('esIndexer.ESRen.initRenewal')
mockInit.return_value = mockRenewal
mockDB = MagicMock()
mockDB.uuid = 'testUUID'
mockDB.renewal_num = 'R0000'
mockCla = MagicMock()
mockCla.name = 'Test Claimant'
mockCla.claimant_type = 'T'
mockDB.claimants = [mockCla]
testRen = ESRen(mockDB)
testRen.indexRen()
assert testRen.renewal.uuid == 'testUUID'
assert testRen.renewal.claimants[0].claim_type == 'T'

View File

@ -0,0 +1,20 @@
import os
from unittest.mock import mock_open, patch
from helpers.config import loadConfig
class TestConfigHelpers(object):
def test_config_loader(self):
testYAMLText = """
TESTING:
FIRST: STRING1
SECOND: '10'
EXTRA:
VALUE: SOMETHING"""
mockOpen = mock_open(read_data=testYAMLText)
with patch('helpers.config.open', mockOpen):
loadConfig()
assert os.environ['FIRST'] == 'STRING1'
assert os.environ['SECOND'] == '10'
assert os.environ['VALUE'] == 'SOMETHING'

View File

@ -0,0 +1,8 @@
from helpers.errors import DataError
class TestErrorHelpers(object):
def test_create_DataError(self):
newDataErr = DataError('testing', source='pytest')
assert newDataErr.message == 'testing'
assert newDataErr.source == 'pytest'

81
tests/test_main_ingest.py Normal file
View File

@ -0,0 +1,81 @@
import sys
from unittest.mock import MagicMock, call
from main import main, loadCCE, loadCCR, indexUpdates, parseArgs
class TestHandler(object):
def test_main_plain(self, mocker):
mockSession = mocker.patch('main.SessionManager')
mockLoadCCE = mocker.patch('main.loadCCE')
mockLoadCCR = mocker.patch('main.loadCCR')
mockIndex = mocker.patch('main.indexUpdates')
main()
assert mockSession.called
assert mockLoadCCE.called
assert mockLoadCCR.called
assert mockIndex.called
def test_main_args(self, mocker):
mockSession = mocker.patch('main.SessionManager')
mockLoadCCE = mocker.patch('main.loadCCE')
mockLoadCCR = mocker.patch('main.loadCCR')
mockIndex = mocker.patch('main.indexUpdates')
main(
secondsAgo=10,
year=1900,
exclude='ccr',
reinit=True
)
assert mockSession.called
assert mockLoadCCE.called
assert mockLoadCCR.not_called
assert mockIndex.called
def test_cce_load(self, mocker):
mockReader = mocker.patch('main.CCEReader')
mockCCE = MagicMock()
mockReader.return_value = mockCCE
loadCCE('manager', 10, None)
assert mockReader.called_once_with('manager')
assert mockCCE.loadYears.called_once_with(None)
assert mockCCE.getYearFiles.called_once_with(10)
assert mockCCE.importYearData.called
def test_ccr_load(self, mocker):
mockReader = mocker.patch('main.CCRReader')
mockCCR = MagicMock()
mockReader.return_value = mockCCR
loadCCR('manager', 10, None)
assert mockReader.called_once_with('manager')
assert mockCCR.loadYears.called_once_with(None, 10)
assert mockCCR.importYears.called
def test_indexer(self, mocker):
mockIndexer = mocker.patch('main.ESIndexer')
mockInd = MagicMock()
mockIndexer.return_value = mockInd
indexUpdates('manager', 10)
assert mockIndexer.called_once_with('manager', 10)
assert mockInd.indexRecords.mock_calls == \
[call(recType='cce'), call(recType='ccr')]
def test_parseArgs_success(self, mocker):
args = ['main', '--time', '10', '--year', '1900', '--exclude', 'ccr']
mocker.patch.object(sys, 'argv', args)
args = parseArgs()
assert int(args.time) == 10
assert int(args.year) == 1900
assert args.exclude == 'ccr'
assert args.REINITIALIZE is False

View File

@ -0,0 +1,10 @@
from model.author import Author
class TestModelAuthor(object):
def test_authorCreate(self):
testAuthor = Author()
testAuthor.name = 'Tester'
testAuthor.primary = True
assert str(testAuthor) == '<Author(name=Tester, primary=True)>'

58
tests/test_model_cce.py Normal file
View File

@ -0,0 +1,58 @@
from unittest.mock import MagicMock, DEFAULT
import pytest
from model.cce import CCE
class TestModelCCE(object):
@pytest.fixture
def mockCCE(self):
return CCE()
def test_cceCreate(self, mockCCE):
mockCCE.uuid = 'testUUID'
mockCCE.title = 'Testing'
mockReg = MagicMock()
mockCCE.registrations = [mockReg]
assert str(mockCCE) == '<CCE(regnums=[{}], uuid={}, title={})>'.format(
str(mockReg), 'testUUID', 'Testing'
)
def test_addRelationships(self, mocker, mockCCE):
addMocks = mocker.patch.multiple('model.cce.CCE', addLCCN=DEFAULT,
addAuthor=DEFAULT,
addPublisher=DEFAULT,
addRegistration=DEFAULT,
addXML=DEFAULT)
mockCCE = CCE()
mockVol = MagicMock()
mockVol.name = 'testVol'
mockCCE.addRelationships(
mockVol,
'<xml>',
lccn=[1, 2, 3],
authors=['author1'],
publishers=['pub1'],
registrations=['reg1']
)
assert mockCCE.volume.name == 'testVol'
assert addMocks['addAuthor'].called_once_with(['author1'])
assert addMocks['addPublisher'].called_once_with(['pub1'])
assert addMocks['addRegistration'].called_once_with(['reg1'])
assert addMocks['addLCCN'].called_once_with([1, 2, 3])
def test_addLCCN(self, mocker, mockCCE):
mockLCCN = mocker.patch('model.cce.LCCN')
mockLCs = []
for i in range(1, 3):
lcMock = MagicMock()
lcMock.name = 'lccn{}'.format(i)
mockLCs.append(lcMock)
mockLCCN.side_effect = mockLCs
mockCCE.addLCCN([1, 2])
assert mockCCE.lccns[1].name == 'lccn2'