Merge pull request #1 from EbookFoundation/stevens_api

Stevens api: Added functionality to search across multiple fields (title, author, publisher)
master
Rachel Kim 2020-05-14 17:11:28 -04:00 committed by GitHub
commit d2cd9283e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 333 additions and 27 deletions

View File

@ -2,13 +2,13 @@ import os
import yaml import yaml
from flask import Flask, jsonify from flask import Flask, jsonify
from flasgger import Swagger from flasgger import Swagger
from api.prints.swagger.swag import SwaggerDoc from .prints.swagger.swag import SwaggerDoc
from api.db import db from .db import db
from api.elastic import elastic from .elastic import elastic
from api.prints import base, search, uuid from .prints import base, search, uuid
def loadConfig(): def loadConfig():
with open('config.yaml', 'r') as yamlFile: with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile) config = yaml.safe_load(yamlFile)
for section in config: for section in config:
sectionDict = config[section] sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
os.environ['ES_HOST'], os.environ['ES_HOST'],
os.environ['ES_PORT'] os.environ['ES_PORT']
) )
# print(application.config['ELASTICSEARCH_INDEX_URI'])
application.config['SWAGGER'] = {'title': 'CCE Search'} application.config['SWAGGER'] = {'title': 'CCE Search'}
db.init_app(application) db.init_app(application)
elastic.init_app(application) elastic.init_app(application)

View File

@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
class Elastic(): class Elastic():
def __init__(self): def __init__(self):
self.client = None self.client = Elasticsearch()
def init_app(self, app): def init_app(self, app):
self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI']) try:
self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
except ConnectionError as err:
print('Failed to connect to ElasticSearch instance')
raise err
def create_search(self, index): def create_search(self, index):
return Search(using=self.client, index=index) s = Search(using=self.client, index=index)
return s
def query_regnum(self, regnum, page=0, perPage=10): def query_regnum(self, regnum, page=0, perPage=10):
startPos, endPos = Elastic.getFromSize(page, perPage) startPos, endPos = Elastic.getFromSize(page, perPage)
@ -26,11 +31,40 @@ class Elastic():
def query_fulltext(self, queryText, page=0, perPage=10): def query_fulltext(self, queryText, page=0, perPage=10):
startPos, endPos = Elastic.getFromSize(page, perPage) startPos, endPos = Elastic.getFromSize(page, perPage)
print(startPos, endPos)
search = self.create_search('cce,ccr') search = self.create_search('cce,ccr')
renewalSearch = search.query('query_string', query=queryText)[startPos:endPos] renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
return renewalSearch.execute() return renewalSearch.execute()
#New Query Types
def query_title(self, queryText, page=0, perPage=10):
    """Run a paged `match` query for `queryText` on the `title` field.

    The search targets the 'cce,ccr' index string. The result window is the
    (start, end) pair returned by `Elastic.getFromSize(page, perPage)`.

    Returns whatever `execute()` yields for the sliced search.
    """
    first, last = Elastic.getFromSize(page, perPage)
    return (
        self.create_search('cce,ccr')
        .query('match', title=queryText)[first:last]
        .execute()
    )
def query_author(self, queryText, page=0, perPage=10):
    """Run a paged `match` query for `queryText` on the `authors` field.

    The search targets the 'cce,ccr' index string. The result window is the
    (start, end) pair returned by `Elastic.getFromSize(page, perPage)`.

    Returns whatever `execute()` yields for the sliced search.
    """
    first, last = Elastic.getFromSize(page, perPage)
    return (
        self.create_search('cce,ccr')
        .query('match', authors=queryText)[first:last]
        .execute()
    )
# If query is given for publisher field, don't check renewals?
def query_multifields(self, params, page=0, perPage=10):
    """Combine `match` queries for every field present in `params`.

    `params` is a mapping that may contain the keys "publishers", "title"
    and "authors"; each key that is present adds one `match` clause on the
    field of the same name.

    When "publishers" is present only the 'cce' index is searched;
    otherwise the search runs against 'cce,ccr'. The result window is the
    (start, end) pair returned by `Elastic.getFromSize(page, perPage)`.

    Returns whatever `execute()` yields for the sliced search.
    """
    first, last = Elastic.getFromSize(page, perPage)

    # Renewals are skipped when a publisher is supplied -- presumably
    # because renewal documents carry no publishers field (TODO confirm).
    if "publishers" not in params:
        combined = self.create_search('cce,ccr')
    else:
        combined = self.create_search('cce').query(
            'match', publishers=params["publishers"]
        )

    # NOTE(review): title/authors clauses are applied on both branches
    # above; confirm against the file's real indentation.
    for field in ("title", "authors"):
        if field in params:
            combined = combined.query('match', **{field: params[field]})

    return combined[first:last].execute()
@staticmethod @staticmethod
def getFromSize(page, perPage): def getFromSize(page, perPage):
startPos = page * perPage startPos = page * perPage

View File

@ -9,6 +9,131 @@ from api.response import MultiResponse
search = Blueprint('search', __name__, url_prefix='/search') search = Blueprint('search', __name__, url_prefix='/search')
def _parseSourceFlag(rawValue):
    """Interpret the `source` query-string value as a real boolean.

    Query-string values arrive as strings, so the literal text "false" (or
    "0", "no", "off", or an empty value) must not be treated as truthy.
    Any other non-empty value enables source output, which keeps earlier
    behaviour for callers that passed e.g. `source=true` or `source=1`.
    """
    if isinstance(rawValue, bool):
        return rawValue
    return str(rawValue).strip().lower() not in ('', '0', 'false', 'no', 'off')


def _buildSearchResponse(matchingDocs, query, page, perPage, sourceReturn):
    """Turn an executed search into the JSON response shared by all routes.

    matchingDocs -- executed search result; iterated for hits and read for
                    `hits.total`
    query        -- the query description passed through to MultiResponse
                    (a string for single-field routes, a dict for /multi)
    page/perPage -- paging values as returned by MultiResponse.parsePaging
    sourceReturn -- boolean; forwarded as `xml=` / `source=` to the parsers
                    (assumes the parsers only test it for truthiness --
                    TODO confirm)
    """
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        query,
        page,
        perPage
    )
    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            parsed = MultiResponse.parseEntry(dbEntry, xml=sourceReturn)
        else:
            # The fallback deliberately covers both the lookup and the
            # parsing step, exactly as the per-route code did.
            try:
                dbRenewal = qManager.renewalQuery(entry.uuid)
                parsed = MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                )
            except NoResultFound:
                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
                parsed = MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                )
        textResponse.addResult(parsed)

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))


# The views below are documented with comments rather than docstrings so
# that no text is added to the view functions themselves (flasgger inspects
# view docstrings when collecting specs).

@search.route('/multi', methods=['GET'])
def multiQuery():
    # GET /search/multi -- search on any combination of title, authors and
    # publishers. A field that is missing, empty or "*" is left out of the
    # query entirely.
    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    sourceReturn = _parseSourceFlag(request.args.get('source', False))
    page, perPage = MultiResponse.parsePaging(request.args)
    matchingDocs = elastic.query_multifields(
        queries, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queries, page, perPage, sourceReturn
    )


@search.route('/author', methods=['GET'])
def authorQuery():
    # GET /search/author -- match the `query` argument against authors.
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args.get('source', False))
    page, perPage = MultiResponse.parsePaging(request.args)
    matchingDocs = elastic.query_author(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )


@search.route('/title', methods=['GET'])
def titleQuery():
    # GET /search/title -- match the `query` argument against titles.
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args.get('source', False))
    page, perPage = MultiResponse.parsePaging(request.args)
    matchingDocs = elastic.query_title(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )
@search.route('/fulltext', methods=['GET']) @search.route('/fulltext', methods=['GET'])
def fullTextQuery(): def fullTextQuery():
@ -92,8 +217,7 @@ def renQuery(rennum):
for entry in matchingDocs: for entry in matchingDocs:
dbRenewal = qManager.renewalQuery(entry.uuid) dbRenewal = qManager.renewalQuery(entry.uuid)
renResponse.extendResults(parseRetRenewal( renResponse.extendResults(parseRetRenewal(
dbRenewal, dbRenewal
source=sourceReturn
)) ))
renResponse.createDataBlock() renResponse.createDataBlock()

View File

@ -23,6 +23,147 @@ class SwaggerDoc():
"https" "https"
], ],
"paths": { "paths": {
"/search/multi": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "title",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "authors",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "publishers",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/author": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/title": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the title fiel",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/fulltext": { "/search/fulltext": {
"get": { "get": {
"tags": ["Search"], "tags": ["Search"],

View File

@ -1,3 +1,4 @@
import math
class Response(): class Response():
def __init__(self, queryType, endpoint): def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
else: else:
paging['next'] = None paging['next'] = None
lastPage = int((self.total - self.perPage) / self.perPage) lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
if ( if (
self.page * self.perPage < self.total and self.page * self.perPage < self.total and
self.total > self.perPage self.total > self.perPage

View File

@ -6,6 +6,13 @@ from lxml import etree
import os import os
import re import re
import traceback import traceback
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
from model.cce import CCE from model.cce import CCE
from model.errorCCE import ErrorCCE from model.errorCCE import ErrorCCE

View File

@ -148,6 +148,7 @@ class ESRen():
self.renewal.rennum = self.dbRen.renewal_num self.renewal.rennum = self.dbRen.renewal_num
self.renewal.rendate = self.dbRen.renewal_date self.renewal.rendate = self.dbRen.renewal_date
self.renewal.title = self.dbRen.title self.renewal.title = self.dbRen.title
self.renewal.authors = self.dbRen.author
self.renewal.claimants = [ self.renewal.claimants = [
Claimant(name=c.name, claim_type=c.claimant_type) Claimant(name=c.name, claim_type=c.claimant_type)
for c in self.dbRen.claimants for c in self.dbRen.claimants

View File

@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
startTime = datetime.now() startTime = datetime.now()
if secondsAgo is not None: if secondsAgo is not None:
loadFromTime = startTime - timedelta(seconds=secondsAgo) loadFromTime = startTime - timedelta(seconds=secondsAgo)
if exclude != 'cce': if exclude != 'cce':
loadCCE(manager, loadFromTime, year) loadCCE(manager, loadFromTime, year)
if exclude != 'ccr': if exclude != 'ccr':
loadCCR(manager, loadFromTime, year) loadCCR(manager, loadFromTime, year)
indexUpdates(manager, loadFromTime) indexUpdates(manager, loadFromTime)
manager.closeConnection() manager.closeConnection()
@ -62,7 +60,7 @@ def parseArgs():
def loadConfig(): def loadConfig():
with open('config.yaml', 'r') as yamlFile: with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile) config = yaml.safe_load(yamlFile)
for section in config: for section in config:
sectionDict = config[section] sectionDict = config[section]
@ -75,6 +73,7 @@ if __name__ == '__main__':
try: try:
loadConfig() loadConfig()
except FileNotFoundError: except FileNotFoundError:
print("Unable to set environment variables")
pass pass
from sessionManager import SessionManager from sessionManager import SessionManager

View File

@ -41,9 +41,9 @@ class Renewal(BaseDoc):
rennum = Keyword() rennum = Keyword()
rendate = Date() rendate = Date()
title = Text(fields={'keyword': Keyword()}) title = Text(fields={'keyword': Keyword()})
authors = Text()
claimants = Nested(Claimant) claimants = Nested(Claimant)
class Index: class Index:
name = os.environ['ES_CCR_INDEX'] name = os.environ['ES_CCR_INDEX']
@ -54,7 +54,6 @@ class CCE(BaseDoc):
authors = Text(multi=True) authors = Text(multi=True)
publishers = Text(multi=True) publishers = Text(multi=True)
lccns = Keyword(multi=True) lccns = Keyword(multi=True)
registrations = Nested(Registration) registrations = Nested(Registration)
class Index: class Index:

View File

@ -16,7 +16,6 @@ class CCRReader():
self.git = Github(os.environ['ACCESS_TOKEN']) self.git = Github(os.environ['ACCESS_TOKEN'])
self.repo = self.git.get_repo(os.environ['CCR_REPO']) self.repo = self.git.get_repo(os.environ['CCR_REPO'])
self.ccrYears = {} self.ccrYears = {}
self.dbManager = manager self.dbManager = manager
def loadYears(self, selectedYear, loadFromTime): def loadYears(self, selectedYear, loadFromTime):