Merge pull request #1 from EbookFoundation/stevens_api

Stevens api: Added functionality to search across multiple fields (title, author, publisher)
master
Rachel Kim 2020-05-14 17:11:28 -04:00 committed by GitHub
commit d2cd9283e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 333 additions and 27 deletions

View File

@ -2,13 +2,13 @@ import os
import yaml
from flask import Flask, jsonify
from flasgger import Swagger
from api.prints.swagger.swag import SwaggerDoc
from api.db import db
from api.elastic import elastic
from api.prints import base, search, uuid
from .prints.swagger.swag import SwaggerDoc
from .db import db
from .elastic import elastic
from .prints import base, search, uuid
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
os.environ['ES_HOST'],
os.environ['ES_PORT']
)
# print(application.config['ELASTICSEARCH_INDEX_URI'])
application.config['SWAGGER'] = {'title': 'CCE Search'}
db.init_app(application)
elastic.init_app(application)

View File

@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
class Elastic():
def __init__(self):
self.client = None
self.client = Elasticsearch()
def init_app(self, app):
self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])
try:
self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
except ConnectionError as err:
print('Failed to connect to ElasticSearch instance')
raise err
def create_search(self, index):
return Search(using=self.client, index=index)
s = Search(using=self.client, index=index)
return s
def query_regnum(self, regnum, page=0, perPage=10):
startPos, endPos = Elastic.getFromSize(page, perPage)
@ -26,10 +31,39 @@ class Elastic():
def query_fulltext(self, queryText, page=0, perPage=10):
    """Run a `query_string` search across the registration and renewal indexes.

    Args:
        queryText: Text passed as the `query` of an Elasticsearch
            `query_string` query.
        page: Page number, forwarded to Elastic.getFromSize.
        perPage: Page size, forwarded to Elastic.getFromSize.

    Returns:
        The executed search response for the requested slice of hits.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    # 'cce,ccr' targets both indexes in a single request.
    search = self.create_search('cce,ccr')
    renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
    return renewalSearch.execute()
#New Query Types
def query_title(self, queryText, page=0, perPage=10):
    """Match `queryText` against the `title` field of the cce and ccr indexes.

    Returns the executed search response for the slice of hits selected by
    `page` and `perPage` (converted to positions by Elastic.getFromSize).
    """
    first, last = Elastic.getFromSize(page, perPage)
    bothIndexes = self.create_search('cce,ccr')
    matched = bothIndexes.query('match', title=queryText)
    window = matched[first:last]
    return window.execute()
def query_author(self, queryText, page=0, perPage=10):
    """Match `queryText` against the `authors` field of the cce and ccr indexes.

    Returns the executed search response for the slice of hits selected by
    `page` and `perPage` (converted to positions by Elastic.getFromSize).
    """
    first, last = Elastic.getFromSize(page, perPage)
    bothIndexes = self.create_search('cce,ccr')
    authorSearch = bothIndexes.query('match', authors=queryText)
    window = authorSearch[first:last]
    return window.execute()
# If query is given for publisher field, don't check renewals?
def query_multifields(self, params, page=0, perPage=10):
    """Search on any combination of publishers, title and authors.

    Args:
        params: Mapping of field name to query text. Only the keys
            "publishers", "title" and "authors" are consulted.
        page: Page number, forwarded to Elastic.getFromSize.
        perPage: Page size, forwarded to Elastic.getFromSize.

    Returns:
        The executed search response for the requested slice of hits.
    """
    first, last = Elastic.getFromSize(page, perPage)

    if "publishers" not in params:
        search = self.create_search('cce,ccr')
    else:
        # A publisher filter restricts the search to the 'cce' index only.
        search = self.create_search('cce').query(
            'match', publishers=params["publishers"]
        )

    # Each remaining field that was supplied adds one more match clause.
    for field in ("title", "authors"):
        if field in params:
            search = search.query('match', **{field: params[field]})

    return search[first:last].execute()
@staticmethod
def getFromSize(page, perPage):
@ -37,4 +71,4 @@ class Elastic():
endPos = startPos + perPage
return startPos, endPos
elastic = Elastic()
elastic = Elastic()

View File

@ -9,6 +9,131 @@ from api.response import MultiResponse
search = Blueprint('search', __name__, url_prefix='/search')
def _parseSourceFlag(args):
    """Interpret the `source` query parameter as a real boolean.

    Query-string values arrive as strings, so a raw `args.get('source', False)`
    yields 'false' for `?source=false`, which is truthy. Only explicit
    affirmative values enable source output; a missing or empty parameter
    disables it.
    """
    return args.get('source', '').strip().lower() in ('1', 'true', 'yes', 'on')


def _buildSearchResponse(matchingDocs, query, page, perPage, sourceReturn):
    """Turn a set of Elasticsearch hits into the paged JSON search response.

    Args:
        matchingDocs: Executed search response; iterated for hits and read
            for `hits.total`.
        query: The query (string or dict) handed to MultiResponse.
        page: Page number handed to MultiResponse.
        perPage: Page size handed to MultiResponse.
        sourceReturn: Passed to the parsers as `xml=` / `source=`.

    Returns:
        The jsonify-ed body produced by `createResponse(200)`.
    """
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        query,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
        else:
            try:
                dbRenewal = qManager.renewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))
            except NoResultFound:
                # Fall back to the orphan-renewal lookup when the regular
                # renewal path raises NoResultFound.
                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))


@search.route('/multi', methods=['GET'])
def multiQuery():
    """Search on any combination of title, authors and publishers.

    A field that is absent, empty or "*" is treated as "no filter" and is
    left out of the query sent to Elasticsearch.
    """
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    matchingDocs = elastic.query_multifields(queries, page=page, perPage=perPage)
    return _buildSearchResponse(matchingDocs, queries, page, perPage, sourceReturn)


@search.route('/author', methods=['GET'])
def authorQuery():
    """Search the authors field using the `query` request parameter."""
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_author(queryText, page=page, perPage=perPage)
    return _buildSearchResponse(matchingDocs, queryText, page, perPage, sourceReturn)


@search.route('/title', methods=['GET'])
def titleQuery():
    """Search the title field using the `query` request parameter."""
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_title(queryText, page=page, perPage=perPage)
    return _buildSearchResponse(matchingDocs, queryText, page, perPage, sourceReturn)
@search.route('/fulltext', methods=['GET'])
def fullTextQuery():
@ -92,8 +217,7 @@ def renQuery(rennum):
for entry in matchingDocs:
dbRenewal = qManager.renewalQuery(entry.uuid)
renResponse.extendResults(parseRetRenewal(
dbRenewal,
source=sourceReturn
dbRenewal
))
renResponse.createDataBlock()

View File

@ -23,6 +23,147 @@ class SwaggerDoc():
"https"
],
"paths": {
"/search/multi": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts optional title, authors and publishers query strings and searches registration and renewal records on the supplied fields",
"parameters": [
{
"name": "title",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "authors",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "publishers",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/author": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/title": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the title field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/fulltext": {
"get": {
"tags": ["Search"],

View File

@ -1,3 +1,4 @@
import math
class Response():
def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
else:
paging['next'] = None
lastPage = int((self.total - self.perPage) / self.perPage)
lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
if (
self.page * self.perPage < self.total and
self.total > self.perPage

View File

@ -6,6 +6,13 @@ from lxml import etree
import os
import re
import traceback
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
from model.cce import CCE
from model.errorCCE import ErrorCCE

View File

@ -7,12 +7,12 @@ DATABASE:
GITHUB:
ACCESS_TOKEN:
CCE_REPO:
CCR_REPO:
CCE_REPO:
CCR_REPO:
ELASTICSEARCH:
ES_CCE_INDEX:
ES_CCR_INDEX:
ES_CCE_INDEX:
ES_CCR_INDEX:
ES_HOST:
ES_PORT:
ES_TIMEOUT:
ES_TIMEOUT:

View File

@ -148,6 +148,7 @@ class ESRen():
self.renewal.rennum = self.dbRen.renewal_num
self.renewal.rendate = self.dbRen.renewal_date
self.renewal.title = self.dbRen.title
self.renewal.authors = self.dbRen.author
self.renewal.claimants = [
Claimant(name=c.name, claim_type=c.claimant_type)
for c in self.dbRen.claimants

View File

@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
startTime = datetime.now()
if secondsAgo is not None:
loadFromTime = startTime - timedelta(seconds=secondsAgo)
if exclude != 'cce':
loadCCE(manager, loadFromTime, year)
if exclude != 'ccr':
loadCCR(manager, loadFromTime, year)
indexUpdates(manager, loadFromTime)
manager.closeConnection()
@ -62,7 +60,7 @@ def parseArgs():
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
@ -75,6 +73,7 @@ if __name__ == '__main__':
try:
loadConfig()
except FileNotFoundError:
print("Unable to set environment variables")
pass
from sessionManager import SessionManager
@ -87,4 +86,4 @@ if __name__ == '__main__':
year=args.year,
exclude=args.exclude,
reinit=args.REINITIALIZE
)
)

View File

@ -41,9 +41,9 @@ class Renewal(BaseDoc):
rennum = Keyword()
rendate = Date()
title = Text(fields={'keyword': Keyword()})
claimants = Nested(Claimant)
authors = Text()
claimants = Nested(Claimant)
class Index:
name = os.environ['ES_CCR_INDEX']
@ -54,7 +54,6 @@ class CCE(BaseDoc):
authors = Text(multi=True)
publishers = Text(multi=True)
lccns = Keyword(multi=True)
registrations = Nested(Registration)
class Index:

View File

@ -16,7 +16,6 @@ class CCRReader():
self.git = Github(os.environ['ACCESS_TOKEN'])
self.repo = self.git.get_repo(os.environ['CCR_REPO'])
self.ccrYears = {}
self.dbManager = manager
def loadYears(self, selectedYear, loadFromTime):
@ -190,4 +189,4 @@ class CCRFile():
except KeyError:
pass
print('No matching field found!')
raise KeyError
raise KeyError