Merge pull request #1 from EbookFoundation/stevens_api
Stevens api: Added functionality to search across multiple fields (title, author, publisher)master
commit
d2cd9283e8
11
api/app.py
11
api/app.py
|
@ -2,13 +2,13 @@ import os
|
|||
import yaml
|
||||
from flask import Flask, jsonify
|
||||
from flasgger import Swagger
|
||||
from api.prints.swagger.swag import SwaggerDoc
|
||||
from api.db import db
|
||||
from api.elastic import elastic
|
||||
from api.prints import base, search, uuid
|
||||
from .prints.swagger.swag import SwaggerDoc
|
||||
from .db import db
|
||||
from .elastic import elastic
|
||||
from .prints import base, search, uuid
|
||||
|
||||
def loadConfig():
|
||||
with open('config.yaml', 'r') as yamlFile:
|
||||
with open('config.yaml-dist', 'r') as yamlFile:
|
||||
config = yaml.safe_load(yamlFile)
|
||||
for section in config:
|
||||
sectionDict = config[section]
|
||||
|
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
|
|||
os.environ['ES_HOST'],
|
||||
os.environ['ES_PORT']
|
||||
)
|
||||
# print(application.config['ELASTICSEARCH_INDEX_URI'])
|
||||
application.config['SWAGGER'] = {'title': 'CCE Search'}
|
||||
db.init_app(application)
|
||||
elastic.init_app(application)
|
||||
|
|
|
@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
|
|||
|
||||
class Elastic():
|
||||
def __init__(self):
    """Create the wrapper holding a default-configured Elasticsearch client.

    ``init_app`` later replaces this client with one built from the Flask
    application's configuration.
    """
    # A single assignment: the earlier ``None`` placeholder was overwritten
    # immediately and never observable.
    self.client = Elasticsearch()
|
||||
|
||||
def init_app(self, app):
    """Bind the client to the Elasticsearch host configured on ``app``.

    Args:
        app: Application object whose ``config`` mapping provides
            ``ELASTICSEARCH_INDEX_URI``.

    Raises:
        ConnectionError: Re-raised after printing a message when building
            the client fails with that error.
    """
    # Build the client exactly once, inside the error handling. Previously
    # it was also built ahead of the ``try`` and then thrown away.
    # NOTE(review): whether ``ConnectionError`` here is the builtin or the
    # elasticsearch exception is not visible from this block -- confirm.
    try:
        self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
    except ConnectionError:
        print('Failed to connect to ElasticSearch instance')
        raise
|
||||
|
||||
def create_search(self, index):
    """Return a new ``Search`` bound to this wrapper's client.

    Args:
        index: Index name (or comma-separated names) to search.

    Returns:
        A ``Search`` object using ``self.client`` and ``index``.
    """
    # The statements that used to follow the first ``return`` were
    # unreachable duplicates and have been dropped.
    return Search(using=self.client, index=index)
|
||||
|
||||
def query_regnum(self, regnum, page=0, perPage=10):
|
||||
startPos, endPos = Elastic.getFromSize(page, perPage)
|
||||
|
@ -26,10 +31,39 @@ class Elastic():
|
|||
|
||||
def query_fulltext(self, queryText, page=0, perPage=10):
    """Run a ``query_string`` search over the ``cce`` and ``ccr`` indices.

    Args:
        queryText: Query expression passed unchanged as the ``query``
            argument of the ``query_string`` query.
        page: Page number, turned into slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, turned into slice bounds the same way.

    Returns:
        The result of executing the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    # No debug print of the offsets here: it wrote to stdout on every
    # request and none of the sibling query methods do so.
    search = self.create_search('cce,ccr')
    renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
    return renewalSearch.execute()
|
||||
|
||||
#New Query Types
|
||||
def query_title(self, queryText, page=0, perPage=10):
    """Match ``queryText`` against the ``title`` field of both indices.

    Args:
        queryText: Text for the ``match`` query on ``title``.
        page: Page number, turned into slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, turned into slice bounds the same way.

    Returns:
        The result of executing the sliced search on ``cce,ccr``.
    """
    first, last = Elastic.getFromSize(page, perPage)
    matched = self.create_search('cce,ccr').query('match', title=queryText)
    return matched[first:last].execute()
|
||||
|
||||
def query_author(self, queryText, page=0, perPage=10):
    """Match ``queryText`` against the ``authors`` field of both indices.

    Args:
        queryText: Text for the ``match`` query on ``authors``.
        page: Page number, turned into slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, turned into slice bounds the same way.

    Returns:
        The result of executing the sliced search on ``cce,ccr``.
    """
    first, last = Elastic.getFromSize(page, perPage)
    matched = self.create_search('cce,ccr').query('match', authors=queryText)
    return matched[first:last].execute()
|
||||
|
||||
|
||||
# If query is given for publisher field, don't check renewals?
|
||||
def query_multifields(self, params, page=0, perPage=10):
    """Search with one ``match`` clause per field present in ``params``.

    Args:
        params: Mapping that may contain the keys ``"publishers"``,
            ``"title"`` and ``"authors"``; other keys are ignored.
        page: Page number, turned into slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, turned into slice bounds the same way.

    Returns:
        The result of executing the sliced search.
    """
    first, last = Elastic.getFromSize(page, perPage)

    # A publishers filter restricts the search to the ``cce`` index only;
    # without it both ``cce`` and ``ccr`` are searched.
    byPublisher = "publishers" in params
    query = self.create_search('cce' if byPublisher else 'cce,ccr')
    if byPublisher:
        query = query.query('match', publishers=params["publishers"])

    # Remaining clauses are added in a fixed order: title, then authors.
    for field in ("title", "authors"):
        if field in params:
            query = query.query('match', **{field: params[field]})

    return query[first:last].execute()
|
||||
|
||||
|
||||
@staticmethod
|
||||
def getFromSize(page, perPage):
|
||||
|
@ -37,4 +71,4 @@ class Elastic():
|
|||
endPos = startPos + perPage
|
||||
return startPos, endPos
|
||||
|
||||
elastic = Elastic()
|
||||
elastic = Elastic()
|
||||
|
|
|
@ -9,6 +9,131 @@ from api.response import MultiResponse
|
|||
|
||||
search = Blueprint('search', __name__, url_prefix='/search')
|
||||
|
||||
@search.route('/multi', methods=['GET'])
def multiQuery():
    """Handle ``GET /search/multi``: search by title, authors and publishers.

    Reads the ``title``, ``authors``, ``publishers`` and ``source`` query
    arguments plus the paging values from ``MultiResponse.parsePaging``.
    A field whose value is empty or ``*`` is left out of the search.

    Returns:
        The JSON response built from ``MultiResponse.createResponse(200)``.
    """
    # Collect only the fields the caller actually constrained.
    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('*', ''):
            queries[field] = value

    # NOTE(review): when supplied, ``source`` arrives as a string -- confirm
    # the parse helpers interpret it as intended.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    hits = elastic.query_multifields(queries, page=page, perPage=perPage)
    reply = MultiResponse(
        'text', hits.hits.total, request.base_url, queries, page, perPage
    )

    lookup = QueryManager(db.session)
    for hit in hits:
        if hit.meta.index != 'cce':
            # Non-cce hits are renewals; on NoResultFound fall back to the
            # orphan-renewal lookup.
            try:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.renewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            except NoResultFound:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.orphanRenewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            continue

        reply.addResult(MultiResponse.parseEntry(
            lookup.registrationQuery(hit.uuid),
            xml=sourceReturn
        ))

    reply.createDataBlock()
    return jsonify(reply.createResponse(200))
|
||||
|
||||
@search.route('/author', methods=['GET'])
def authorQuery():
    """Handle ``GET /search/author``: search records by author text.

    Reads the ``query`` and ``source`` query arguments plus the paging
    values from ``MultiResponse.parsePaging``.

    Returns:
        The JSON response built from ``MultiResponse.createResponse(200)``.
    """
    queryText = request.args.get('query', '')
    # NOTE(review): when supplied, ``source`` arrives as a string -- confirm
    # the parse helpers interpret it as intended.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    hits = elastic.query_author(queryText, page=page, perPage=perPage)
    reply = MultiResponse(
        'text', hits.hits.total, request.base_url, queryText, page, perPage
    )

    lookup = QueryManager(db.session)
    for hit in hits:
        if hit.meta.index != 'cce':
            # Non-cce hits are renewals; on NoResultFound fall back to the
            # orphan-renewal lookup.
            try:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.renewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            except NoResultFound:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.orphanRenewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            continue

        reply.addResult(MultiResponse.parseEntry(
            lookup.registrationQuery(hit.uuid),
            xml=sourceReturn
        ))

    reply.createDataBlock()
    return jsonify(reply.createResponse(200))
|
||||
|
||||
@search.route('/title', methods=['GET'])
def titleQuery():
    """Handle ``GET /search/title``: search records by title text.

    Reads the ``query`` and ``source`` query arguments plus the paging
    values from ``MultiResponse.parsePaging``.

    Returns:
        The JSON response built from ``MultiResponse.createResponse(200)``.
    """
    queryText = request.args.get('query', '')
    # NOTE(review): when supplied, ``source`` arrives as a string -- confirm
    # the parse helpers interpret it as intended.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    hits = elastic.query_title(queryText, page=page, perPage=perPage)
    reply = MultiResponse(
        'text', hits.hits.total, request.base_url, queryText, page, perPage
    )

    lookup = QueryManager(db.session)
    for hit in hits:
        if hit.meta.index != 'cce':
            # Non-cce hits are renewals; on NoResultFound fall back to the
            # orphan-renewal lookup.
            try:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.renewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            except NoResultFound:
                reply.addResult(MultiResponse.parseRenewal(
                    lookup.orphanRenewalQuery(hit.uuid),
                    source=sourceReturn
                ))
            continue

        reply.addResult(MultiResponse.parseEntry(
            lookup.registrationQuery(hit.uuid),
            xml=sourceReturn
        ))

    reply.createDataBlock()
    return jsonify(reply.createResponse(200))
|
||||
|
||||
@search.route('/fulltext', methods=['GET'])
|
||||
def fullTextQuery():
|
||||
|
@ -92,8 +217,7 @@ def renQuery(rennum):
|
|||
for entry in matchingDocs:
|
||||
dbRenewal = qManager.renewalQuery(entry.uuid)
|
||||
renResponse.extendResults(parseRetRenewal(
|
||||
dbRenewal,
|
||||
source=sourceReturn
|
||||
dbRenewal
|
||||
))
|
||||
|
||||
renResponse.createDataBlock()
|
||||
|
|
|
@ -23,6 +23,147 @@ class SwaggerDoc():
|
|||
"https"
|
||||
],
|
||||
"paths": {
|
||||
"/search/multi": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts optional title, authors and publishers query strings and searches registration and renewal records on each field supplied (only registrations are searched when publishers is given)",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "title",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "authors",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "publishers",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/author": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts a query string to search across both registration and renewal records in the author field",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "query",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/title": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts a query string to search across both registration and renewal records in the title field",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "query",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/fulltext": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import math
|
||||
|
||||
class Response():
|
||||
def __init__(self, queryType, endpoint):
|
||||
|
@ -151,7 +152,7 @@ class MultiResponse(Response):
|
|||
else:
|
||||
paging['next'] = None
|
||||
|
||||
lastPage = int((self.total - self.perPage) / self.perPage)
|
||||
lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
|
||||
if (
|
||||
self.page * self.perPage < self.total and
|
||||
self.total > self.perPage
|
||||
|
|
|
@ -6,6 +6,13 @@ from lxml import etree
|
|||
import os
|
||||
import re
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
import io
|
||||
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
|
||||
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
|
||||
|
||||
from model.cce import CCE
|
||||
from model.errorCCE import ErrorCCE
|
||||
|
|
|
@ -7,12 +7,12 @@ DATABASE:
|
|||
|
||||
GITHUB:
|
||||
ACCESS_TOKEN:
|
||||
CCE_REPO:
|
||||
CCR_REPO:
|
||||
CCE_REPO:
|
||||
CCR_REPO:
|
||||
|
||||
ELASTICSEARCH:
|
||||
ES_CCE_INDEX:
|
||||
ES_CCR_INDEX:
|
||||
ES_CCE_INDEX:
|
||||
ES_CCR_INDEX:
|
||||
ES_HOST:
|
||||
ES_PORT:
|
||||
ES_TIMEOUT:
|
||||
ES_TIMEOUT:
|
||||
|
|
|
@ -148,6 +148,7 @@ class ESRen():
|
|||
self.renewal.rennum = self.dbRen.renewal_num
|
||||
self.renewal.rendate = self.dbRen.renewal_date
|
||||
self.renewal.title = self.dbRen.title
|
||||
self.renewal.authors = self.dbRen.author
|
||||
self.renewal.claimants = [
|
||||
Claimant(name=c.name, claim_type=c.claimant_type)
|
||||
for c in self.dbRen.claimants
|
||||
|
|
7
main.py
7
main.py
|
@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
|
|||
startTime = datetime.now()
|
||||
if secondsAgo is not None:
|
||||
loadFromTime = startTime - timedelta(seconds=secondsAgo)
|
||||
|
||||
if exclude != 'cce':
|
||||
loadCCE(manager, loadFromTime, year)
|
||||
if exclude != 'ccr':
|
||||
loadCCR(manager, loadFromTime, year)
|
||||
|
||||
indexUpdates(manager, loadFromTime)
|
||||
|
||||
manager.closeConnection()
|
||||
|
@ -62,7 +60,7 @@ def parseArgs():
|
|||
|
||||
|
||||
def loadConfig():
|
||||
with open('config.yaml', 'r') as yamlFile:
|
||||
with open('config.yaml-dist', 'r') as yamlFile:
|
||||
config = yaml.safe_load(yamlFile)
|
||||
for section in config:
|
||||
sectionDict = config[section]
|
||||
|
@ -75,6 +73,7 @@ if __name__ == '__main__':
|
|||
try:
|
||||
loadConfig()
|
||||
except FileNotFoundError:
|
||||
print("Unable to set environment variables")
|
||||
pass
|
||||
|
||||
from sessionManager import SessionManager
|
||||
|
@ -87,4 +86,4 @@ if __name__ == '__main__':
|
|||
year=args.year,
|
||||
exclude=args.exclude,
|
||||
reinit=args.REINITIALIZE
|
||||
)
|
||||
)
|
||||
|
|
|
@ -41,9 +41,9 @@ class Renewal(BaseDoc):
|
|||
rennum = Keyword()
|
||||
rendate = Date()
|
||||
title = Text(fields={'keyword': Keyword()})
|
||||
|
||||
claimants = Nested(Claimant)
|
||||
authors = Text()
|
||||
|
||||
claimants = Nested(Claimant)
|
||||
class Index:
|
||||
name = os.environ['ES_CCR_INDEX']
|
||||
|
||||
|
@ -54,7 +54,6 @@ class CCE(BaseDoc):
|
|||
authors = Text(multi=True)
|
||||
publishers = Text(multi=True)
|
||||
lccns = Keyword(multi=True)
|
||||
|
||||
registrations = Nested(Registration)
|
||||
|
||||
class Index:
|
||||
|
|
|
@ -16,7 +16,6 @@ class CCRReader():
|
|||
self.git = Github(os.environ['ACCESS_TOKEN'])
|
||||
self.repo = self.git.get_repo(os.environ['CCR_REPO'])
|
||||
self.ccrYears = {}
|
||||
|
||||
self.dbManager = manager
|
||||
|
||||
def loadYears(self, selectedYear, loadFromTime):
|
||||
|
@ -190,4 +189,4 @@ class CCRFile():
|
|||
except KeyError:
|
||||
pass
|
||||
print('No matching field found!')
|
||||
raise KeyError
|
||||
raise KeyError
|
||||
|
|
Loading…
Reference in New Issue