Added new search functionality

stevens_api
Rachel Kim 2020-05-05 22:32:28 -04:00
parent 46dd2d8d75
commit 1bec15f97f
11 changed files with 359 additions and 38 deletions

View File

@ -2,13 +2,13 @@ import os
import yaml import yaml
from flask import Flask, jsonify from flask import Flask, jsonify
from flasgger import Swagger from flasgger import Swagger
from api.prints.swagger.swag import SwaggerDoc from prints.swagger.swag import SwaggerDoc
from api.db import db from db import db
from api.elastic import elastic from elastic import elastic
from api.prints import base, search, uuid from prints import base, search, uuid
def loadConfig(): def loadConfig():
with open('config.yaml', 'r') as yamlFile: with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile) config = yaml.safe_load(yamlFile)
for section in config: for section in config:
sectionDict = config[section] sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
os.environ['ES_HOST'], os.environ['ES_HOST'],
os.environ['ES_PORT'] os.environ['ES_PORT']
) )
# print(application.config['ELASTICSEARCH_INDEX_URI'])
application.config['SWAGGER'] = {'title': 'CCE Search'} application.config['SWAGGER'] = {'title': 'CCE Search'}
db.init_app(application) db.init_app(application)
elastic.init_app(application) elastic.init_app(application)

View File

@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
class Elastic(): class Elastic():
def __init__(self): def __init__(self):
self.client = None self.client = Elasticsearch()
def init_app(self, app): def init_app(self, app):
self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI']) try:
self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
except ConnectionError as err:
print('Failed to connect to ElasticSearch instance')
raise err
def create_search(self, index): def create_search(self, index):
return Search(using=self.client, index=index) s = Search(using=self.client, index=index)
return s
def query_regnum(self, regnum, page=0, perPage=10): def query_regnum(self, regnum, page=0, perPage=10):
startPos, endPos = Elastic.getFromSize(page, perPage) startPos, endPos = Elastic.getFromSize(page, perPage)
@ -30,6 +35,41 @@ class Elastic():
search = self.create_search('cce,ccr') search = self.create_search('cce,ccr')
renewalSearch = search.query('query_string', query=queryText)[startPos:endPos] renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
return renewalSearch.execute() return renewalSearch.execute()
#New Query Types
def query_title(self, queryText, page=0, perPage=10):
    """Run a paginated ``match`` query against the ``title`` field.

    The search is created over the combined 'cce,ccr' index string, so both
    indexes are queried together.

    Args:
        queryText: Text to match against ``title``.
        page: Page number, converted to slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, converted to slice bounds by
            ``Elastic.getFromSize``.

    Returns:
        Whatever ``execute()`` returns for the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    # Slicing the search object applies the paging window before execution.
    titleSearch = search.query('match', title=queryText)[startPos:endPos]
    return titleSearch.execute()
def query_author(self, queryText, page=0, perPage=10):
    """Run a paginated ``match`` query against the ``authors`` field.

    The search is created over the combined 'cce,ccr' index string, so both
    indexes are queried together.

    Args:
        queryText: Text to match against ``authors``.
        page: Page number, converted to slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, converted to slice bounds by
            ``Elastic.getFromSize``.

    Returns:
        Whatever ``execute()`` returns for the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    # Slicing the search object applies the paging window before execution.
    authorSearch = search.query('match', authors=queryText)[startPos:endPos]
    return authorSearch.execute()
# When a publishers term is supplied only the 'cce' index is searched, so renewals ('ccr') are not checked (open question: is that intended?).
def query_multifields(self, params, page=0, perPage=10):
    """Run a paginated search combining several optional field queries.

    ``params`` maps field names to query text. Recognised keys are
    "publishers", "title" and "authors"; other keys are ignored. When
    "publishers" is present the search is restricted to the 'cce' index,
    otherwise the combined 'cce,ccr' index string is used. With no
    recognised keys the search is executed without any query clause.

    Args:
        params: Mapping of field name to the text to match in that field.
        page: Page number, converted to slice bounds by
            ``Elastic.getFromSize``.
        perPage: Page size, converted to slice bounds by
            ``Elastic.getFromSize``.

    Returns:
        Whatever ``execute()`` returns for the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)

    if "publishers" in params:
        # A publishers term limits the search to the 'cce' index only.
        search = self.create_search('cce')
        search = search.query('match', publishers=params["publishers"])
    else:
        search = self.create_search('cce,ccr')

    # Each supplied field chains one more ``match`` query onto the search.
    for field in ('title', 'authors'):
        if field in params:
            search = search.query('match', **{field: params[field]})

    return search[startPos:endPos].execute()
@staticmethod @staticmethod
def getFromSize(page, perPage): def getFromSize(page, perPage):

View File

@ -9,6 +9,132 @@ from api.response import MultiResponse
search = Blueprint('search', __name__, url_prefix='/search') search = Blueprint('search', __name__, url_prefix='/search')
def _parseSourceFlag(args):
    """Return True when the ``source`` query parameter requests source data.

    Query-string values arrive as text, so a literal ``source=false`` would
    be truthy if it were passed along untouched. A missing parameter, an
    empty value and the common "off" spellings are treated as False; every
    other value is treated as True, which keeps previously working requests
    (``source=true``, ``source=1``, ...) behaving as before.

    Args:
        args: The request's query-argument mapping (``request.args``).

    Returns:
        bool: Whether source data should be included in the results.
    """
    rawValue = args.get('source', '')
    return str(rawValue).strip().lower() not in (
        '', '0', 'false', 'f', 'no', 'n', 'off', 'none'
    )


def _buildSearchResponse(matchingDocs, queryInfo, page, perPage, sourceReturn):
    """Turn Elasticsearch hits into the JSON body shared by the search routes.

    Every hit is looked up in the database through ``QueryManager`` and
    added to a ``MultiResponse``: hits from the 'cce' index are resolved
    with ``registrationQuery``, all other hits with ``renewalQuery``,
    falling back to ``orphanRenewalQuery`` when ``NoResultFound`` is raised.

    Args:
        matchingDocs: Executed search result; iterated for hits and read for
            ``hits.total``.
        queryInfo: The query description handed to ``MultiResponse``
            (the query text, or the field/value dict for multi searches).
        page: Page number passed through to ``MultiResponse``.
        perPage: Page size passed through to ``MultiResponse``.
        sourceReturn: Flag forwarded as ``xml=`` / ``source=`` to the
            ``MultiResponse`` parsers.

    Returns:
        The ``jsonify``-ed response created with status 200.
    """
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        queryInfo,
        page,
        perPage
    )
    qManager = QueryManager(db.session)

    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
            continue

        # The parse/add step stays inside the try so that the fallback
        # covers exactly the same statements as it did before.
        try:
            dbRenewal = qManager.renewalQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))
        except NoResultFound:
            dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))


# NOTE(review): the view functions below are documented with comments rather
# than docstrings in case flasgger (used by the application) picks view
# docstrings up as spec text -- confirm before converting them.

@search.route('/multi', methods=['GET'])
def multiQuery():
    # GET /search/multi: search on any combination of the optional
    # ``title``, ``authors`` and ``publishers`` parameters. A missing value,
    # an empty value or "*" means "no filter on this field".
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    matchingDocs = elastic.query_multifields(
        queries, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queries, page, perPage, sourceReturn
    )


@search.route('/author', methods=['GET'])
def authorQuery():
    # GET /search/author: match the ``query`` parameter against authors.
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_author(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )


@search.route('/title', methods=['GET'])
def titleQuery():
    # GET /search/title: match the ``query`` parameter against titles.
    queryText = request.args.get('query', '')
    sourceReturn = _parseSourceFlag(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_title(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )
@search.route('/fulltext', methods=['GET']) @search.route('/fulltext', methods=['GET'])
def fullTextQuery(): def fullTextQuery():

View File

@ -23,6 +23,147 @@ class SwaggerDoc():
"https" "https"
], ],
"paths": { "paths": {
"/search/multi": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "title",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "authors",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "publishers",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/author": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/title": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the title fiel",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/fulltext": { "/search/fulltext": {
"get": { "get": {
"tags": ["Search"], "tags": ["Search"],

View File

@ -1,3 +1,4 @@
import math
class Response(): class Response():
def __init__(self, queryType, endpoint): def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
else: else:
paging['next'] = None paging['next'] = None
lastPage = int((self.total - self.perPage) / self.perPage) lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
if ( if (
self.page * self.perPage < self.total and self.page * self.perPage < self.total and
self.total > self.perPage self.total > self.perPage

View File

@ -6,6 +6,13 @@ from lxml import etree
import os import os
import re import re
import traceback import traceback
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
from model.cce import CCE from model.cce import CCE
from model.errorCCE import ErrorCCE from model.errorCCE import ErrorCCE

View File

@ -1,18 +1,18 @@
DATABASE: DATABASE:
DB_USER: DB_USER: postgres
DB_PSWD: DB_PSWD: "9903"
DB_HOST: DB_HOST: localhost
DB_PORT: DB_PORT: "5432"
DB_NAME: DB_NAME: ccesearch
GITHUB: GITHUB:
ACCESS_TOKEN: ACCESS_TOKEN: 218124e9bf09a9b3f379cb5ee1ab0a8756ee3b3c
CCE_REPO: CCE_REPO: nypl/catalog_of_copyright_entries_project
CCR_REPO: CCR_REPO: nypl/cce-renewals
ELASTICSEARCH: ELASTICSEARCH:
ES_CCE_INDEX: ES_CCE_INDEX: cce
ES_CCR_INDEX: ES_CCR_INDEX: ccr
ES_HOST: ES_HOST: localhost
ES_PORT: ES_PORT: '9200'
ES_TIMEOUT: ES_TIMEOUT: "10000"

View File

@ -58,7 +58,7 @@ class ESIndexer():
if self.client.indices.exists(index=self.ccr_index) is False: if self.client.indices.exists(index=self.ccr_index) is False:
Renewal.init() Renewal.init()
def indexRecords(self, recType='cce'): def indexRecords(self, recType='ccr'):
"""Process the current batch of updating records. This utilizes the """Process the current batch of updating records. This utilizes the
elasticsearch-py bulk helper to import records in chunks of the elasticsearch-py bulk helper to import records in chunks of the
provided size. If a record in the batch errors that is reported and provided size. If a record in the batch errors that is reported and
@ -148,6 +148,7 @@ class ESRen():
self.renewal.rennum = self.dbRen.renewal_num self.renewal.rennum = self.dbRen.renewal_num
self.renewal.rendate = self.dbRen.renewal_date self.renewal.rendate = self.dbRen.renewal_date
self.renewal.title = self.dbRen.title self.renewal.title = self.dbRen.title
self.renewal.authors = self.dbRen.author
self.renewal.claimants = [ self.renewal.claimants = [
Claimant(name=c.name, claim_type=c.claimant_type) Claimant(name=c.name, claim_type=c.claimant_type)
for c in self.dbRen.claimants for c in self.dbRen.claimants

20
main.py
View File

@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
startTime = datetime.now() startTime = datetime.now()
if secondsAgo is not None: if secondsAgo is not None:
loadFromTime = startTime - timedelta(seconds=secondsAgo) loadFromTime = startTime - timedelta(seconds=secondsAgo)
# if exclude != 'cce':
if exclude != 'cce': # loadCCE(manager, loadFromTime, year)
loadCCE(manager, loadFromTime, year) # if exclude != 'ccr':
if exclude != 'ccr': # loadCCR(manager, loadFromTime, year)
loadCCR(manager, loadFromTime, year)
indexUpdates(manager, loadFromTime) indexUpdates(manager, loadFromTime)
manager.closeConnection() manager.closeConnection()
@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear):
def indexUpdates(manager, loadFromTime): def indexUpdates(manager, loadFromTime):
esIndexer = ESIndexer(manager, None) esIndexer = ESIndexer(manager, None)
esIndexer.indexRecords(recType='cce') # esIndexer.indexRecords(recType='cce')
esIndexer.indexRecords(recType='ccr') esIndexer.indexRecords(recType='ccr')
@ -62,7 +60,7 @@ def parseArgs():
def loadConfig(): def loadConfig():
with open('config.yaml', 'r') as yamlFile: with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile) config = yaml.safe_load(yamlFile)
for section in config: for section in config:
sectionDict = config[section] sectionDict = config[section]
@ -75,13 +73,17 @@ if __name__ == '__main__':
try: try:
loadConfig() loadConfig()
except FileNotFoundError: except FileNotFoundError:
print("Unable to set environment variables")
pass pass
from sessionManager import SessionManager from sessionManager import SessionManager
from builder import CCEReader, CCEFile from builder import CCEReader, CCEFile
from renBuilder import CCRReader, CCRFile from renBuilder import CCRReader, CCRFile
from esIndexer import ESIndexer from esIndexer import ESIndexer
print(args.time)
print(args.year)
print(args.exclude)
print(args.REINITIALIZE)
main( main(
secondsAgo=args.time, secondsAgo=args.time,
year=args.year, year=args.year,

View File

@ -1,5 +1,6 @@
import os import os
import yaml import yaml
import pprint
from elasticsearch_dsl import ( from elasticsearch_dsl import (
Index, Index,
Document, Document,
@ -16,14 +17,14 @@ class BaseDoc(Document):
date_modified = Date() date_modified = Date()
def save(self, **kwargs): def save(self, **kwargs):
return super(BaseDoc, self).save(**kwargs) return super(BaseDoc, self).save(** kwargs)
class BaseInner(InnerDoc): class BaseInner(InnerDoc):
date_created = Date() date_created = Date()
date_modified = Date() date_modified = Date()
def save(self, **kwargs): def save(self, **kwargs):
return super(BaseInner, self).save(**kwargs) return super(BaseInner, self).save(** kwargs)
class Registration(BaseInner): class Registration(BaseInner):
@ -41,9 +42,10 @@ class Renewal(BaseDoc):
rennum = Keyword() rennum = Keyword()
rendate = Date() rendate = Date()
title = Text(fields={'keyword': Keyword()}) title = Text(fields={'keyword': Keyword()})
authors = Text()
claimants = Nested(Claimant)
claimants = Nested(Claimant)
# pprint.pprint(dict(os.environ), width = 1)
class Index: class Index:
name = os.environ['ES_CCR_INDEX'] name = os.environ['ES_CCR_INDEX']
@ -54,7 +56,6 @@ class CCE(BaseDoc):
authors = Text(multi=True) authors = Text(multi=True)
publishers = Text(multi=True) publishers = Text(multi=True)
lccns = Keyword(multi=True) lccns = Keyword(multi=True)
registrations = Nested(Registration) registrations = Nested(Registration)
class Index: class Index:

View File

@ -14,9 +14,10 @@ from model.registration import Registration
class CCRReader(): class CCRReader():
def __init__(self, manager): def __init__(self, manager):
self.git = Github(os.environ['ACCESS_TOKEN']) self.git = Github(os.environ['ACCESS_TOKEN'])
print(self.git)
self.repo = self.git.get_repo(os.environ['CCR_REPO']) self.repo = self.git.get_repo(os.environ['CCR_REPO'])
print(self.repo)
self.ccrYears = {} self.ccrYears = {}
self.dbManager = manager self.dbManager = manager
def loadYears(self, selectedYear, loadFromTime): def loadYears(self, selectedYear, loadFromTime):