Added new search functionality

stevens_api
Rachel Kim 2020-05-05 22:32:28 -04:00
parent 46dd2d8d75
commit 1bec15f97f
11 changed files with 359 additions and 38 deletions

View File

@ -2,13 +2,13 @@ import os
import yaml
from flask import Flask, jsonify
from flasgger import Swagger
from api.prints.swagger.swag import SwaggerDoc
from api.db import db
from api.elastic import elastic
from api.prints import base, search, uuid
from prints.swagger.swag import SwaggerDoc
from db import db
from elastic import elastic
from prints import base, search, uuid
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
os.environ['ES_HOST'],
os.environ['ES_PORT']
)
# print(application.config['ELASTICSEARCH_INDEX_URI'])
application.config['SWAGGER'] = {'title': 'CCE Search'}
db.init_app(application)
elastic.init_app(application)

View File

@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
class Elastic():
def __init__(self):
    """Create the wrapper holding a default-constructed Elasticsearch client.

    ``init_app`` later replaces ``self.client`` with a client built from
    the application's configuration.
    """
    # The earlier ``self.client = None`` assignment was a dead store: it was
    # overwritten on the very next statement, so only this one is kept.
    self.client = Elasticsearch()
def init_app(self, app):
    """Build the Elasticsearch client from the application's configuration.

    Args:
        app: Application object whose ``config`` mapping contains the
            ``ELASTICSEARCH_INDEX_URI`` entry.

    Raises:
        ConnectionError: Re-raised unchanged after a failure message has
            been printed.
    """
    try:
        # Construct the client exactly once, inside the guarded region, so
        # that a failure is always reported before it propagates.
        self.client = Elasticsearch(
            hosts=app.config['ELASTICSEARCH_INDEX_URI']
        )
    except ConnectionError:
        # NOTE(review): which ConnectionError this names (builtin or the
        # elasticsearch package's) depends on this module's imports, and the
        # client may only connect lazily on first request -- confirm that
        # this handler can actually fire.
        print('Failed to connect to ElasticSearch instance')
        raise
def create_search(self, index):
    """Return a ``Search`` bound to this wrapper's client.

    Args:
        index: Index name to search; callers in this class also pass a
            comma-separated list such as ``'cce,ccr'``.

    Returns:
        A new ``Search`` object using ``self.client`` and ``index``.
    """
    # The statements that followed the original ``return`` were unreachable
    # duplicates of this expression and have been dropped.
    return Search(using=self.client, index=index)
def query_regnum(self, regnum, page=0, perPage=10):
startPos, endPos = Elastic.getFromSize(page, perPage)
@ -30,6 +35,41 @@ class Elastic():
search = self.create_search('cce,ccr')
renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
return renewalSearch.execute()
#New Query Types
def query_title(self, queryText, page=0, perPage=10):
    """Run a ``match`` query on the ``title`` field of the cce and ccr indexes.

    Args:
        queryText: Text to match against the ``title`` field.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The result of executing the search, sliced to the bounds returned
        by ``Elastic.getFromSize``.
    """
    # Debug print() calls that ran on every request have been removed.
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    titleSearch = search.query('match', title=queryText)[startPos:endPos]
    return titleSearch.execute()
def query_author(self, queryText, page=0, perPage=10):
    """Run a ``match`` query on the ``authors`` field of the cce and ccr indexes.

    Args:
        queryText: Text to match against the ``authors`` field.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The result of executing the search, sliced to the bounds returned
        by ``Elastic.getFromSize``.
    """
    # Debug print() calls that ran on every request have been removed, and
    # the local is named for what it holds (it was ``titleSearch``).
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    authorSearch = search.query('match', authors=queryText)[startPos:endPos]
    return authorSearch.execute()
# If query is given for publisher field, don't check renewals?
def query_multifields(self, params, page=0, perPage=10):
    """Run a combined ``match`` search over any of publishers/title/authors.

    Args:
        params: Mapping of field name to query text. Only the keys
            ``publishers``, ``title`` and ``authors`` are consulted; keys
            that are absent add no clause.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The result of executing the search, sliced to the bounds returned
        by ``Elastic.getFromSize``.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)

    # A publishers filter narrows the search to the 'cce' index only.
    # NOTE(review): presumably because 'ccr' documents carry no publishers
    # field -- confirm against the index mappings.
    index = 'cce' if 'publishers' in params else 'cce,ccr'
    search = self.create_search(index)

    # Clauses are chained in the same order as before: publishers, title,
    # authors.
    for field in ('publishers', 'title', 'authors'):
        if field in params:
            search = search.query('match', **{field: params[field]})

    return search[startPos:endPos].execute()
@staticmethod
def getFromSize(page, perPage):

View File

@ -9,6 +9,132 @@ from api.response import MultiResponse
search = Blueprint('search', __name__, url_prefix='/search')
def _buildSearchResponse(matchingDocs, query, page, perPage, sourceReturn):
    """Turn an Elasticsearch result set into the JSON search response.

    Shared by the multi, author and title endpoints, which previously each
    carried an identical copy of this loop.

    Args:
        matchingDocs: Executed search response; its ``hits.total`` is read
            and it is iterated for the individual hits.
        query: The query echoed back in the response (a string for the
            single-field endpoints, a dict for the multi-field endpoint).
        page: Page number reported in the response.
        perPage: Page size reported in the response.
        sourceReturn: Raw value of the ``source`` request argument, passed
            on to the entry and renewal parsers.

    Returns:
        The Flask JSON response for a 200 result.
    """
    searchResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        query,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        # Hits from the 'cce' index are looked up as registrations; every
        # other hit is treated as a renewal.
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            searchResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
            continue

        try:
            dbRenewal = qManager.renewalQuery(entry.uuid)
            searchResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))
        except NoResultFound:
            # Fall back to the orphan-renewal lookup when the regular
            # renewal lookup finds nothing.
            dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
            searchResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))

    searchResponse.createDataBlock()
    return jsonify(searchResponse.createResponse(200))


@search.route('/multi', methods=['GET'])
def multiQuery():
    """Search on any combination of title, authors and publishers.

    Each of the three request arguments is optional; an empty value or the
    wildcard ``*`` means "no filter on this field".
    """
    # NOTE(review): request.args.get returns the raw string when the
    # argument is present, so ``source=false`` is truthy here -- confirm
    # how the parsers interpret it.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    matchingDocs = elastic.query_multifields(
        queries, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queries, page, perPage, sourceReturn
    )


@search.route('/author', methods=['GET'])
def authorQuery():
    """Search registration and renewal records by the ``query`` text on authors."""
    queryText = request.args.get('query', '')
    # NOTE(review): a present ``source`` argument is a raw, truthy string
    # (see multiQuery) -- confirm how the parsers interpret it.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_author(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )


@search.route('/title', methods=['GET'])
def titleQuery():
    """Search registration and renewal records by the ``query`` text on title."""
    queryText = request.args.get('query', '')
    # NOTE(review): a present ``source`` argument is a raw, truthy string
    # (see multiQuery) -- confirm how the parsers interpret it.
    sourceReturn = request.args.get('source', False)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_title(
        queryText, page=page, perPage=perPage
    )
    return _buildSearchResponse(
        matchingDocs, queryText, page, perPage, sourceReturn
    )
@search.route('/fulltext', methods=['GET'])
def fullTextQuery():

View File

@ -23,6 +23,147 @@ class SwaggerDoc():
"https"
],
"paths": {
"/search/multi": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts optional title, authors and publishers query strings and searches registration and renewal records on the supplied fields; when publishers is supplied only registration records are searched",
"parameters": [
{
"name": "title",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "authors",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "publishers",
"in": "query",
"type": "string",
"required": False,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/author": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the author field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/title": {
"get": {
"tags": ["Search"],
"summary": "Returns a set of registration and renewal objects",
"description": "Accepts a query string to search across both registration and renewal records in the title field",
"parameters": [
{
"name": "query",
"in": "query",
"type": "string",
"required": True,
"default": "*"
},{
"name": "source",
"in": "query",
"type": "boolean",
"required": False,
"default": False,
"description": "Return source XML/CSV data"
},{
"name": "page",
"in": "query",
"type": "number",
"required": False,
"default": 0
},{
"name": "per_page",
"in": "query",
"type": "number",
"required": False,
"default": 10
}
],
"responses": {
200: {
"description": "A list of copyright registrations and renewals",
"schema": {
"$ref": "#/definitions/MultiResponse"
}
}
}
}
},
"/search/fulltext": {
"get": {
"tags": ["Search"],

View File

@ -1,3 +1,4 @@
import math
class Response():
def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
else:
paging['next'] = None
lastPage = int((self.total - self.perPage) / self.perPage)
lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
if (
self.page * self.perPage < self.total and
self.total > self.perPage

View File

@ -6,6 +6,13 @@ from lxml import etree
import os
import re
import traceback
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
from model.cce import CCE
from model.errorCCE import ErrorCCE

View File

@ -1,18 +1,18 @@
DATABASE:
DB_USER:
DB_PSWD:
DB_HOST:
DB_PORT:
DB_NAME:
DB_USER: postgres
DB_PSWD: "9903"
DB_HOST: localhost
DB_PORT: "5432"
DB_NAME: ccesearch
GITHUB:
ACCESS_TOKEN:
CCE_REPO:
CCR_REPO:
ACCESS_TOKEN: 218124e9bf09a9b3f379cb5ee1ab0a8756ee3b3c
CCE_REPO: nypl/catalog_of_copyright_entries_project
CCR_REPO: nypl/cce-renewals
ELASTICSEARCH:
ES_CCE_INDEX:
ES_CCR_INDEX:
ES_HOST:
ES_PORT:
ES_TIMEOUT:
ES_CCE_INDEX: cce
ES_CCR_INDEX: ccr
ES_HOST: localhost
ES_PORT: '9200'
ES_TIMEOUT: "10000"

View File

@ -58,7 +58,7 @@ class ESIndexer():
if self.client.indices.exists(index=self.ccr_index) is False:
Renewal.init()
def indexRecords(self, recType='cce'):
def indexRecords(self, recType='ccr'):
"""Process the current batch of updating records. This utilizes the
elasticsearch-py bulk helper to import records in chunks of the
provided size. If a record in the batch errors that is reported and
@ -148,6 +148,7 @@ class ESRen():
self.renewal.rennum = self.dbRen.renewal_num
self.renewal.rendate = self.dbRen.renewal_date
self.renewal.title = self.dbRen.title
self.renewal.authors = self.dbRen.author
self.renewal.claimants = [
Claimant(name=c.name, claim_type=c.claimant_type)
for c in self.dbRen.claimants

20
main.py
View File

@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
startTime = datetime.now()
if secondsAgo is not None:
loadFromTime = startTime - timedelta(seconds=secondsAgo)
if exclude != 'cce':
loadCCE(manager, loadFromTime, year)
if exclude != 'ccr':
loadCCR(manager, loadFromTime, year)
# if exclude != 'cce':
# loadCCE(manager, loadFromTime, year)
# if exclude != 'ccr':
# loadCCR(manager, loadFromTime, year)
indexUpdates(manager, loadFromTime)
manager.closeConnection()
@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear):
def indexUpdates(manager, loadFromTime):
esIndexer = ESIndexer(manager, None)
esIndexer.indexRecords(recType='cce')
# esIndexer.indexRecords(recType='cce')
esIndexer.indexRecords(recType='ccr')
@ -62,7 +60,7 @@ def parseArgs():
def loadConfig():
with open('config.yaml', 'r') as yamlFile:
with open('config.yaml-dist', 'r') as yamlFile:
config = yaml.safe_load(yamlFile)
for section in config:
sectionDict = config[section]
@ -75,13 +73,17 @@ if __name__ == '__main__':
try:
loadConfig()
except FileNotFoundError:
print("Unable to set environment variables")
pass
from sessionManager import SessionManager
from builder import CCEReader, CCEFile
from renBuilder import CCRReader, CCRFile
from esIndexer import ESIndexer
print(args.time)
print(args.year)
print(args.exclude)
print(args.REINITIALIZE)
main(
secondsAgo=args.time,
year=args.year,

View File

@ -1,5 +1,6 @@
import os
import yaml
import pprint
from elasticsearch_dsl import (
Index,
Document,
@ -16,14 +17,14 @@ class BaseDoc(Document):
date_modified = Date()
def save(self, **kwargs):
    """Save the document by delegating to the parent class's ``save``.

    Args:
        **kwargs: Forwarded unchanged to the parent implementation.

    Returns:
        Whatever the parent ``save`` returns.
    """
    # The second, unreachable ``return`` (differing only in whitespace)
    # has been removed.
    return super().save(**kwargs)
class BaseInner(InnerDoc):
    """Base class for inner documents that carry created/modified dates."""

    date_created = Date()
    date_modified = Date()

    def save(self, **kwargs):
        """Delegate to the parent class's ``save``, forwarding ``kwargs``."""
        # The second, unreachable ``return`` (differing only in whitespace)
        # has been removed.
        return super().save(**kwargs)
class Registration(BaseInner):
@ -41,9 +42,10 @@ class Renewal(BaseDoc):
rennum = Keyword()
rendate = Date()
title = Text(fields={'keyword': Keyword()})
claimants = Nested(Claimant)
authors = Text()
claimants = Nested(Claimant)
# pprint.pprint(dict(os.environ), width = 1)
class Index:
name = os.environ['ES_CCR_INDEX']
@ -54,7 +56,6 @@ class CCE(BaseDoc):
authors = Text(multi=True)
publishers = Text(multi=True)
lccns = Keyword(multi=True)
registrations = Nested(Registration)
class Index:

View File

@ -14,9 +14,10 @@ from model.registration import Registration
class CCRReader():
def __init__(self, manager):
    """Set up the GitHub client and the renewals repository handle.

    Args:
        manager: Database manager, stored as ``self.dbManager``.

    Raises:
        KeyError: If ``ACCESS_TOKEN`` or ``CCR_REPO`` is not set in the
            environment.
    """
    # Debug print() calls that dumped the client and repository objects
    # to stdout have been removed.
    self.git = Github(os.environ['ACCESS_TOKEN'])
    self.repo = self.git.get_repo(os.environ['CCR_REPO'])
    self.ccrYears = {}
    self.dbManager = manager
def loadYears(self, selectedYear, loadFromTime):