Added new search functionality
parent
46dd2d8d75
commit
1bec15f97f
11
api/app.py
11
api/app.py
|
@ -2,13 +2,13 @@ import os
|
|||
import yaml
|
||||
from flask import Flask, jsonify
|
||||
from flasgger import Swagger
|
||||
from api.prints.swagger.swag import SwaggerDoc
|
||||
from api.db import db
|
||||
from api.elastic import elastic
|
||||
from api.prints import base, search, uuid
|
||||
from prints.swagger.swag import SwaggerDoc
|
||||
from db import db
|
||||
from elastic import elastic
|
||||
from prints import base, search, uuid
|
||||
|
||||
def loadConfig():
|
||||
with open('config.yaml', 'r') as yamlFile:
|
||||
with open('config.yaml-dist', 'r') as yamlFile:
|
||||
config = yaml.safe_load(yamlFile)
|
||||
for section in config:
|
||||
sectionDict = config[section]
|
||||
|
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
|
|||
os.environ['ES_HOST'],
|
||||
os.environ['ES_PORT']
|
||||
)
|
||||
# print(application.config['ELASTICSEARCH_INDEX_URI'])
|
||||
application.config['SWAGGER'] = {'title': 'CCE Search'}
|
||||
db.init_app(application)
|
||||
elastic.init_app(application)
|
||||
|
|
|
@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
|
|||
|
||||
class Elastic():
|
||||
def __init__(self):
|
||||
self.client = None
|
||||
self.client = Elasticsearch()
|
||||
|
||||
def init_app(self, app):
|
||||
self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])
|
||||
try:
|
||||
self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
|
||||
except ConnectionError as err:
|
||||
print('Failed to connect to ElasticSearch instance')
|
||||
raise err
|
||||
|
||||
def create_search(self, index):
|
||||
return Search(using=self.client, index=index)
|
||||
s = Search(using=self.client, index=index)
|
||||
return s
|
||||
|
||||
def query_regnum(self, regnum, page=0, perPage=10):
|
||||
startPos, endPos = Elastic.getFromSize(page, perPage)
|
||||
|
@ -30,6 +35,41 @@ class Elastic():
|
|||
search = self.create_search('cce,ccr')
|
||||
renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
|
||||
return renewalSearch.execute()
|
||||
|
||||
#New Query Types
|
||||
def query_title(self, queryText, page=0, perPage=10):
    """Run a paged ``match`` query against the ``title`` field.

    The search is issued against the ``cce,ccr`` index pattern.

    Args:
        queryText: Text to match against ``title``.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The response produced by executing the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    # Debug prints of the slice bounds and query body were dropped: this
    # runs once per API request and should not write to stdout.
    titleSearch = search.query('match', title=queryText)[startPos:endPos]
    return titleSearch.execute()
|
||||
|
||||
def query_author(self, queryText, page=0, perPage=10):
    """Run a paged ``match`` query against the ``authors`` field.

    The search is issued against the ``cce,ccr`` index pattern.

    Args:
        queryText: Text to match against ``authors``.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The response produced by executing the sliced search.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)
    search = self.create_search('cce,ccr')
    # Debug prints of the slice bounds and query body were dropped: this
    # runs once per API request and should not write to stdout.
    authorSearch = search.query('match', authors=queryText)[startPos:endPos]
    return authorSearch.execute()
|
||||
|
||||
|
||||
# If query is given for publisher field, don't check renewals?
|
||||
def query_multifields(self, params, page=0, perPage=10):
    """Run a paged search combining ``match`` queries on several fields.

    Args:
        params: Mapping of field name to query text. Only the keys
            ``publishers``, ``title`` and ``authors`` are consulted; any
            other key is ignored.
        page: Page number, forwarded to ``Elastic.getFromSize``.
        perPage: Page size, forwarded to ``Elastic.getFromSize``.

    Returns:
        The response produced by executing the sliced search. When
        ``params`` holds none of the recognised keys, the search is
        executed without any query clause.
    """
    startPos, endPos = Elastic.getFromSize(page, perPage)

    # A publisher filter restricts the search to the 'cce' index only;
    # otherwise both 'cce' and 'ccr' are searched.
    index = 'cce' if 'publishers' in params else 'cce,ccr'
    search = self.create_search(index)

    # Chain one match query per supplied field, in the same order the
    # clauses were originally added.
    for field in ('publishers', 'title', 'authors'):
        if field in params:
            search = search.query('match', **{field: params[field]})

    return search[startPos:endPos].execute()
|
||||
|
||||
|
||||
@staticmethod
|
||||
def getFromSize(page, perPage):
|
||||
|
|
|
@ -9,6 +9,132 @@ from api.response import MultiResponse
|
|||
|
||||
search = Blueprint('search', __name__, url_prefix='/search')
|
||||
|
||||
@search.route('/multi', methods=['GET'])
def multiQuery():
    """Search by any combination of title, authors and publishers.

    Reads the ``title``, ``authors`` and ``publishers`` query-string
    parameters; a parameter that is missing, empty or ``*`` is left out of
    the search. ``source`` selects whether source data is requested from
    the entry/renewal parsers, and paging comes from
    ``MultiResponse.parsePaging``.

    Returns:
        A JSON response built from ``MultiResponse.createResponse(200)``.
    """
    # Query-string values always arrive as text, so "false" or "0" would be
    # truthy if used directly; parse the flag explicitly.
    sourceReturn = request.args.get('source', 'false').strip().lower() in (
        'true', '1', 'yes', 'on'
    )
    page, perPage = MultiResponse.parsePaging(request.args)

    # Keep only the fields the caller actually constrained.
    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    matchingDocs = elastic.query_multifields(
        queries, page=page, perPage=perPage
    )
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        queries,
        page,
        perPage
    )
    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
        else:
            try:
                dbRenewal = qManager.renewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))
            except NoResultFound:
                # Fall back to the orphan-renewal lookup when the regular
                # renewal path reports no result.
                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))
|
||||
|
||||
@search.route('/author', methods=['GET'])
def authorQuery():
    """Search registrations and renewals by author.

    Reads ``query`` (defaults to an empty string) and ``source`` from the
    query string, plus paging via ``MultiResponse.parsePaging``, and runs
    ``elastic.query_author``.

    Returns:
        A JSON response built from ``MultiResponse.createResponse(200)``.
    """
    queryText = request.args.get('query', '')
    # Query-string values always arrive as text, so "false" or "0" would be
    # truthy if used directly; parse the flag explicitly.
    sourceReturn = request.args.get('source', 'false').strip().lower() in (
        'true', '1', 'yes', 'on'
    )
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_author(
        queryText, page=page, perPage=perPage
    )
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        queryText,
        page,
        perPage
    )
    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
        else:
            try:
                dbRenewal = qManager.renewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))
            except NoResultFound:
                # Fall back to the orphan-renewal lookup when the regular
                # renewal path reports no result.
                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))
|
||||
|
||||
@search.route('/title', methods=['GET'])
def titleQuery():
    """Search registrations and renewals by title.

    Reads ``query`` (defaults to an empty string) and ``source`` from the
    query string, plus paging via ``MultiResponse.parsePaging``, and runs
    ``elastic.query_title``.

    Returns:
        A JSON response built from ``MultiResponse.createResponse(200)``.
    """
    queryText = request.args.get('query', '')
    # Query-string values always arrive as text, so "false" or "0" would be
    # truthy if used directly; parse the flag explicitly.
    sourceReturn = request.args.get('source', 'false').strip().lower() in (
        'true', '1', 'yes', 'on'
    )
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_title(
        queryText, page=page, perPage=perPage
    )
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        queryText,
        page,
        perPage
    )
    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
        else:
            try:
                dbRenewal = qManager.renewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))
            except NoResultFound:
                # Fall back to the orphan-renewal lookup when the regular
                # renewal path reports no result.
                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
                textResponse.addResult(MultiResponse.parseRenewal(
                    dbRenewal,
                    source=sourceReturn
                ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))
|
||||
|
||||
@search.route('/fulltext', methods=['GET'])
|
||||
def fullTextQuery():
|
||||
|
|
|
@ -23,6 +23,147 @@ class SwaggerDoc():
|
|||
"https"
|
||||
],
|
||||
"paths": {
|
||||
"/search/multi": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts a query string to search across both registration and renewal records in the author field",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "title",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "authors",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "publishers",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": False,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/author": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts a query string to search across both registration and renewal records in the author field",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "query",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/title": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
"summary": "Returns a set of registration and renewal objects",
|
||||
"description": "Accepts a query string to search across both registration and renewal records in the title fiel",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "query",
|
||||
"in": "query",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"default": "*"
|
||||
},{
|
||||
"name": "source",
|
||||
"in": "query",
|
||||
"type": "boolean",
|
||||
"required": False,
|
||||
"default": False,
|
||||
"description": "Return source XML/CSV data"
|
||||
},{
|
||||
"name": "page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 0
|
||||
},{
|
||||
"name": "per_page",
|
||||
"in": "query",
|
||||
"type": "number",
|
||||
"required": False,
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
200: {
|
||||
"description": "A list of copyright registrations and renewals",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/MultiResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search/fulltext": {
|
||||
"get": {
|
||||
"tags": ["Search"],
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import math
|
||||
|
||||
class Response():
|
||||
def __init__(self, queryType, endpoint):
|
||||
|
@ -151,7 +152,7 @@ class MultiResponse(Response):
|
|||
else:
|
||||
paging['next'] = None
|
||||
|
||||
lastPage = int((self.total - self.perPage) / self.perPage)
|
||||
lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
|
||||
if (
|
||||
self.page * self.perPage < self.total and
|
||||
self.total > self.perPage
|
||||
|
|
|
@ -6,6 +6,13 @@ from lxml import etree
|
|||
import os
|
||||
import re
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
import io
|
||||
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
|
||||
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
|
||||
|
||||
from model.cce import CCE
|
||||
from model.errorCCE import ErrorCCE
|
||||
|
|
|
@ -1,18 +1,18 @@
|
|||
DATABASE:
|
||||
DB_USER:
|
||||
DB_PSWD:
|
||||
DB_HOST:
|
||||
DB_PORT:
|
||||
DB_NAME:
|
||||
DB_USER: postgres
|
||||
DB_PSWD: "9903"
|
||||
DB_HOST: localhost
|
||||
DB_PORT: "5432"
|
||||
DB_NAME: ccesearch
|
||||
|
||||
GITHUB:
|
||||
ACCESS_TOKEN:
|
||||
CCE_REPO:
|
||||
CCR_REPO:
|
||||
ACCESS_TOKEN: 218124e9bf09a9b3f379cb5ee1ab0a8756ee3b3c
|
||||
CCE_REPO: nypl/catalog_of_copyright_entries_project
|
||||
CCR_REPO: nypl/cce-renewals
|
||||
|
||||
ELASTICSEARCH:
|
||||
ES_CCE_INDEX:
|
||||
ES_CCR_INDEX:
|
||||
ES_HOST:
|
||||
ES_PORT:
|
||||
ES_TIMEOUT:
|
||||
ES_CCE_INDEX: cce
|
||||
ES_CCR_INDEX: ccr
|
||||
ES_HOST: localhost
|
||||
ES_PORT: '9200'
|
||||
ES_TIMEOUT: "10000"
|
|
@ -58,7 +58,7 @@ class ESIndexer():
|
|||
if self.client.indices.exists(index=self.ccr_index) is False:
|
||||
Renewal.init()
|
||||
|
||||
def indexRecords(self, recType='cce'):
|
||||
def indexRecords(self, recType='ccr'):
|
||||
"""Process the current batch of updating records. This utilizes the
|
||||
elasticsearch-py bulk helper to import records in chunks of the
|
||||
provided size. If a record in the batch errors that is reported and
|
||||
|
@ -148,6 +148,7 @@ class ESRen():
|
|||
self.renewal.rennum = self.dbRen.renewal_num
|
||||
self.renewal.rendate = self.dbRen.renewal_date
|
||||
self.renewal.title = self.dbRen.title
|
||||
self.renewal.authors = self.dbRen.author
|
||||
self.renewal.claimants = [
|
||||
Claimant(name=c.name, claim_type=c.claimant_type)
|
||||
for c in self.dbRen.claimants
|
||||
|
|
20
main.py
20
main.py
|
@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
|
|||
startTime = datetime.now()
|
||||
if secondsAgo is not None:
|
||||
loadFromTime = startTime - timedelta(seconds=secondsAgo)
|
||||
|
||||
if exclude != 'cce':
|
||||
loadCCE(manager, loadFromTime, year)
|
||||
if exclude != 'ccr':
|
||||
loadCCR(manager, loadFromTime, year)
|
||||
|
||||
# if exclude != 'cce':
|
||||
# loadCCE(manager, loadFromTime, year)
|
||||
# if exclude != 'ccr':
|
||||
# loadCCR(manager, loadFromTime, year)
|
||||
indexUpdates(manager, loadFromTime)
|
||||
|
||||
manager.closeConnection()
|
||||
|
@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear):
|
|||
|
||||
def indexUpdates(manager, loadFromTime):
|
||||
esIndexer = ESIndexer(manager, None)
|
||||
esIndexer.indexRecords(recType='cce')
|
||||
# esIndexer.indexRecords(recType='cce')
|
||||
esIndexer.indexRecords(recType='ccr')
|
||||
|
||||
|
||||
|
@ -62,7 +60,7 @@ def parseArgs():
|
|||
|
||||
|
||||
def loadConfig():
|
||||
with open('config.yaml', 'r') as yamlFile:
|
||||
with open('config.yaml-dist', 'r') as yamlFile:
|
||||
config = yaml.safe_load(yamlFile)
|
||||
for section in config:
|
||||
sectionDict = config[section]
|
||||
|
@ -75,13 +73,17 @@ if __name__ == '__main__':
|
|||
try:
|
||||
loadConfig()
|
||||
except FileNotFoundError:
|
||||
print("Unable to set environment variables")
|
||||
pass
|
||||
|
||||
from sessionManager import SessionManager
|
||||
from builder import CCEReader, CCEFile
|
||||
from renBuilder import CCRReader, CCRFile
|
||||
from esIndexer import ESIndexer
|
||||
|
||||
print(args.time)
|
||||
print(args.year)
|
||||
print(args.exclude)
|
||||
print(args.REINITIALIZE)
|
||||
main(
|
||||
secondsAgo=args.time,
|
||||
year=args.year,
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import yaml
|
||||
import pprint
|
||||
from elasticsearch_dsl import (
|
||||
Index,
|
||||
Document,
|
||||
|
@ -16,14 +17,14 @@ class BaseDoc(Document):
|
|||
date_modified = Date()
|
||||
|
||||
def save(self, **kwargs):
|
||||
return super(BaseDoc, self).save(**kwargs)
|
||||
return super(BaseDoc, self).save(** kwargs)
|
||||
|
||||
class BaseInner(InnerDoc):
|
||||
date_created = Date()
|
||||
date_modified = Date()
|
||||
|
||||
def save(self, **kwargs):
|
||||
return super(BaseInner, self).save(**kwargs)
|
||||
return super(BaseInner, self).save(** kwargs)
|
||||
|
||||
|
||||
class Registration(BaseInner):
|
||||
|
@ -41,9 +42,10 @@ class Renewal(BaseDoc):
|
|||
rennum = Keyword()
|
||||
rendate = Date()
|
||||
title = Text(fields={'keyword': Keyword()})
|
||||
|
||||
claimants = Nested(Claimant)
|
||||
authors = Text()
|
||||
|
||||
claimants = Nested(Claimant)
|
||||
# pprint.pprint(dict(os.environ), width = 1)
|
||||
class Index:
|
||||
name = os.environ['ES_CCR_INDEX']
|
||||
|
||||
|
@ -54,7 +56,6 @@ class CCE(BaseDoc):
|
|||
authors = Text(multi=True)
|
||||
publishers = Text(multi=True)
|
||||
lccns = Keyword(multi=True)
|
||||
|
||||
registrations = Nested(Registration)
|
||||
|
||||
class Index:
|
||||
|
|
|
@ -14,9 +14,10 @@ from model.registration import Registration
|
|||
class CCRReader():
|
||||
def __init__(self, manager):
    """Set up the GitHub client, the renewals repository and reader state.

    Args:
        manager: Object stored as ``self.dbManager`` for later use.

    Raises:
        KeyError: If ``ACCESS_TOKEN`` or ``CCR_REPO`` is missing from the
            environment.
    """
    # Debug prints of the client and repository objects were removed; they
    # only added noise to stdout on every construction.
    self.git = Github(os.environ['ACCESS_TOKEN'])
    self.repo = self.git.get_repo(os.environ['CCR_REPO'])
    self.ccrYears = {}

    self.dbManager = manager
|
||||
|
||||
def loadYears(self, selectedYear, loadFromTime):
|
||||
|
|
Loading…
Reference in New Issue