From 1bec15f97fdbd529ff8b3f109311c9267c138dcd Mon Sep 17 00:00:00 2001 From: Rachel Kim Date: Tue, 5 May 2020 22:32:28 -0400 Subject: [PATCH] Added new search functionality --- api/app.py | 11 +-- api/elastic.py | 46 +++++++++++- api/prints/search.py | 126 +++++++++++++++++++++++++++++++++ api/prints/swagger/swag.py | 141 +++++++++++++++++++++++++++++++++++++ api/response.py | 3 +- builder.py | 7 ++ config.yaml-dist | 26 +++---- esIndexer.py | 3 +- main.py | 20 +++--- model/elastic.py | 11 +-- renBuilder.py | 3 +- 11 files changed, 359 insertions(+), 38 deletions(-) diff --git a/api/app.py b/api/app.py index 2e6f002..83b6d18 100644 --- a/api/app.py +++ b/api/app.py @@ -2,13 +2,13 @@ import os import yaml from flask import Flask, jsonify from flasgger import Swagger -from api.prints.swagger.swag import SwaggerDoc -from api.db import db -from api.elastic import elastic -from api.prints import base, search, uuid +from prints.swagger.swag import SwaggerDoc +from db import db +from elastic import elastic +from prints import base, search, uuid def loadConfig(): - with open('config.yaml', 'r') as yamlFile: + with open('config.yaml-dist', 'r') as yamlFile: config = yaml.safe_load(yamlFile) for section in config: sectionDict = config[section] @@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format( os.environ['ES_HOST'], os.environ['ES_PORT'] ) +# print(application.config['ELASTICSEARCH_INDEX_URI']) application.config['SWAGGER'] = {'title': 'CCE Search'} db.init_app(application) elastic.init_app(application) diff --git a/api/elastic.py b/api/elastic.py index 2a21ee8..d062786 100644 --- a/api/elastic.py +++ b/api/elastic.py @@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q class Elastic(): def __init__(self): - self.client = None + self.client = Elasticsearch() def init_app(self, app): - self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI']) + try: + self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI']) + except ConnectionError as err: + print('Failed to connect to ElasticSearch instance') + raise err def create_search(self, index): - return Search(using=self.client, index=index) + s = Search(using=self.client, index=index) + return s def query_regnum(self, regnum, page=0, perPage=10): startPos, endPos = Elastic.getFromSize(page, perPage) @@ -30,6 +35,41 @@ class Elastic(): search = self.create_search('cce,ccr') renewalSearch = search.query('query_string', query=queryText)[startPos:endPos] return renewalSearch.execute() + + #New Query Types + def query_title(self, queryText,page=0, perPage=10): + startPos, endPos = Elastic.getFromSize(page, perPage) + print(startPos, endPos) + search = self.create_search('cce,ccr') + titleSearch = search.query('match', title=queryText)[startPos:endPos] + print(titleSearch.to_dict()) + return titleSearch.execute() + + def query_author(self, queryText,page=0, perPage=10): + startPos, endPos = Elastic.getFromSize(page, perPage) + print(startPos, endPos) + search = self.create_search('cce,ccr') + titleSearch = search.query('match', authors=queryText)[startPos:endPos] + print(titleSearch.to_dict()) + return titleSearch.execute() + + + # If query is given for publisher field, don't check renewals? + def query_multifields(self, params, page=0, perPage=10): + startPos, endPos = Elastic.getFromSize(page, perPage) + print(startPos, endPos) + if "publishers" in params: + search = self.create_search('cce') + search = search.query('match', publishers=params["publishers"]) + else: + search = self.create_search('cce,ccr') + if "title" in params: + search = search.query('match', title=params['title']) + if "authors" in params: + search = search.query('match', authors=params['authors']) + titleSearch = search[startPos:endPos] + return titleSearch.execute() + @staticmethod def getFromSize(page, perPage): diff --git a/api/prints/search.py b/api/prints/search.py index e347279..2574f5d 100644 --- a/api/prints/search.py +++ b/api/prints/search.py @@ -9,6 +9,132 @@ from api.response import MultiResponse search = Blueprint('search', __name__, url_prefix='/search') +@search.route('/multi', methods=['GET']) +def multiQuery(): + title = request.args.get('title', '') + authors = request.args.get('authors', '') + publishers = request.args.get('publishers','') + sourceReturn = request.args.get('source', False) + page, perPage = MultiResponse.parsePaging(request.args) + queries = {} + if title!="*" and title!="": + queries["title"]=title + if authors!="*" and authors!="": + queries["authors"]=authors + if publishers!="*" and publishers!="": + queries["publishers"]=publishers + print(queries) + matchingDocs = elastic.query_multifields(queries, page=page, perPage=perPage) + textResponse = MultiResponse( + 'text', + matchingDocs.hits.total, + request.base_url, + queries, + page, + perPage + ) + qManager = QueryManager(db.session) + for entry in matchingDocs: + if entry.meta.index == 'cce': + dbEntry = qManager.registrationQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseEntry( + dbEntry, + xml=sourceReturn + )) + else: + try: + dbRenewal = qManager.renewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + except NoResultFound: + dbRenewal = qManager.orphanRenewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + + textResponse.createDataBlock() + return jsonify(textResponse.createResponse(200)) + +@search.route('/author', methods=['GET']) +def authorQuery(): + queryText = request.args.get('query', '') + sourceReturn = request.args.get('source', False) + page, perPage = MultiResponse.parsePaging(request.args) + matchingDocs = elastic.query_author(queryText, page=page, perPage=perPage) + textResponse = MultiResponse( + 'text', + matchingDocs.hits.total, + request.base_url, + queryText, + page, + perPage + ) + qManager = QueryManager(db.session) + for entry in matchingDocs: + if entry.meta.index == 'cce': + dbEntry = qManager.registrationQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseEntry( + dbEntry, + xml=sourceReturn + )) + else: + try: + dbRenewal = qManager.renewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + except NoResultFound: + dbRenewal = qManager.orphanRenewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + + textResponse.createDataBlock() + return jsonify(textResponse.createResponse(200)) + +@search.route('/title', methods=['GET']) +def titleQuery(): + queryText = request.args.get('query', '') + sourceReturn = request.args.get('source', False) + page, perPage = MultiResponse.parsePaging(request.args) + matchingDocs = elastic.query_title(queryText, page=page, perPage=perPage) + textResponse = MultiResponse( + 'text', + matchingDocs.hits.total, + request.base_url, + queryText, + page, + perPage + ) + qManager = QueryManager(db.session) + for entry in matchingDocs: + if entry.meta.index == 'cce': + dbEntry = qManager.registrationQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseEntry( + dbEntry, + xml=sourceReturn + )) + else: + try: + dbRenewal = qManager.renewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + except NoResultFound: + dbRenewal = qManager.orphanRenewalQuery(entry.uuid) + textResponse.addResult(MultiResponse.parseRenewal( + dbRenewal, + source=sourceReturn + )) + + textResponse.createDataBlock() + return jsonify(textResponse.createResponse(200)) @search.route('/fulltext', methods=['GET']) def fullTextQuery(): diff --git a/api/prints/swagger/swag.py b/api/prints/swagger/swag.py index d044e2c..3b2c919 100644 --- a/api/prints/swagger/swag.py +++ b/api/prints/swagger/swag.py @@ -23,6 +23,147 @@ class SwaggerDoc(): "https" ], "paths": { + "/search/multi": { + "get": { + "tags": ["Search"], + "summary": "Returns a set of registration and renewal objects", + "description": "Accepts a query string to search across both registration and renewal records in the author field", + "parameters": [ + { + "name": "title", + "in": "query", + "type": "string", + "required": False, + "default": "*" + },{ + "name": "authors", + "in": "query", + "type": "string", + "required": False, + "default": "*" + },{ + "name": "publishers", + "in": "query", + "type": "string", + "required": False, + "default": "*" + },{ + "name": "source", + "in": "query", + "type": "boolean", + "required": False, + "default": False, + "description": "Return source XML/CSV data" + },{ + "name": "page", + "in": "query", + "type": "number", + "required": False, + "default": 0 + },{ + "name": "per_page", + "in": "query", + "type": "number", + "required": False, + "default": 10 + } + ], + "responses": { + 200: { + "description": "A list of copyright registrations and renewals", + "schema": { + "$ref": "#/definitions/MultiResponse" + } + } + } + } + }, + "/search/author": { + "get": { + "tags": ["Search"], + "summary": "Returns a set of registration and renewal objects", + "description": "Accepts a query string to search across both registration and renewal records in the author field", + "parameters": [ + { + "name": "query", + "in": "query", + "type": "string", + "required": True, + "default": "*" + },{ + "name": "source", + "in": "query", + "type": "boolean", + "required": False, + "default": False, + "description": "Return source XML/CSV data" + },{ + "name": "page", + "in": "query", + "type": "number", + "required": False, + "default": 0 + },{ + "name": "per_page", + "in": "query", + "type": "number", + "required": False, + "default": 10 + } + ], + "responses": { + 200: { + "description": "A list of copyright registrations and renewals", + "schema": { + "$ref": "#/definitions/MultiResponse" + } + } + } + } + }, + "/search/title": { + "get": { + "tags": ["Search"], + "summary": "Returns a set of registration and renewal objects", + "description": "Accepts a query string to search across both registration and renewal records in the title fiel", + "parameters": [ + { + "name": "query", + "in": "query", + "type": "string", + "required": True, + "default": "*" + },{ + "name": "source", + "in": "query", + "type": "boolean", + "required": False, + "default": False, + "description": "Return source XML/CSV data" + },{ + "name": "page", + "in": "query", + "type": "number", + "required": False, + "default": 0 + },{ + "name": "per_page", + "in": "query", + "type": "number", + "required": False, + "default": 10 + } + ], + "responses": { + 200: { + "description": "A list of copyright registrations and renewals", + "schema": { + "$ref": "#/definitions/MultiResponse" + } + } + } + } + }, "/search/fulltext": { "get": { "tags": ["Search"], diff --git a/api/response.py b/api/response.py index dfe2b34..b52e8ca 100644 --- a/api/response.py +++ b/api/response.py @@ -1,3 +1,4 @@ +import math class Response(): def __init__(self, queryType, endpoint): @@ -151,7 +152,7 @@ class MultiResponse(Response): else: paging['next'] = None - lastPage = int((self.total - self.perPage) / self.perPage) + lastPage = math.ceil(((self.total - self.perPage) / self.perPage)) if ( self.page * self.perPage < self.total and self.total > self.perPage diff --git a/builder.py b/builder.py index 6b4cc23..e70fa21 100644 --- a/builder.py +++ b/builder.py @@ -6,6 +6,13 @@ from lxml import etree import os import re import traceback +import sys + +import io + +sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8') + +sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8') from model.cce import CCE from model.errorCCE import ErrorCCE diff --git a/config.yaml-dist b/config.yaml-dist index dc73a65..e3b7331 100644 --- a/config.yaml-dist +++ b/config.yaml-dist @@ -1,18 +1,18 @@ DATABASE: - DB_USER: - DB_PSWD: - DB_HOST: - DB_PORT: - DB_NAME: + DB_USER: postgres + DB_PSWD: "9903" + DB_HOST: localhost + DB_PORT: "5432" + DB_NAME: ccesearch GITHUB: - ACCESS_TOKEN: - CCE_REPO: - CCR_REPO: + ACCESS_TOKEN: 218124e9bf09a9b3f379cb5ee1ab0a8756ee3b3c + CCE_REPO: nypl/catalog_of_copyright_entries_project + CCR_REPO: nypl/cce-renewals ELASTICSEARCH: - ES_CCE_INDEX: - ES_CCR_INDEX: - ES_HOST: - ES_PORT: - ES_TIMEOUT: \ No newline at end of file + ES_CCE_INDEX: cce + ES_CCR_INDEX: ccr + ES_HOST: localhost + ES_PORT: '9200' + ES_TIMEOUT: "10000" \ No newline at end of file diff --git a/esIndexer.py b/esIndexer.py index 5631ca4..4dfd53b 100644 --- a/esIndexer.py +++ b/esIndexer.py @@ -58,7 +58,7 @@ class ESIndexer(): if self.client.indices.exists(index=self.ccr_index) is False: Renewal.init() - def indexRecords(self, recType='cce'): + def indexRecords(self, recType='ccr'): """Process the current batch of updating records. This utilizes the elasticsearch-py bulk helper to import records in chunks of the provided size. If a record in the batch errors that is reported and @@ -148,6 +148,7 @@ class ESRen(): self.renewal.rennum = self.dbRen.renewal_num self.renewal.rendate = self.dbRen.renewal_date self.renewal.title = self.dbRen.title + self.renewal.authors = self.dbRen.author self.renewal.claimants = [ Claimant(name=c.name, claim_type=c.claimant_type) for c in self.dbRen.claimants diff --git a/main.py b/main.py index 733c4b8..8451fdf 100644 --- a/main.py +++ b/main.py @@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False): startTime = datetime.now() if secondsAgo is not None: loadFromTime = startTime - timedelta(seconds=secondsAgo) - - if exclude != 'cce': - loadCCE(manager, loadFromTime, year) - if exclude != 'ccr': - loadCCR(manager, loadFromTime, year) - + # if exclude != 'cce': + # loadCCE(manager, loadFromTime, year) + # if exclude != 'ccr': + # loadCCR(manager, loadFromTime, year) indexUpdates(manager, loadFromTime) manager.closeConnection() @@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear): def indexUpdates(manager, loadFromTime): esIndexer = ESIndexer(manager, None) - esIndexer.indexRecords(recType='cce') + # esIndexer.indexRecords(recType='cce') esIndexer.indexRecords(recType='ccr') @@ -62,7 +60,7 @@ def parseArgs(): def loadConfig(): - with open('config.yaml', 'r') as yamlFile: + with open('config.yaml-dist', 'r') as yamlFile: config = yaml.safe_load(yamlFile) for section in config: sectionDict = config[section] @@ -75,13 +73,17 @@ if __name__ == '__main__': try: loadConfig() except FileNotFoundError: + print("Unable to set environment variables") pass from sessionManager import SessionManager from builder import CCEReader, CCEFile from renBuilder import CCRReader, CCRFile from esIndexer import ESIndexer - + print(args.time) + print(args.year) + print(args.exclude) + print(args.REINITIALIZE) main( secondsAgo=args.time, year=args.year, diff --git a/model/elastic.py b/model/elastic.py index 559f9f0..522bc7c 100644 --- a/model/elastic.py +++ b/model/elastic.py @@ -1,5 +1,6 @@ import os import yaml +import pprint from elasticsearch_dsl import ( Index, Document, @@ -16,14 +17,14 @@ class BaseDoc(Document): date_modified = Date() def save(self, **kwargs): - return super(BaseDoc, self).save(**kwargs) + return super(BaseDoc, self).save(** kwargs) class BaseInner(InnerDoc): date_created = Date() date_modified = Date() def save(self, **kwargs): - return super(BaseInner, self).save(**kwargs) + return super(BaseInner, self).save(** kwargs) class Registration(BaseInner): @@ -41,9 +42,10 @@ class Renewal(BaseDoc): rennum = Keyword() rendate = Date() title = Text(fields={'keyword': Keyword()}) - - claimants = Nested(Claimant) + authors = Text() + claimants = Nested(Claimant) + # pprint.pprint(dict(os.environ), width = 1) class Index: name = os.environ['ES_CCR_INDEX'] @@ -54,7 +56,6 @@ class CCE(BaseDoc): authors = Text(multi=True) publishers = Text(multi=True) lccns = Keyword(multi=True) - registrations = Nested(Registration) class Index: diff --git a/renBuilder.py b/renBuilder.py index ef76553..88a1c2d 100644 --- a/renBuilder.py +++ b/renBuilder.py @@ -14,9 +14,10 @@ from model.registration import Registration class CCRReader(): def __init__(self, manager): self.git = Github(os.environ['ACCESS_TOKEN']) + print(self.git) self.repo = self.git.get_repo(os.environ['CCR_REPO']) + print(self.repo) self.ccrYears = {} - self.dbManager = manager def loadYears(self, selectedYear, loadFromTime):