Added new search functionality

2020-05-05 22:32:28 -04:00 · 2020-05-05 22:32:28 -04:00 · 1bec15f97f
parent 46dd2d8d75
commit 1bec15f97f
11 changed files with 359 additions and 38 deletions
--- a/api/app.py
+++ b/api/app.py
@ -2,13 +2,13 @@ import os
 import yaml
 from flask import Flask, jsonify
 from flasgger import Swagger
-from api.prints.swagger.swag import SwaggerDoc
+from prints.swagger.swag import SwaggerDoc
-from api.db import db
+from db import db
-from api.elastic import elastic
+from elastic import elastic
-from api.prints import base, search, uuid
+from prints import base, search, uuid
 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
    os.environ['ES_HOST'],
    os.environ['ES_PORT']
 )
 # print(application.config['ELASTICSEARCH_INDEX_URI'])
 application.config['SWAGGER'] = {'title': 'CCE Search'}
 db.init_app(application)
 elastic.init_app(application)
--- a/api/elastic.py
+++ b/api/elastic.py
@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q
 class Elastic():
    def __init__(self):
-        self.client = None
+        self.client = Elasticsearch()
    def init_app(self, app):
-        self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])
+        try:
            self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
        except ConnectionError as err:
            print('Failed to connect to ElasticSearch instance')
            raise err
    def create_search(self, index):
-        return Search(using=self.client, index=index)
+        s = Search(using=self.client, index=index)
        return s
    def query_regnum(self, regnum, page=0, perPage=10):
        startPos, endPos = Elastic.getFromSize(page, perPage)
@ -30,6 +35,41 @@ class Elastic():
        search = self.create_search('cce,ccr')
        renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
        return renewalSearch.execute()
    #New Query Types
    def query_title(self, queryText, page=0, perPage=10):
        """Search the ``title`` field of the ``cce`` and ``ccr`` indices.

        Builds a ``match`` query on ``title``, slices it to the requested
        page and executes it.

        Args:
            queryText: Text to match against the ``title`` field.
            page: Page number, passed to ``Elastic.getFromSize``.
            perPage: Page size, passed to ``Elastic.getFromSize``.

        Returns:
            The response returned by executing the sliced search.
        """
        # Function-scope import: the module's import block is left untouched.
        import logging

        log = logging.getLogger(__name__)
        startPos, endPos = Elastic.getFromSize(page, perPage)
        titleSearch = self.create_search('cce,ccr').query(
            'match', title=queryText
        )[startPos:endPos]
        # Diagnostics are opt-in through logging rather than written to
        # stdout on every request; the query is only serialized when the
        # DEBUG level is actually enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                'title query [%s:%s]: %s',
                startPos, endPos, titleSearch.to_dict()
            )
        return titleSearch.execute()
    def query_author(self, queryText, page=0, perPage=10):
        """Search the ``authors`` field of the ``cce`` and ``ccr`` indices.

        Builds a ``match`` query on ``authors``, slices it to the requested
        page and executes it.

        Args:
            queryText: Text to match against the ``authors`` field.
            page: Page number, passed to ``Elastic.getFromSize``.
            perPage: Page size, passed to ``Elastic.getFromSize``.

        Returns:
            The response returned by executing the sliced search.
        """
        # Function-scope import: the module's import block is left untouched.
        import logging

        log = logging.getLogger(__name__)
        startPos, endPos = Elastic.getFromSize(page, perPage)
        authorSearch = self.create_search('cce,ccr').query(
            'match', authors=queryText
        )[startPos:endPos]
        # Diagnostics are opt-in through logging rather than written to
        # stdout on every request; the query is only serialized when the
        # DEBUG level is actually enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                'author query [%s:%s]: %s',
                startPos, endPos, authorSearch.to_dict()
            )
        return authorSearch.execute()
    # If query is given for publisher field, don't check renewals?
    def query_multifields(self, params, page=0, perPage=10):
        """Search on any combination of publishers, title and authors.

        One ``match`` clause is added per supplied field, in the order
        publishers, title, authors. When ``publishers`` is present only the
        ``cce`` index is searched; otherwise both ``cce`` and ``ccr`` are.

        Args:
            params: Mapping of field name to query text. Only the keys
                ``publishers``, ``title`` and ``authors`` are read; any
                other key is ignored.
            page: Page number, passed to ``Elastic.getFromSize``.
            perPage: Page size, passed to ``Elastic.getFromSize``.

        Returns:
            The response returned by executing the sliced search. If none
            of the recognised keys is present, the search is executed
            without any query clause.
        """
        # Function-scope import: the module's import block is left untouched.
        import logging

        startPos, endPos = Elastic.getFromSize(page, perPage)

        # A publishers term narrows the search to the 'cce' index only.
        index = 'cce' if 'publishers' in params else 'cce,ccr'
        search = self.create_search(index)

        for field in ('publishers', 'title', 'authors'):
            if field in params:
                search = search.query('match', **{field: params[field]})

        # Opt-in diagnostics instead of printing to stdout on every request.
        logging.getLogger(__name__).debug(
            'multi-field query on %s [%s:%s], fields=%s',
            index, startPos, endPos, list(params)
        )
        return search[startPos:endPos].execute()
    @staticmethod
    def getFromSize(page, perPage):
--- a/api/prints/search.py
+++ b/api/prints/search.py
@ -9,6 +9,132 @@ from api.response import MultiResponse
 search = Blueprint('search', __name__, url_prefix='/search')
def _wantsSource(args):
    """Return True when the ``source`` query parameter is switched on.

    Query-string values always arrive as strings, so ``source=false`` must
    be parsed explicitly; treating any non-empty string as truthy would
    turn source output on for ``false`` and ``0`` as well.
    """
    flag = args.get('source', '')
    return flag.strip().lower() in ('1', 'true', 't', 'yes', 'y', 'on')


def _respondWithMatches(matchingDocs, query, page, perPage, sourceReturn):
    """Turn ElasticSearch hits into the JSON body shared by the routes below.

    Args:
        matchingDocs: Executed search response; iterated for hits and read
            for ``hits.total``.
        query: The query echoed into the ``MultiResponse`` (a string for
            single-field searches, a dict for the multi-field search).
        page: Page number returned by ``MultiResponse.parsePaging``.
        perPage: Page size returned by ``MultiResponse.parsePaging``.
        sourceReturn: Boolean forwarded to the entry/renewal parsers as
            their ``xml`` / ``source`` argument.

    Returns:
        The ``jsonify``-ed result of ``createResponse(200)``.
    """
    textResponse = MultiResponse(
        'text',
        matchingDocs.hits.total,
        request.base_url,
        query,
        page,
        perPage
    )

    qManager = QueryManager(db.session)
    for entry in matchingDocs:
        # Hits from the 'cce' index are registrations; everything else is
        # handled as a renewal.
        if entry.meta.index == 'cce':
            dbEntry = qManager.registrationQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseEntry(
                dbEntry,
                xml=sourceReturn
            ))
            continue

        try:
            dbRenewal = qManager.renewalQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))
        except NoResultFound:
            # Fall back to the orphan-renewal lookup when the regular
            # renewal path reports no row.
            dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
            textResponse.addResult(MultiResponse.parseRenewal(
                dbRenewal,
                source=sourceReturn
            ))

    textResponse.createDataBlock()
    return jsonify(textResponse.createResponse(200))


@search.route('/multi', methods=['GET'])
def multiQuery():
    """GET /search/multi: search by title, authors and/or publishers.

    Each of the ``title``, ``authors`` and ``publishers`` query parameters
    is optional; an empty value or ``*`` means "not supplied" and the field
    is left out of the search.
    """
    sourceReturn = _wantsSource(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    queries = {}
    for field in ('title', 'authors', 'publishers'):
        value = request.args.get(field, '')
        if value not in ('', '*'):
            queries[field] = value

    matchingDocs = elastic.query_multifields(
        queries, page=page, perPage=perPage
    )
    return _respondWithMatches(
        matchingDocs, queries, page, perPage, sourceReturn
    )


@search.route('/author', methods=['GET'])
def authorQuery():
    """GET /search/author: search records by the ``query`` author text."""
    queryText = request.args.get('query', '')
    sourceReturn = _wantsSource(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_author(
        queryText, page=page, perPage=perPage
    )
    return _respondWithMatches(
        matchingDocs, queryText, page, perPage, sourceReturn
    )


@search.route('/title', methods=['GET'])
def titleQuery():
    """GET /search/title: search records by the ``query`` title text."""
    queryText = request.args.get('query', '')
    sourceReturn = _wantsSource(request.args)
    page, perPage = MultiResponse.parsePaging(request.args)

    matchingDocs = elastic.query_title(
        queryText, page=page, perPage=perPage
    )
    return _respondWithMatches(
        matchingDocs, queryText, page, perPage, sourceReturn
    )
@search.route('/fulltext', methods=['GET'])
 def fullTextQuery():
--- a/api/prints/swagger/swag.py
+++ b/api/prints/swagger/swag.py
@ -23,6 +23,147 @@ class SwaggerDoc():
                "https"
            ],
            "paths": {
                "/search/multi": {
                    "get": {
                        "tags": ["Search"],
                        "summary": "Returns a set of registration and renewal objects",
                        "description": "Accepts optional title, authors and publishers terms and searches registration and renewal records on the fields supplied; when a publishers term is given only registration records are searched",
                        "parameters": [
                            {
                                "name": "title",
                                "in": "query",
                                "type": "string",
                                "required": False,
                                "default": "*"
                            },{
                                "name": "authors",
                                "in": "query",
                                "type": "string",
                                "required": False,
                                "default": "*"
                            },{
                                "name": "publishers",
                                "in": "query",
                                "type": "string",
                                "required": False,
                                "default": "*"
                            },{
                                "name": "source",
                                "in": "query",
                                "type": "boolean",
                                "required": False,
                                "default": False,
                                "description": "Return source XML/CSV data"
                            },{
                                "name": "page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 0
                            },{
                                "name": "per_page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 10
                            }
                        ],
                        "responses": {
                            200: {
                                "description": "A list of copyright registrations and renewals",
                                "schema": {
                                    "$ref": "#/definitions/MultiResponse"
                                }
                            }
                        }
                    }
                },
                "/search/author": {
                    "get": {
                        "tags": ["Search"],
                        "summary": "Returns a set of registration and renewal objects",
                        "description": "Accepts a query string to search across both registration and renewal records in the author field",
                        "parameters": [
                            {
                                "name": "query",
                                "in": "query",
                                "type": "string",
                                "required": True,
                                "default": "*"
                            },{
                                "name": "source",
                                "in": "query",
                                "type": "boolean",
                                "required": False,
                                "default": False,
                                "description": "Return source XML/CSV data"
                            },{
                                "name": "page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 0
                            },{
                                "name": "per_page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 10
                            }
                        ],
                        "responses": {
                            200: {
                                "description": "A list of copyright registrations and renewals",
                                "schema": {
                                    "$ref": "#/definitions/MultiResponse"
                                }
                            }
                        }
                    }
                },
                "/search/title": {
                    "get": {
                        "tags": ["Search"],
                        "summary": "Returns a set of registration and renewal objects",
                        "description": "Accepts a query string to search across both registration and renewal records in the title field",
                        "parameters": [
                            {
                                "name": "query",
                                "in": "query",
                                "type": "string",
                                "required": True,
                                "default": "*"
                            },{
                                "name": "source",
                                "in": "query",
                                "type": "boolean",
                                "required": False,
                                "default": False,
                                "description": "Return source XML/CSV data"
                            },{
                                "name": "page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 0
                            },{
                                "name": "per_page",
                                "in": "query",
                                "type": "number",
                                "required": False,
                                "default": 10
                            }
                        ],
                        "responses": {
                            200: {
                                "description": "A list of copyright registrations and renewals",
                                "schema": {
                                    "$ref": "#/definitions/MultiResponse"
                                }
                            }
                        }
                    }
                },
                "/search/fulltext": {
                    "get": {
                        "tags": ["Search"],
--- a/api/response.py
+++ b/api/response.py
@ -1,3 +1,4 @@
 import math
 class Response():
    def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
        else:
            paging['next'] = None
-        lastPage = int((self.total - self.perPage) / self.perPage)
+        lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
        if (
            self.page * self.perPage < self.total and 
            self.total > self.perPage
--- a/builder.py
+++ b/builder.py
@ -6,6 +6,13 @@ from lxml import etree
 import os
 import re
 import traceback
 import sys
 import io
 sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
 sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
 from model.cce import CCE
 from model.errorCCE import ErrorCCE
--- a/config.yaml-dist
+++ b/config.yaml-dist
@ -1,18 +1,18 @@
 DATABASE:
-  DB_USER: 
+  DB_USER: postgres
-  DB_PSWD: 
+  DB_PSWD: "9903"
-  DB_HOST: 
+  DB_HOST: localhost
-  DB_PORT: 
+  DB_PORT: "5432"
-  DB_NAME: 
+  DB_NAME: ccesearch
 GITHUB:
-  ACCESS_TOKEN: 
+  ACCESS_TOKEN: "<your-github-access-token>"  # never commit a real token; revoke any token that was pushed
-  CCE_REPO: 
+  CCE_REPO: nypl/catalog_of_copyright_entries_project
-  CCR_REPO: 
+  CCR_REPO: nypl/cce-renewals
 ELASTICSEARCH:
-  ES_CCE_INDEX: 
+  ES_CCE_INDEX: cce
-  ES_CCR_INDEX: 
+  ES_CCR_INDEX: ccr
-  ES_HOST: 
+  ES_HOST: localhost
-  ES_PORT: 
+  ES_PORT: '9200'
-  ES_TIMEOUT: 
+  ES_TIMEOUT: "10000"
--- a/esIndexer.py
+++ b/esIndexer.py
@ -58,7 +58,7 @@ class ESIndexer():
        if self.client.indices.exists(index=self.ccr_index) is False:
            Renewal.init()
-    def indexRecords(self, recType='cce'):
+    def indexRecords(self, recType='ccr'):
        """Process the current batch of updating records. This utilizes the
        elasticsearch-py bulk helper to import records in chunks of the
        provided size. If a record in the batch errors that is reported and
@ -148,6 +148,7 @@ class ESRen():
        self.renewal.rennum = self.dbRen.renewal_num
        self.renewal.rendate = self.dbRen.renewal_date
        self.renewal.title = self.dbRen.title
        self.renewal.authors = self.dbRen.author
        self.renewal.claimants = [
            Claimant(name=c.name, claim_type=c.claimant_type)
            for c in self.dbRen.claimants
--- a/main.py
+++ b/main.py
@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
    startTime = datetime.now()
    if secondsAgo is not None:
        loadFromTime = startTime - timedelta(seconds=secondsAgo)
-
+    # if exclude != 'cce':
-    if exclude != 'cce':
+        # loadCCE(manager, loadFromTime, year)
-        loadCCE(manager, loadFromTime, year)
+    # if exclude != 'ccr':
-    if exclude != 'ccr':
+    # loadCCR(manager, loadFromTime, year)
        loadCCR(manager, loadFromTime, year)
    indexUpdates(manager, loadFromTime)
    manager.closeConnection()
@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear):
 def indexUpdates(manager, loadFromTime):
    esIndexer = ESIndexer(manager, None)
-    esIndexer.indexRecords(recType='cce')
+    # esIndexer.indexRecords(recType='cce')
    esIndexer.indexRecords(recType='ccr')
@ -62,7 +60,7 @@ def parseArgs():
 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -75,13 +73,17 @@ if __name__ == '__main__':
    try:
        loadConfig()
    except FileNotFoundError:
        print("Unable to set environment variables")
        pass
    from sessionManager import SessionManager
    from builder import CCEReader, CCEFile
    from renBuilder import CCRReader, CCRFile
    from esIndexer import ESIndexer
-
+    print(args.time)
    print(args.year)
    print(args.exclude)
    print(args.REINITIALIZE)
    main(
        secondsAgo=args.time,
        year=args.year,
--- a/model/elastic.py
+++ b/model/elastic.py
@ -1,5 +1,6 @@
 import os
 import yaml
 import pprint
 from elasticsearch_dsl import (
    Index,
    Document,
@ -16,14 +17,14 @@ class BaseDoc(Document):
    date_modified = Date()
    def save(self, **kwargs):
-        return super(BaseDoc, self).save(**kwargs)
+        return super(BaseDoc, self).save(** kwargs)
 class BaseInner(InnerDoc):
    date_created = Date()
    date_modified = Date()
    def save(self, **kwargs):
-        return super(BaseInner, self).save(**kwargs)
+        return super(BaseInner, self).save(** kwargs)
 class Registration(BaseInner):
@ -41,9 +42,10 @@ class Renewal(BaseDoc):
    rennum = Keyword()
    rendate = Date()
    title = Text(fields={'keyword': Keyword()})
-    
+    authors = Text()
    claimants = Nested(Claimant)
    claimants = Nested(Claimant)
    # pprint.pprint(dict(os.environ), width = 1) 
    class Index:
        name = os.environ['ES_CCR_INDEX']
@ -54,7 +56,6 @@ class CCE(BaseDoc):
    authors = Text(multi=True)
    publishers = Text(multi=True)
    lccns = Keyword(multi=True)
    registrations = Nested(Registration)
    class Index:
--- a/renBuilder.py
+++ b/renBuilder.py
@ -14,9 +14,10 @@ from model.registration import Registration
 class CCRReader():
    def __init__(self, manager):
        """Set up the GitHub client, the renewals repository and reader state.

        Reads ``ACCESS_TOKEN`` and ``CCR_REPO`` from the environment (a
        missing variable raises ``KeyError``), prints the client and the
        repository object as they are obtained, and starts with an empty
        ``ccrYears`` mapping.

        Args:
            manager: Object stored as ``self.dbManager``.
        """
        client = Github(os.environ['ACCESS_TOKEN'])
        self.git = client
        print(client)

        repository = client.get_repo(os.environ['CCR_REPO'])
        self.repo = repository
        print(repository)

        self.dbManager = manager
        self.ccrYears = {}
    def loadYears(self, selectedYear, loadFromTime):