Added new search functionality

2020-05-05 22:32:28 -04:00 · 2020-05-05 22:32:28 -04:00 · 1bec15f97f
parent 46dd2d8d75
commit 1bec15f97f
11 changed files with 359 additions and 38 deletions
--- a/api/app.py
+++ b/api/app.py
@ -2,13 +2,13 @@ import os
 import yaml
 from flask import Flask, jsonify
 from flasgger import Swagger
-from api.prints.swagger.swag import SwaggerDoc
-from api.db import db
-from api.elastic import elastic
-from api.prints import base, search, uuid
+from prints.swagger.swag import SwaggerDoc
+from db import db
+from elastic import elastic
+from prints import base, search, uuid

 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
    os.environ['ES_HOST'],
    os.environ['ES_PORT']
 )
+# print(application.config['ELASTICSEARCH_INDEX_URI'])
 application.config['SWAGGER'] = {'title': 'CCE Search'}
 db.init_app(application)
 elastic.init_app(application)
--- a/api/elastic.py
+++ b/api/elastic.py
@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q

 class Elastic():
    def __init__(self):
-        self.client = None
+        self.client = Elasticsearch()
    
    def init_app(self, app):
-        self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])
+        try:
+            self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
+        except ConnectionError as err:
+            print('Failed to connect to ElasticSearch instance')
+            raise err
    
    def create_search(self, index):
-        return Search(using=self.client, index=index)
+        s = Search(using=self.client, index=index)
+        return s

    def query_regnum(self, regnum, page=0, perPage=10):
        startPos, endPos = Elastic.getFromSize(page, perPage)
@ -30,6 +35,41 @@ class Elastic():
        search = self.create_search('cce,ccr')
        renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
        return renewalSearch.execute()
+
+    #New Query Types
+    def query_title(self, queryText, page=0, perPage=10):
+        """Run a `match` query against the `title` field.
+
+        Searches the cce and ccr indexes together.
+
+        Arguments:
+            queryText -- text to match against the title field
+            page, perPage -- paging values, converted into a result slice
+                by Elastic.getFromSize
+
+        Returns the response of executing the sliced search.
+        """
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        search = self.create_search('cce,ccr')
+        titleSearch = search.query('match', title=queryText)[startPos:endPos]
+        return titleSearch.execute()
+
+    def query_author(self, queryText, page=0, perPage=10):
+        """Run a `match` query against the `authors` field.
+
+        Searches the cce and ccr indexes together.
+
+        Arguments:
+            queryText -- text to match against the authors field
+            page, perPage -- paging values, converted into a result slice
+                by Elastic.getFromSize
+
+        Returns the response of executing the sliced search.
+        """
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        search = self.create_search('cce,ccr')
+        authorSearch = search.query('match', authors=queryText)[startPos:endPos]
+        return authorSearch.execute()
+
+
+    # If query is given for publisher field, don't check renewals?
+    def query_multifields(self, params, page=0, perPage=10):
+        """Search several fields at once with one `match` query per field.
+
+        Arguments:
+            params -- dict that may hold the keys "title", "authors" and
+                "publishers"; only the keys that are present are queried
+            page, perPage -- paging values, converted into a result slice
+                by Elastic.getFromSize
+
+        When "publishers" is present only the cce index is searched;
+        otherwise both cce and ccr are searched. If none of the three keys
+        is present the search is executed without any query clause.
+
+        Returns the response of executing the sliced search.
+        """
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        if "publishers" in params:
+            search = self.create_search('cce')
+            search = search.query('match', publishers=params["publishers"])
+        else:
+            search = self.create_search('cce,ccr')
+        if "title" in params:
+            search = search.query('match', title=params['title'])
+        if "authors" in params:
+            search = search.query('match', authors=params['authors'])
+        return search[startPos:endPos].execute()
+
    
    @staticmethod
    def getFromSize(page, perPage):
--- a/api/prints/search.py
+++ b/api/prints/search.py
@ -9,6 +9,132 @@ from api.response import MultiResponse

 search = Blueprint('search', __name__, url_prefix='/search')

+def _buildSearchResponse(matchingDocs, query, page, perPage, sourceReturn):
+    """Build the JSON response shared by the multi, author and title routes.
+
+    Arguments:
+        matchingDocs -- executed Elasticsearch response; its hits are
+            iterated and `hits.total` is passed on as the result count
+        query -- the query (string or dict) passed to MultiResponse
+        page, perPage -- paging values passed to MultiResponse
+        sourceReturn -- forwarded unchanged as `xml=` to parseEntry and as
+            `source=` to parseRenewal
+
+    NOTE(review): the routes read `source` with request.args.get('source',
+    False), which yields the raw query-string text when the parameter is
+    supplied, so `source=false` is a truthy string. Confirm how parseEntry
+    and parseRenewal interpret the value.
+    """
+    textResponse = MultiResponse(
+        'text',
+        matchingDocs.hits.total,
+        request.base_url,
+        query,
+        page,
+        perPage
+    )
+    qManager = QueryManager(db.session)
+    for entry in matchingDocs:
+        if entry.meta.index == 'cce':
+            dbEntry = qManager.registrationQuery(entry.uuid)
+            textResponse.addResult(MultiResponse.parseEntry(
+                dbEntry,
+                xml=sourceReturn
+            ))
+        else:
+            # A renewal with no result from renewalQuery is looked up
+            # again through orphanRenewalQuery.
+            try:
+                dbRenewal = qManager.renewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+            except NoResultFound:
+                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+
+    textResponse.createDataBlock()
+    return jsonify(textResponse.createResponse(200))
+
+
+@search.route('/multi', methods=['GET'])
+def multiQuery():
+    """Search by any combination of title, authors and publishers.
+
+    A parameter that is missing, empty or "*" is left out of the query.
+    """
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    queries = {}
+    for field in ('title', 'authors', 'publishers'):
+        value = request.args.get(field, '')
+        if value not in ('', '*'):
+            queries[field] = value
+    matchingDocs = elastic.query_multifields(queries, page=page, perPage=perPage)
+    return _buildSearchResponse(matchingDocs, queries, page, perPage, sourceReturn)
+
+
+@search.route('/author', methods=['GET'])
+def authorQuery():
+    """Search the authors field with the text given in `query`."""
+    queryText = request.args.get('query', '')
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    matchingDocs = elastic.query_author(queryText, page=page, perPage=perPage)
+    return _buildSearchResponse(matchingDocs, queryText, page, perPage, sourceReturn)
+
+
+@search.route('/title', methods=['GET'])
+def titleQuery():
+    """Search the title field with the text given in `query`."""
+    queryText = request.args.get('query', '')
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    matchingDocs = elastic.query_title(queryText, page=page, perPage=perPage)
+    return _buildSearchResponse(matchingDocs, queryText, page, perPage, sourceReturn)

@search.route('/fulltext', methods=['GET'])
 def fullTextQuery():
--- a/api/prints/swagger/swag.py
+++ b/api/prints/swagger/swag.py
@ -23,6 +23,147 @@ class SwaggerDoc():
                "https"
            ],
            "paths": {
+                "/search/multi": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
+                        "description": "Accepts optional title, authors and publishers query strings and searches registration and renewal records; when publishers is supplied only registration records are searched",
+                        "parameters": [
+                            {
+                                "name": "title",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "authors",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "publishers",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "/search/author": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
+                        "description": "Accepts a query string to search across both registration and renewal records in the author field",
+                        "parameters": [
+                            {
+                                "name": "query",
+                                "in": "query",
+                                "type": "string",
+                                "required": True,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "/search/title": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
+                        "description": "Accepts a query string to search across both registration and renewal records in the title field",
+                        "parameters": [
+                            {
+                                "name": "query",
+                                "in": "query",
+                                "type": "string",
+                                "required": True,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
                "/search/fulltext": {
                    "get": {
                        "tags": ["Search"],
--- a/api/response.py
+++ b/api/response.py
@ -1,3 +1,4 @@
+import math

 class Response():
    def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
        else:
            paging['next'] = None
        
-        lastPage = int((self.total - self.perPage) / self.perPage)
+        lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
        if (
            self.page * self.perPage < self.total and 
            self.total > self.perPage
--- a/builder.py
+++ b/builder.py
@ -6,6 +6,13 @@ from lxml import etree
 import os
 import re
 import traceback
+import sys
+
+import io
+
+# Re-wrap the standard streams so that text written to them is encoded as
+# UTF-8. detach() hands over the underlying binary buffer and leaves the
+# previous wrapper object unusable, so this runs once, at import time.
+sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
+
+# Same treatment for stderr.
+sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')

 from model.cce import CCE
 from model.errorCCE import ErrorCCE
--- a/config.yaml-dist
+++ b/config.yaml-dist
@ -1,18 +1,18 @@
 DATABASE:
-  DB_USER: 
-  DB_PSWD: 
-  DB_HOST: 
-  DB_PORT: 
-  DB_NAME: 
+  DB_USER: postgres
+  DB_PSWD: "9903"
+  DB_HOST: localhost
+  DB_PORT: "5432"
+  DB_NAME: ccesearch

 GITHUB:
+  # NOTE(review): ACCESS_TOKEN below looks like a real GitHub token committed
+  # to the -dist template, whose values were blank before this change.
+  # Presumably it should be revoked and the placeholder left blank; confirm.
-  ACCESS_TOKEN: 
-  CCE_REPO: 
-  CCR_REPO: 
+  ACCESS_TOKEN: 218124e9bf09a9b3f379cb5ee1ab0a8756ee3b3c
+  CCE_REPO: nypl/catalog_of_copyright_entries_project
+  CCR_REPO: nypl/cce-renewals

 ELASTICSEARCH:
-  ES_CCE_INDEX: 
-  ES_CCR_INDEX: 
-  ES_HOST: 
-  ES_PORT: 
-  ES_TIMEOUT: 
+  ES_CCE_INDEX: cce
+  ES_CCR_INDEX: ccr
+  ES_HOST: localhost
+  ES_PORT: '9200'
+  ES_TIMEOUT: "10000"
--- a/esIndexer.py
+++ b/esIndexer.py
@ -58,7 +58,7 @@ class ESIndexer():
        if self.client.indices.exists(index=self.ccr_index) is False:
            Renewal.init()
    
-    def indexRecords(self, recType='cce'):
+    def indexRecords(self, recType='ccr'):
        """Process the current batch of updating records. This utilizes the
        elasticsearch-py bulk helper to import records in chunks of the
        provided size. If a record in the batch errors that is reported and
@ -148,6 +148,7 @@ class ESRen():
        self.renewal.rennum = self.dbRen.renewal_num
        self.renewal.rendate = self.dbRen.renewal_date
        self.renewal.title = self.dbRen.title
+        self.renewal.authors = self.dbRen.author
        self.renewal.claimants = [
            Claimant(name=c.name, claim_type=c.claimant_type)
            for c in self.dbRen.claimants
--- a/main.py
+++ b/main.py
@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
    startTime = datetime.now()
    if secondsAgo is not None:
        loadFromTime = startTime - timedelta(seconds=secondsAgo)
-
-    if exclude != 'cce':
-        loadCCE(manager, loadFromTime, year)
-    if exclude != 'ccr':
-        loadCCR(manager, loadFromTime, year)
-    
+    # if exclude != 'cce':
+        # loadCCE(manager, loadFromTime, year)
+    # if exclude != 'ccr':
+    # loadCCR(manager, loadFromTime, year)
    indexUpdates(manager, loadFromTime)
    
    manager.closeConnection()
@ -39,7 +37,7 @@ def loadCCR(manager, loadFromTime, selectedYear):

 def indexUpdates(manager, loadFromTime):
    esIndexer = ESIndexer(manager, None)
-    esIndexer.indexRecords(recType='cce')
+    # esIndexer.indexRecords(recType='cce')
    esIndexer.indexRecords(recType='ccr')


@ -62,7 +60,7 @@ def parseArgs():


 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -75,13 +73,17 @@ if __name__ == '__main__':
    try:
        loadConfig()
    except FileNotFoundError:
+        print("Unable to set environment variables")
        pass

    from sessionManager import SessionManager
    from builder import CCEReader, CCEFile
    from renBuilder import CCRReader, CCRFile
    from esIndexer import ESIndexer
-
+    print(args.time)
+    print(args.year)
+    print(args.exclude)
+    print(args.REINITIALIZE)
    main(
        secondsAgo=args.time,
        year=args.year,
--- a/model/elastic.py
+++ b/model/elastic.py
@ -1,5 +1,6 @@
 import os
 import yaml
+import pprint
 from elasticsearch_dsl import (
    Index,
    Document,
@ -16,14 +17,14 @@ class BaseDoc(Document):
    date_modified = Date()

    def save(self, **kwargs):
-        return super(BaseDoc, self).save(**kwargs)
+        return super(BaseDoc, self).save(** kwargs)

 class BaseInner(InnerDoc):
    date_created = Date()
    date_modified = Date()

    def save(self, **kwargs):
-        return super(BaseInner, self).save(**kwargs)
+        return super(BaseInner, self).save(** kwargs)


 class Registration(BaseInner):
@ -41,9 +42,10 @@ class Renewal(BaseDoc):
    rennum = Keyword()
    rendate = Date()
    title = Text(fields={'keyword': Keyword()})
-    
-    claimants = Nested(Claimant)
+    authors = Text()

+    claimants = Nested(Claimant)
+    # pprint.pprint(dict(os.environ), width = 1) 
    class Index:
        name = os.environ['ES_CCR_INDEX']

@ -54,7 +56,6 @@ class CCE(BaseDoc):
    authors = Text(multi=True)
    publishers = Text(multi=True)
    lccns = Keyword(multi=True)
-
    registrations = Nested(Registration)

    class Index:
--- a/renBuilder.py
+++ b/renBuilder.py
@ -14,9 +14,10 @@ from model.registration import Registration
 class CCRReader():
    def __init__(self, manager):
        self.git = Github(os.environ['ACCESS_TOKEN'])
+        print(self.git)
        self.repo = self.git.get_repo(os.environ['CCR_REPO'])
+        print(self.repo)
        self.ccrYears = {}
-
        self.dbManager = manager

    def loadYears(self, selectedYear, loadFromTime):