Merge pull request #1 from EbookFoundation/stevens_api

Stevens api: Added functionality to search across multiple fields (title, author, publisher)
2020-05-14 17:11:28 -04:00 · 2020-05-14 17:11:28 -04:00 · d2cd9283e8
parent 46dd2d8d75 6efe4397f1
commit d2cd9283e8
11 changed files with 333 additions and 27 deletions
--- a/api/app.py
+++ b/api/app.py
@ -2,13 +2,13 @@ import os
 import yaml
 from flask import Flask, jsonify
 from flasgger import Swagger
-from api.prints.swagger.swag import SwaggerDoc
-from api.db import db
-from api.elastic import elastic
-from api.prints import base, search, uuid
+from .prints.swagger.swag import SwaggerDoc
+from .db import db
+from .elastic import elastic
+from .prints import base, search, uuid

 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -37,6 +37,7 @@ application.config['ELASTICSEARCH_INDEX_URI'] = '{}:{}'.format(
    os.environ['ES_HOST'],
    os.environ['ES_PORT']
 )
+# print(application.config['ELASTICSEARCH_INDEX_URI'])
 application.config['SWAGGER'] = {'title': 'CCE Search'}
 db.init_app(application)
 elastic.init_app(application)
--- a/api/elastic.py
+++ b/api/elastic.py
@ -3,13 +3,18 @@ from elasticsearch_dsl import Search, Q

 class Elastic():
    def __init__(self):
-        self.client = None
+        self.client = Elasticsearch()
    
    def init_app(self, app):
-        self.client = Elasticsearch(app.config['ELASTICSEARCH_INDEX_URI'])
+        try:
+            self.client = Elasticsearch(hosts=app.config['ELASTICSEARCH_INDEX_URI'])
+        except ConnectionError as err:
+            print('Failed to connect to ElasticSearch instance')
+            raise err
    
    def create_search(self, index):
-        return Search(using=self.client, index=index)
+        s = Search(using=self.client, index=index)
+        return s

    def query_regnum(self, regnum, page=0, perPage=10):
        startPos, endPos = Elastic.getFromSize(page, perPage)
@ -26,10 +31,39 @@ class Elastic():
    
    def query_fulltext(self, queryText, page=0, perPage=10):
        startPos, endPos = Elastic.getFromSize(page, perPage)
-        print(startPos, endPos)
        search = self.create_search('cce,ccr')
        renewalSearch = search.query('query_string', query=queryText)[startPos:endPos]
        return renewalSearch.execute()
+
+    # New query types: single-field match searches (title, authors) and a combined multi-field search
+    def query_title(self, queryText,page=0, perPage=10):
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        search = self.create_search('cce,ccr')
+        titleSearch = search.query('match', title=queryText)[startPos:endPos]
+        return titleSearch.execute()
+
+    def query_author(self, queryText,page=0, perPage=10):
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        search = self.create_search('cce,ccr')
+        titleSearch = search.query('match', authors=queryText)[startPos:endPos]
+        return titleSearch.execute()
+
+
+    # When a publishers term is given, only the 'cce' index is searched and 'ccr' (renewals) is skipped -- open question: should renewals be checked too?
+    def query_multifields(self, params, page=0, perPage=10):
+        startPos, endPos = Elastic.getFromSize(page, perPage)
+        if "publishers" in params:
+            search = self.create_search('cce')
+            search = search.query('match', publishers=params["publishers"])
+        else:
+            search = self.create_search('cce,ccr')
+        if "title" in params:
+            search = search.query('match', title=params['title'])
+        if "authors" in params:
+            search = search.query('match', authors=params['authors'])
+        titleSearch = search[startPos:endPos]
+        return titleSearch.execute()
+
    
    @staticmethod
    def getFromSize(page, perPage):
@ -37,4 +71,4 @@ class Elastic():
        endPos = startPos + perPage
        return startPos, endPos

-elastic = Elastic()
+elastic = Elastic()
--- a/api/prints/search.py
+++ b/api/prints/search.py
@ -9,6 +9,131 @@ from api.response import MultiResponse

 search = Blueprint('search', __name__, url_prefix='/search')

+@search.route('/multi', methods=['GET'])
+def multiQuery():
+    title = request.args.get('title', '')
+    authors = request.args.get('authors', '')
+    publishers = request.args.get('publishers','')
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    queries = {}
+    if title!="*" and title!="":
+        queries["title"]=title
+    if authors!="*" and authors!="":
+        queries["authors"]=authors
+    if publishers!="*" and publishers!="":
+        queries["publishers"]=publishers
+    matchingDocs = elastic.query_multifields(queries, page=page, perPage=perPage)
+    textResponse = MultiResponse(
+        'text',
+        matchingDocs.hits.total,
+        request.base_url,
+        queries,
+        page,
+        perPage
+    )
+    qManager = QueryManager(db.session)
+    for entry in matchingDocs:
+        if entry.meta.index == 'cce':
+            dbEntry = qManager.registrationQuery(entry.uuid)
+            textResponse.addResult(MultiResponse.parseEntry(
+                dbEntry, 
+                xml=sourceReturn
+            ))
+        else:
+            try:
+                dbRenewal = qManager.renewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+            except NoResultFound:
+                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+
+    textResponse.createDataBlock()    
+    return jsonify(textResponse.createResponse(200))
+
+@search.route('/author', methods=['GET'])
+def authorQuery():
+    queryText = request.args.get('query', '')
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    matchingDocs = elastic.query_author(queryText, page=page, perPage=perPage)
+    textResponse = MultiResponse(
+        'text',
+        matchingDocs.hits.total,
+        request.base_url,
+        queryText,
+        page,
+        perPage
+    )
+    qManager = QueryManager(db.session)
+    for entry in matchingDocs:
+        if entry.meta.index == 'cce':
+            dbEntry = qManager.registrationQuery(entry.uuid)
+            textResponse.addResult(MultiResponse.parseEntry(
+                dbEntry, 
+                xml=sourceReturn
+            ))
+        else:
+            try:
+                dbRenewal = qManager.renewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+            except NoResultFound:
+                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+
+    textResponse.createDataBlock()    
+    return jsonify(textResponse.createResponse(200))
+
+@search.route('/title', methods=['GET'])
+def titleQuery():
+    queryText = request.args.get('query', '')
+    sourceReturn = request.args.get('source', False)
+    page, perPage = MultiResponse.parsePaging(request.args)
+    matchingDocs = elastic.query_title(queryText, page=page, perPage=perPage)
+    textResponse = MultiResponse(
+        'text',
+        matchingDocs.hits.total,
+        request.base_url,
+        queryText,
+        page,
+        perPage
+    )
+    qManager = QueryManager(db.session)
+    for entry in matchingDocs:
+        if entry.meta.index == 'cce':
+            dbEntry = qManager.registrationQuery(entry.uuid)
+            textResponse.addResult(MultiResponse.parseEntry(
+                dbEntry, 
+                xml=sourceReturn
+            ))
+        else:
+            try:
+                dbRenewal = qManager.renewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+            except NoResultFound:
+                dbRenewal = qManager.orphanRenewalQuery(entry.uuid)
+                textResponse.addResult(MultiResponse.parseRenewal(
+                    dbRenewal,
+                    source=sourceReturn
+                ))
+
+    textResponse.createDataBlock()    
+    return jsonify(textResponse.createResponse(200))

@search.route('/fulltext', methods=['GET'])
 def fullTextQuery():
@ -92,8 +217,7 @@ def renQuery(rennum):
    for entry in matchingDocs:
        dbRenewal = qManager.renewalQuery(entry.uuid)
        renResponse.extendResults(parseRetRenewal(
-            dbRenewal,
-            source=sourceReturn    
+            dbRenewal
        ))

    renResponse.createDataBlock()
--- a/api/prints/swagger/swag.py
+++ b/api/prints/swagger/swag.py
@ -23,6 +23,147 @@ class SwaggerDoc():
                "https"
            ],
            "paths": {
+                "/search/multi": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
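+                        "description": "Accepts any combination of title, authors and publishers query strings and searches registration and renewal records on the supplied fields; when publishers is supplied only registration records are searched",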
+                        "parameters": [
+                            {
+                                "name": "title",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "authors",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "publishers",
+                                "in": "query",
+                                "type": "string",
+                                "required": False,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "/search/author": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
+                        "description": "Accepts a query string to search across both registration and renewal records in the author field",
+                        "parameters": [
+                            {
+                                "name": "query",
+                                "in": "query",
+                                "type": "string",
+                                "required": True,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "/search/title": {
+                    "get": {
+                        "tags": ["Search"],
+                        "summary": "Returns a set of registration and renewal objects",
+                        "description": "Accepts a query string to search across both registration and renewal records in the title field",
+                        "parameters": [
+                            {
+                                "name": "query",
+                                "in": "query",
+                                "type": "string",
+                                "required": True,
+                                "default": "*"
+                            },{
+                                "name": "source",
+                                "in": "query",
+                                "type": "boolean",
+                                "required": False,
+                                "default": False,
+                                "description": "Return source XML/CSV data"
+                            },{
+                                "name": "page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 0
+                            },{
+                                "name": "per_page",
+                                "in": "query",
+                                "type": "number",
+                                "required": False,
+                                "default": 10
+                            }
+                        ],
+                        "responses": {
+                            200: {
+                                "description": "A list of copyright registrations and renewals",
+                                "schema": {
+                                    "$ref": "#/definitions/MultiResponse"
+                                }
+                            }
+                        }
+                    }
+                },
                "/search/fulltext": {
                    "get": {
                        "tags": ["Search"],
--- a/api/response.py
+++ b/api/response.py
@ -1,3 +1,4 @@
+import math

 class Response():
    def __init__(self, queryType, endpoint):
@ -151,7 +152,7 @@ class MultiResponse(Response):
        else:
            paging['next'] = None
        
-        lastPage = int((self.total - self.perPage) / self.perPage)
+        lastPage = math.ceil(((self.total - self.perPage) / self.perPage))
        if (
            self.page * self.perPage < self.total and 
            self.total > self.perPage
--- a/builder.py
+++ b/builder.py
@ -6,6 +6,13 @@ from lxml import etree
 import os
 import re
 import traceback
+import sys
+
+import io
+
+sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
+
+sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')

 from model.cce import CCE
 from model.errorCCE import ErrorCCE
--- a/config.yaml-dist
+++ b/config.yaml-dist
@ -7,12 +7,12 @@ DATABASE:

 GITHUB:
  ACCESS_TOKEN: 
-  CCE_REPO: 
-  CCR_REPO: 
+  CCE_REPO:
+  CCR_REPO:

 ELASTICSEARCH:
-  ES_CCE_INDEX: 
-  ES_CCR_INDEX: 
+  ES_CCE_INDEX:
+  ES_CCR_INDEX:
  ES_HOST: 
  ES_PORT: 
-  ES_TIMEOUT: 
+  ES_TIMEOUT:
--- a/esIndexer.py
+++ b/esIndexer.py
@ -148,6 +148,7 @@ class ESRen():
        self.renewal.rennum = self.dbRen.renewal_num
        self.renewal.rendate = self.dbRen.renewal_date
        self.renewal.title = self.dbRen.title
+        self.renewal.authors = self.dbRen.author
        self.renewal.claimants = [
            Claimant(name=c.name, claim_type=c.claimant_type)
            for c in self.dbRen.claimants
--- a/main.py
+++ b/main.py
@ -14,12 +14,10 @@ def main(secondsAgo=None, year=None, exclude=None, reinit=False):
    startTime = datetime.now()
    if secondsAgo is not None:
        loadFromTime = startTime - timedelta(seconds=secondsAgo)
-
    if exclude != 'cce':
        loadCCE(manager, loadFromTime, year)
    if exclude != 'ccr':
        loadCCR(manager, loadFromTime, year)
-    
    indexUpdates(manager, loadFromTime)
    
    manager.closeConnection()
@ -62,7 +60,7 @@ def parseArgs():


 def loadConfig():
-    with open('config.yaml', 'r') as yamlFile:
+    with open('config.yaml-dist', 'r') as yamlFile:
        config = yaml.safe_load(yamlFile)
        for section in config:
            sectionDict = config[section]
@ -75,6 +73,7 @@ if __name__ == '__main__':
    try:
        loadConfig()
    except FileNotFoundError:
+        print("Unable to set environment variables")
        pass

    from sessionManager import SessionManager
@ -87,4 +86,4 @@ if __name__ == '__main__':
        year=args.year,
        exclude=args.exclude,
        reinit=args.REINITIALIZE
-    )
+    )
--- a/model/elastic.py
+++ b/model/elastic.py
@ -41,9 +41,9 @@ class Renewal(BaseDoc):
    rennum = Keyword()
    rendate = Date()
    title = Text(fields={'keyword': Keyword()})
-    
-    claimants = Nested(Claimant)
+    authors = Text()

+    claimants = Nested(Claimant)
    class Index:
        name = os.environ['ES_CCR_INDEX']

@ -54,7 +54,6 @@ class CCE(BaseDoc):
    authors = Text(multi=True)
    publishers = Text(multi=True)
    lccns = Keyword(multi=True)
-
    registrations = Nested(Registration)

    class Index:
--- a/renBuilder.py
+++ b/renBuilder.py
@ -16,7 +16,6 @@ class CCRReader():
        self.git = Github(os.environ['ACCESS_TOKEN'])
        self.repo = self.git.get_repo(os.environ['CCR_REPO'])
        self.ccrYears = {}
-
        self.dbManager = manager

    def loadYears(self, selectedYear, loadFromTime):
@ -190,4 +189,4 @@ class CCRFile():
            except KeyError:
                pass
        print('No matching field found!')
-        raise KeyError
+        raise KeyError