[Draft] OAP-32 Ngram Caching (#18)

* start caching ngrams

* fix build warnings

* add timestamp

* resolve comments

* pull out mogrify

* remove pytest from hook for now
Celina Peralta 2022-11-02 23:07:56 -04:00 committed by GitHub
parent ccbdda287e
commit 4333d4fcc3
11 changed files with 287 additions and 143 deletions

View File

@@ -2,4 +2,4 @@
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9
ignore = E203, E266, E501, W503, F403, F401
ignore = E203, E266, E501, W503, F403, F401, W291

View File

@@ -23,6 +23,5 @@ jobs:
pipenv run isort --profile black src/
pipenv run black --check src/ --exclude="lib/*"
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
pipenv run pytest

View File

@@ -20,6 +20,8 @@ def get_connection():
db_version = cur.fetchone()
print(db_version)
# TODO: Register adapters for suggestion and ngram types
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
@@ -33,3 +35,6 @@ def close_connection(conn):
if conn is not None:
conn.close()
print("Database connection closed.")
connection = get_connection()

View File

@@ -1,54 +1,140 @@
from logging import Logger
from typing import List
from model.oapen_types import OapenSuggestion
import psycopg2
from data.connection import connection
from model.oapen_types import OapenNgram
def table_exists(connection, table):
def mogrify_ngrams(ngrams) -> str:
cursor = connection.cursor()
args = ",".join(
cursor.mogrify("(%s,%s::oapen_suggestions.ngram[])", x).decode("utf-8")
for x in ngrams
)
return args
def mogrify_suggestions(suggestions):
cursor = connection.cursor()
args = ",".join(
cursor.mogrify("(%s,%s,%s::oapen_suggestions.suggestion[])", x).decode("utf-8")
for x in suggestions
)
return args
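For context: psycopg2's cursor.mogrify() binds parameters client-side and returns the exact bytes that would be sent to the server, so joining the fragments yields one multi-row VALUES clause. A minimal sketch of the pattern used above, assuming a live connection (the DSN, handle, and counts are invented):

    import psycopg2

    conn = psycopg2.connect("dbname=oapen")  # illustrative DSN
    cur = conn.cursor()
    rows = [("20.500.12657/23095", [("the quick brown", 4)])]
    # each tuple is rendered to an SQL literal like ('20.500...', ARRAY[...]::oapen_suggestions.ngram[])
    args = ",".join(
        cur.mogrify("(%s,%s::oapen_suggestions.ngram[])", row).decode("utf-8")
        for row in rows
    )
    cur.execute(f"INSERT INTO oapen_suggestions.ngrams (handle, ngrams) VALUES {args}")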
def table_exists(table):
cursor = connection.cursor()
cursor.execute(
"""
query = """
SELECT EXISTS (
SELECT * FROM oapen_suggestions.tables WHERE table_name=%s
)
""",
(table),
)
res = cursor.fetchone()[0]
cursor.close()
return res
def add_suggestion(connection, suggestion: OapenSuggestion) -> None:
cursor = connection.cursor()
query = """
INSERT INTO oapen_suggestions.suggestions VALUES (%s, %s, %s)
"""
try:
cursor.execute(query, suggestion)
except Exception as ex:
Logger.exception(ex)
cursor.execute(query, (table,))
res = cursor.fetchone()[0]
return res
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
def add_many_suggestions(connection, suggestions) -> None:
def add_single_suggestion(suggestion) -> None:
cursor = connection.cursor()
query = """
INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
VALUES (%s, %s, %s::oapen_suggestions.suggestion[])
ON CONFLICT (handle)
DO
UPDATE SET suggestions = excluded.suggestions
"""
args_str = ",".join(
cursor.mogrify("(%s,%s,%s::suggestion[])", x).decode("utf-8")
for x in suggestions
)
try:
cursor.execute(query, (suggestion[0], suggestion[1], suggestion[2]))
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
def add_many_suggestions(suggestions) -> None:
cursor = connection.cursor()
args = mogrify_suggestions(suggestions)
query = f"""
INSERT INTO oapen_suggestions.suggestions VALUES {args_str}
INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
VALUES {args}
ON CONFLICT (handle)
DO
UPDATE SET suggestions = excluded.suggestions
"""
try:
cursor.execute(query)
except Exception as ex:
Logger.exception(ex)
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
def add_single_ngrams(ngram) -> None:
cursor = connection.cursor()
query = """
INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
VALUES (%s, %s::oapen_suggestions.ngram[])
ON CONFLICT (handle)
DO
UPDATE SET ngrams = excluded.ngrams
"""
try:
cursor.execute(query, (ngram[0], ngram[1]))
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
def add_many_ngrams(ngrams) -> None:
cursor = connection.cursor()
args = mogrify_ngrams(ngrams)
query = f"""
INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
VALUES {args}
ON CONFLICT (handle)
DO
UPDATE SET ngrams = excluded.ngrams
"""
try:
cursor.execute(query)
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
def get_all_ngrams() -> List[OapenNgram]:
cursor = connection.cursor()
query = """
SELECT * FROM oapen_suggestions.ngrams
"""
try:
ngrams: List[OapenNgram] = []
cursor.execute(query)
records = cursor.fetchall()
# NOTE: stub - inspects only the first record; parsing rows into
# OapenNgram tuples is blocked on the adapter TODO in connection setup
for i in range(1):
# print(records[i])
print(type(records[i][0]))
print(type(records[i][1]))
return ngrams
except (Exception, psycopg2.Error) as error:
print(error)
finally:
cursor.close()
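Putting the new helpers together, a caller batches (handle, ngrams) rows and upserts them in one statement; a usage sketch with invented handles and counts:

    from data.oapen_db import add_many_ngrams, get_all_ngrams

    # each row is (handle, [(ngram, count), ...]), sorted by count
    rows = [
        ("20.500.12657/33907", [("immersion into noise", 12), ("into noise the", 7)]),
        ("20.500.12657/23095", [("ambisonics in practice", 9)]),
    ]
    add_many_ngrams(rows)  # one INSERT ... ON CONFLICT DO UPDATE for the whole batch
    get_all_ngrams()       # for now just prints column types; adapters are still a TODO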

View File

@@ -1,8 +1,99 @@
from model.ngrams import run_ngrams
import data.oapen as OapenAPI
import data.oapen_db as OapenDB
import model.ngrams as Model
from data.connection import close_connection, connection
demo_books = {
# should be similar
"Quality Management and Accounting in Service Industries": "20.500.12657/54327",
"Management Accountants Business Orientation and Involvement in Incentive Compensation": "20.500.12657/26999",
# should be similar but different from first group
"Immersion Into Noise": "20.500.12657/33907",
"Ambisonics": "20.500.12657/23095",
}
def test_functions():
data = OapenAPI.get_collection_items_by_label(
"Austrian Science Fund (FWF)", limit=100
)
# Uncomment to print raw text of first book
# for item in data:
# print(item.get_text_bitstream())
# break
df = Model.make_df(data)
print(df.shape)
print(df)
sample_list = Model.get_text_by_handle(df, df.iloc[0].handle)
print(sample_list[:10])
sample_ngram_list = Model.generate_ngram_by_handle(df, df.iloc[0].handle, 3)
print(Model.get_n_most_occuring(sample_ngram_list, 2))
def run_demo():
items = []
ngram_dict = {}
print("---------------------------------")
for name, handle in demo_books.items():
item = OapenAPI.get_item(handle)
items.append(item)
text = Model.process_text(item.get_text())
print(f" {name}: text array\n{text[:30]}...\n")
ngram_dict[handle] = Model.generate_ngram(text, 3)
print(
f" {name}: ngram dictionary\n {list(ngram_dict[handle].items())[:30]}..."
)
print("---------------------------------")
for name, handle in demo_books.items():
print(f"Showing similarity scores for all books relative to {name}:\n")
for name2, handle2 in demo_books.items():
# if handle == handle2: # don't check self
# continue
simple_similarity_score = 100 * Model.get_similarity_score(
ngram_dict[handle], ngram_dict[handle2], n=10000
)
print(
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
)
dict_similarity_score = 100 * Model.get_similarity_score_by_dict_count(
ngram_dict[handle], ngram_dict[handle2]
)
print(
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
)
print()
def run_caching_test():
items = []
for name, handle in demo_books.items():
item = OapenAPI.get_item(handle)
items.append(item)
Model.cache_ngrams_from_items(items)
def run_ngrams():
# run_demo()
run_caching_test()
def main():
run_ngrams()
OapenDB.get_all_ngrams()
close_connection(connection)
return

View File

@@ -1,14 +1,17 @@
import string
from typing import List
import data.oapen as OapenAPI # pylint: disable=import-error
import data.oapen_db as OapenDB
import model.stopwords as oapen_stopwords # pylint: disable=import-error
import nltk # pylint: disable=import-error
import pandas as pd # pylint: disable=import-error
from nltk import word_tokenize # pylint: disable=import-error
from nltk.corpus import stopwords # pylint: disable=import-error
from .oapen_types import OapenItem # pylint: disable=relative-beyond-top-level
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
NgramDict,
OapenItem,
)
nltk.download("stopwords")
@@ -46,33 +49,35 @@ def get_text_by_handle(df, handle):
return df.loc[df.handle == handle].text[0]
def generate_ngram(text, n):
def sort_ngrams_by_count(ngrams: NgramDict):
return sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
def generate_ngram(text, n=3) -> NgramDict:
ngrams = {}
# store appearance count of each ngram
for index in range(0, len(text) + 1 - n):
ngram = " ".join(text[index : index + n])
ngrams.setdefault(ngram, 0) # sets curr ngram to 0 if non-existent
ngrams[ngram] += 1
return dict(
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
) # return sorted by count
return dict(sort_ngrams_by_count(ngrams)) # return sorted by count
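As a quick illustration of the sliding window in generate_ngram (token list invented):

    text = ["the", "cat", "sat", "on", "the", "cat", "sat"]
    generate_ngram(text, 3)
    # windows: "the cat sat", "cat sat on", "sat on the", "on the cat", "the cat sat"
    # -> {"the cat sat": 2, "cat sat on": 1, "sat on the": 1, "on the cat": 1}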
def generate_ngram_by_handle(df, handle, n):
def generate_ngram_by_handle(df, handle, n=3) -> NgramDict:
text = get_text_by_handle(df, handle)
return generate_ngram(text, n)
def get_n_most_occuring(dic: dict, n=100):
def get_n_most_occuring(dic: NgramDict, n=100):
sorted_dict = dict(
sorted(dic.items(), key=lambda item: item[1], reverse=True)
sort_ngrams_by_count(dic)
) # sorts in case of additions post generate_ngram
return list(sorted_dict)[:n]
# Currently, this uses the n most occurring ngrams to compare
# This could also count the instances of the highest-occurring ngrams
def get_similarity_score(ngram1, ngram2, n=100):
def get_similarity_score(ngram1: NgramDict, ngram2: NgramDict, n=100) -> float:
n_most_occ_1 = get_n_most_occuring(ngram1, n)
n_most_occ_2 = get_n_most_occuring(ngram2, n)
repeated = 0
@@ -86,7 +91,7 @@ def get_similarity_score(ngram1, ngram2, n=100):
# 100% similarity score if all ngrams match from book 1
# this means that a fragment of a book will get a 100% similarity score
# when compared to its own book, but not the reverse interaction
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
def get_similarity_score_by_dict_count(ngrams1: NgramDict, ngrams2: NgramDict) -> float:
repeated = 0
total = sum(ngrams1.values()) # gets counts from book 1
for key, ngrams1_value in ngrams1.items():
@@ -98,76 +103,16 @@ def get_similarity_score_by_dict_count(ngrams1, ngrams2):
return repeated / total
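A worked example of the dict-count score, assuming the loop body elided by this hunk adds ngrams1's count for every key that also appears in ngrams2 (numbers invented):

    ngrams1 = {"the cat sat": 2, "cat sat on": 1}  # total = 3
    ngrams2 = {"the cat sat": 5, "sat on the": 1}
    # only "the cat sat" is shared, so repeated = 2
    get_similarity_score_by_dict_count(ngrams1, ngrams2)  # -> 2 / 3 ≈ 0.67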
# to demo some functions
def test_functions():
data = OapenAPI.get_collection_items_by_label(
"Austrian Science Fund (FWF)", limit=100
)
# Uncomment to print raw text of first book
# for item in data:
# print(item.get_text_bitstream())
# break
df = make_df(data)
print(df.shape)
print(df)
sample_list = get_text_by_handle(df, df.iloc[0].handle)
print(sample_list[:10])
sample_ngram_list = generate_ngram_by_handle(df, df.iloc[0].handle, 3)
print(get_n_most_occuring(sample_ngram_list, 2))
# @params: handle = handle of item; ngrams = {str : int}
def cache_ngrams(handle: str, ngrams: NgramDict):
OapenDB.add_single_ngrams((handle, list(sort_ngrams_by_count(ngrams))))
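So a call like the following (data invented) sorts the dict once and passes the row shape oapen_db expects to add_single_ngrams:

    cache_ngrams("20.500.12657/23095", {"cat sat on": 1, "the cat sat": 2})
    # upserts ("20.500.12657/23095", [("the cat sat", 2), ("cat sat on", 1)])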
# run demo with the above titles
def run_demo():
demo_books = {
# should be similar
"Quality Management and Accounting in Service Industries": "20.500.12657/54327",
"Management Accountants Business Orientation and Involvement in Incentive Compensation": "20.500.12657/26999",
# should be similar but different from first group
"Immersion Into Noise": "20.500.12657/33907",
"Ambisonics": "20.500.12657/23095",
}
items = []
ngram_dict = {}
print("---------------------------------")
for name, handle in demo_books.items():
item = OapenAPI.get_item(handle)
items.append(item)
def cache_ngrams_from_items(items: List[OapenItem], n=3):
rows = []
for item in items:
text = process_text(item.get_text())
print(f" {name}: text array\n{text[:30]}...\n")
ngrams = generate_ngram(text, n)
rows.append((item.handle, list(sort_ngrams_by_count(ngrams))))
ngram_dict[handle] = generate_ngram(text, 3)
print(
f" {name}: ngram dictionary\n {list(ngram_dict[handle].items())[:30]}..."
)
print("---------------------------------")
for name, handle in demo_books.items():
print(f"Showing similarity scores for all books relative to {name}:\n")
for name2, handle2 in demo_books.items():
# if handle == handle2: # don't check self
# continue
simple_similarity_score = 100 * get_similarity_score(
ngram_dict[handle], ngram_dict[handle2], n=10000
)
print(
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
)
dict_similarity_score = 100 * get_similarity_score_by_dict_count(
ngram_dict[handle], ngram_dict[handle2]
)
print(
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
)
print()
def run_ngrams():
run_demo()
OapenDB.add_many_ngrams(rows)
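In use, cache_ngrams_from_items takes fetched OapenItem objects, generates trigrams for each, and bulk-upserts one row per item; a sketch mirroring run_caching_test (handle taken from the demo set):

    items = [OapenAPI.get_item("20.500.12657/23095")]
    cache_ngrams_from_items(items, n=3)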

View File

@@ -1,5 +1,4 @@
from collections import namedtuple
from typing import List
from typing import Dict, List, Tuple
import data.oapen as OapenAPI
@@ -18,8 +17,10 @@ class OapenItem:
return OapenAPI.get_bitstream_text(self.bitstreams)
OapenSuggestion = ("OapenSuggestion", ["handle", "rank"])
OapenNgram = ("OapenNgram", ["handle", "ngrams"])
OapenSuggestion = Tuple[str, float]
OapenNgram = Tuple[str, List[Tuple[str, int]]]
NgramDict = Dict[str, int]
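Concretely, values of the new aliases look like this (sample data invented):

    suggestion: OapenSuggestion = ("20.500.12657/54327", 0.42)  # (handle, similarity)
    ngram: OapenNgram = ("20.500.12657/23095", [("the cat sat", 2)])
    counts: NgramDict = {"the cat sat": 2, "cat sat on": 1}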
def transform_item_data(item) -> OapenItem:

View File

@@ -5,17 +5,35 @@ def create_schema(connection) -> None:
cursor = connection.cursor()
cursor.execute(
"""
CREATE TYPE suggestion AS (handle text, rank int);
CREATE SCHEMA oapen_suggestions
CREATE SCHEMA oapen_suggestions;
CREATE TYPE oapen_suggestions.suggestion AS (handle text, similarity float);
CREATE TYPE oapen_suggestions.ngram AS (ngram text, count int);
CREATE OR REPLACE FUNCTION update_modtime()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = now();
RETURN NEW;
END;
$$ language 'plpgsql';
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
handle text PRIMARY KEY,
name text,
suggestions suggestion[]
suggestions oapen_suggestions.suggestion[],
created_at timestamp default current_timestamp,
updated_at timestamp default current_timestamp
);
CREATE TABLE oapen_suggestions.ngrams (
handle text PRIMARY KEY,
ngrams text[]
ngrams oapen_suggestions.ngram[],
created_at timestamp default current_timestamp,
updated_at timestamp default current_timestamp
);
CREATE TRIGGER update_suggestion_modtime BEFORE UPDATE ON oapen_suggestions.suggestions FOR EACH ROW EXECUTE PROCEDURE update_modtime();
CREATE TRIGGER update_ngrams_modtime BEFORE UPDATE ON oapen_suggestions.ngrams FOR EACH ROW EXECUTE PROCEDURE update_modtime();
"""
)
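The update_modtime trigger stamps updated_at on every UPDATE, so upserted rows keep created_at while updated_at moves forward; a hedged sketch of the observable effect (assumes the schema above and the module-level connection from data.connection):

    from data.connection import connection

    cur = connection.cursor()
    cur.execute("INSERT INTO oapen_suggestions.ngrams (handle, ngrams) VALUES ('h1', '{}')")
    connection.commit()  # now() is per-transaction, so commit before updating
    cur.execute("UPDATE oapen_suggestions.ngrams SET ngrams = '{}' WHERE handle = 'h1'")
    cur.execute("SELECT created_at, updated_at FROM oapen_suggestions.ngrams WHERE handle = 'h1'")
    created, updated = cur.fetchone()
    assert updated > created  # the BEFORE UPDATE trigger ran on the second statement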
@@ -27,9 +45,10 @@ def drop_schema(connection) -> None:
cursor.execute(
"""
DROP SCHEMA IF EXISTS oapen_suggestions CASCADE;
DROP TABLE IF EXISTS suggestions CASCADE;
DROP TABLE IF EXISTS ngrams CASCADE;
DROP TYPE IF EXISTS suggestion CASCADE;
DROP TABLE IF EXISTS oapen_suggestions.suggestions CASCADE;
DROP TABLE IF EXISTS oapen_suggestions.ngrams CASCADE;
DROP TYPE IF EXISTS oapen_suggestions.suggestion CASCADE;
DROP TYPE IF EXISTS oapen_suggestions.ngram CASCADE;
"""
)

View File

@@ -1,7 +1,7 @@
from typing import List
import data.oapen as OapenAPI
from data.connection import get_connection
from data.connection import close_connection, connection
from data.oapen_db import add_many_suggestions
from model.oapen_types import OapenItem
@@ -12,15 +12,13 @@ def mock_suggestion_rows(n=10):
)
rows = []
for i in range(min(10, len(items))):
for i in range(min(30, len(items))):
rows.append((items[i].handle, items[i].name, [(items[i].handle, i)]))
return rows
connection = get_connection()
rows = mock_suggestion_rows(30)
add_many_suggestions(rows)
rows = mock_suggestion_rows(connection)
add_many_suggestions(connection, rows)
connection.close()
close_connection(connection)

View File

@@ -1,6 +1,6 @@
from typing import List
import data.oapen as OapenAPI
import src.data.oapen as OapenAPI
from model.oapen_types import OapenItem

View File

@@ -8,7 +8,7 @@ skip=[lib/]
profile=black
[flake8]
ignore = E203, E266, E501, W503, E501
ignore = E203, E266, E501, W503, E501, W291
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4