[Draft] OAP-32 Ngram Caching (#18)

* start caching ngrams

* fix build warnings

* add timestamp

* resolve comments

* pull out mogrify

* remove pytest from hook for now
Celina Peralta 2022-11-02 23:07:56 -04:00 committed by GitHub
parent ccbdda287e
commit 4333d4fcc3
11 changed files with 287 additions and 143 deletions

View File

@ -2,4 +2,4 @@
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9
ignore = E203, E266, E501, W503, F403, F401
ignore = E203, E266, E501, W503, F403, F401, W291

View File

@ -23,6 +23,5 @@ jobs:
pipenv run isort --profile black src/
pipenv run black --check src/ --exclude="lib/*"
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
pipenv run pytest

View File

@ -20,6 +20,8 @@ def get_connection():
db_version = cur.fetchone()
print(db_version)
# TODO: Register adapters for suggestion and ngram types
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
@ -33,3 +35,6 @@ def close_connection(conn):
if conn is not None:
conn.close()
print("Database connection closed.")
connection = get_connection()
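Side note on the TODO above: psycopg2 ships a composite-type helper that could back that adapter registration. A minimal sketch (not part of this commit), assuming the types created in create_schema:

import psycopg2.extras

def register_adapters(conn):
    # Cast fetched composite values (and their arrays) into Python tuples
    # instead of raw strings.
    psycopg2.extras.register_composite("oapen_suggestions.suggestion", conn)
    psycopg2.extras.register_composite("oapen_suggestions.ngram", conn)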

View File

@ -1,54 +1,140 @@
from logging import Logger
from typing import List
from model.oapen_types import OapenSuggestion
import psycopg2
from data.connection import connection
from model.oapen_types import OapenNgram
def table_exists(connection, table):
    cursor = connection.cursor()
    cursor.execute(
        """
        SELECT EXISTS (
            SELECT * FROM oapen_suggestions.tables WHERE table_name=%s
        )
        """,
        (table,),
    )
    res = cursor.fetchone()[0]
    cursor.close()
    return res
def mogrify_ngrams(ngrams) -> str:
    cursor = connection.cursor()
    args = ",".join(
        cursor.mogrify("(%s,%s::oapen_suggestions.ngram[])", x).decode("utf-8")
        for x in ngrams
    )
    return args

def mogrify_suggestions(suggestions):
    cursor = connection.cursor()
    args = ",".join(
        cursor.mogrify("(%s,%s,%s::oapen_suggestions.suggestion[])", x).decode("utf-8")
        for x in suggestions
    )
    return args

def table_exists(table):
    cursor = connection.cursor()
    query = """
        SELECT EXISTS (
            SELECT * FROM oapen_suggestions.tables WHERE table_name=%s
        )
        """
    try:
        cursor.execute(query, (table,))
        res = cursor.fetchone()[0]
        return res
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()
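For illustration only (not in the diff): mogrify_suggestions expands each row into an escaped SQL literal that can be spliced into a single INSERT. With made-up values:

rows = [("20.500.12657/23095", "Ambisonics", [("20.500.12657/33907", 0.82)])]
print(mogrify_suggestions(rows))
# roughly: ('20.500.12657/23095','Ambisonics',ARRAY[('20.500.12657/33907',0.82)]::oapen_suggestions.suggestion[])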
def add_single_suggestion(suggestion) -> None:
    cursor = connection.cursor()
    query = """
        INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
        VALUES (%s, %s, %s::oapen_suggestions.suggestion[])
        ON CONFLICT (handle)
        DO
            UPDATE SET suggestions = excluded.suggestions
        """
    try:
        cursor.execute(query, (suggestion[0], suggestion[1], suggestion[2]))
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()

def add_many_suggestions(suggestions) -> None:
    cursor = connection.cursor()
    args = mogrify_suggestions(suggestions)
    query = f"""
        INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
        VALUES {args}
        ON CONFLICT (handle)
        DO
            UPDATE SET suggestions = excluded.suggestions
        """
    try:
        cursor.execute(query)
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()
def add_single_ngrams(ngram) -> None:
    cursor = connection.cursor()
    query = """
        INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
        VALUES (%s, %s::oapen_suggestions.ngram[])
        ON CONFLICT (handle)
        DO
            UPDATE SET ngrams = excluded.ngrams
        """
    try:
        cursor.execute(query, (ngram[0], ngram[1]))
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()
def add_many_ngrams(ngrams) -> None:
    cursor = connection.cursor()
    args = mogrify_ngrams(ngrams)
    query = f"""
        INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
        VALUES {args}
        ON CONFLICT (handle)
        DO
            UPDATE SET ngrams = excluded.ngrams
        """
    try:
        cursor.execute(query)
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()
def get_all_ngrams() -> List[OapenNgram]:
    cursor = connection.cursor()
    query = """
        SELECT * FROM oapen_suggestions.ngrams
        """
    try:
        ngrams: List[OapenNgram] = []
        cursor.execute(query)
        records = cursor.fetchall()
        # TODO: register adapters so the ngram[] column comes back as
        # typed tuples instead of raw strings
        for record in records:
            ngrams.append((record[0], record[1]))
        return ngrams
    except (Exception, psycopg2.Error) as error:
        print(error)
    finally:
        cursor.close()
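A hypothetical round trip through the new helpers, assuming the schema from create_schema already exists (handles and counts made up):

ngrams = [
    ("20.500.12657/33907", [("into noise", 12), ("immersion into", 9)]),
    ("20.500.12657/23095", [("sound field", 21), ("ambisonic order", 14)]),
]
add_many_ngrams(ngrams)  # one INSERT, upserting on handle
cached = get_all_ngrams()  # every cached (handle, ngrams) pair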

View File

@ -1,8 +1,99 @@
from model.ngrams import run_ngrams
import data.oapen as OapenAPI
import data.oapen_db as OapenDB
import model.ngrams as Model
from data.connection import close_connection, connection
demo_books = {
    # should be similar
    "Quality Management and Accounting in Service Industries": "20.500.12657/54327",
    "Management Accountants Business Orientation and Involvement in Incentive Compensation": "20.500.12657/26999",
    # should be similar but different from first group
    "Immersion Into Noise": "20.500.12657/33907",
    "Ambisonics": "20.500.12657/23095",
}
def test_functions():
    data = OapenAPI.get_collection_items_by_label(
        "Austrian Science Fund (FWF)", limit=100
    )

    # Uncomment to print raw text of first book
    # for item in data:
    #     print(item.get_text_bitstream())
    #     break

    df = Model.make_df(data)
    print(df.shape)
    print(df)

    sample_list = Model.get_text_by_handle(df, df.iloc[0].handle)
    print(sample_list[:10])

    sample_ngram_list = Model.generate_ngram_by_handle(df, df.iloc[0].handle, 3)
    print(Model.get_n_most_occuring(sample_ngram_list, 2))
def run_demo():
    items = []
    ngram_dict = {}

    print("---------------------------------")

    for name, handle in demo_books.items():
        item = OapenAPI.get_item(handle)
        items.append(item)

        text = Model.process_text(item.get_text())
        print(f" {name}: text array\n{text[:30]}...\n")

        ngram_dict[handle] = Model.generate_ngram(text, 3)
        print(
            f" {name}: ngram dictionary\n {list(ngram_dict[handle].items())[:30]}..."
        )

    print("---------------------------------")

    for name, handle in demo_books.items():
        print(f"Showing similarity scores for all books relative to {name}:\n")
        for name2, handle2 in demo_books.items():
            # if handle == handle2:  # don't check self
            #     continue
            simple_similarity_score = 100 * Model.get_similarity_score(
                ngram_dict[handle], ngram_dict[handle2], n=10000
            )
            print(
                f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
            )
            dict_similarity_score = 100 * Model.get_similarity_score_by_dict_count(
                ngram_dict[handle], ngram_dict[handle2]
            )
            print(
                f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
            )
        print()
def run_caching_test():
    items = []

    for name, handle in demo_books.items():
        item = OapenAPI.get_item(handle)
        items.append(item)

    Model.cache_ngrams_from_items(items)

def run_ngrams():
    # run_demo()
    run_caching_test()

def main():
    run_ngrams()
    OapenDB.get_all_ngrams()
    close_connection(connection)
    return

View File

@ -1,14 +1,17 @@
import string
from typing import List
import data.oapen as OapenAPI # pylint: disable=import-error
import data.oapen_db as OapenDB
import model.stopwords as oapen_stopwords # pylint: disable=import-error
import nltk # pylint: disable=import-error
import pandas as pd # pylint: disable=import-error
from nltk import word_tokenize # pylint: disable=import-error
from nltk.corpus import stopwords # pylint: disable=import-error
from .oapen_types import OapenItem # pylint: disable=relative-beyond-top-level
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
NgramDict,
OapenItem,
)
nltk.download("stopwords")
@ -46,33 +49,35 @@ def get_text_by_handle(df, handle):
return df.loc[df.handle == handle].text[0]
def sort_ngrams_by_count(ngrams: NgramDict):
    return sorted(ngrams.items(), key=lambda item: item[1], reverse=True)

def generate_ngram(text, n=3) -> NgramDict:
    ngrams = {}
    # store appearance count of each ngram
    for index in range(0, len(text) + 1 - n):
        ngram = " ".join(text[index : index + n])
        ngrams.setdefault(ngram, 0)  # sets curr ngram to 0 if non-existent
        ngrams[ngram] += 1
    return dict(sort_ngrams_by_count(ngrams))  # return sorted by count
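Worked example (not in the diff) of generate_ngram on a six-token text:

text = ["open", "access", "books", "are", "open", "access"]
generate_ngram(text, n=3)
# -> {"open access books": 1, "access books are": 1,
#     "books are open": 1, "are open access": 1}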
def generate_ngram_by_handle(df, handle, n=3) -> NgramDict:
    text = get_text_by_handle(df, handle)
    return generate_ngram(text, n)
def get_n_most_occuring(dic: NgramDict, n=100):
    sorted_dict = dict(
        sort_ngrams_by_count(dic)
    )  # sorts in case of additions post generate_ngram
    return list(sorted_dict)[:n]
# Currently, this uses the n most occuring ngrams to compare
# This could also count the instances in the highest
def get_similarity_score(ngram1: NgramDict, ngram2: NgramDict, n=100) -> float:
    n_most_occ_1 = get_n_most_occuring(ngram1, n)
    n_most_occ_2 = get_n_most_occuring(ngram2, n)

    repeated = 0
@ -86,7 +91,7 @@ def get_similarity_score(ngram1, ngram2, n=100):
# 100% similarity score if all ngrams match from book 1
# this means that a fragment of a book will get a 100% similarity score
# when compared to its own book, but not the reverse interaction
def get_similarity_score_by_dict_count(ngrams1: NgramDict, ngrams2: NgramDict) -> float:
    repeated = 0
    total = sum(ngrams1.values())  # gets counts from book 1

    for key, ngrams1_value in ngrams1.items():
@ -98,76 +103,16 @@ def get_similarity_score_by_dict_count(ngrams1, ngrams2):
return repeated / total
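Worked example (not in the diff), assuming the elided loop body accumulates the book-1 counts of every ngram also present in book 2:

a = {"open access": 3, "access books": 1}  # total = 4
b = {"open access": 5}
get_similarity_score_by_dict_count(a, b)  # repeated = 3, so 3 / 4 = 0.75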
# @params: handle = handle of item; ngrams = {str : int}
def cache_ngrams(handle: str, ngrams: NgramDict):
    OapenDB.add_single_ngrams((handle, list(sort_ngrams_by_count(ngrams))))

def cache_ngrams_from_items(items: List[OapenItem], n=3):
    rows = []

    for item in items:
        text = process_text(item.get_text())
        ngrams = generate_ngram(text, n)
        rows.append((item.handle, list(sort_ngrams_by_count(ngrams))))

    OapenDB.add_many_ngrams(rows)

View File

@ -1,5 +1,4 @@
from collections import namedtuple
from typing import List
from typing import Dict, List, Tuple
import data.oapen as OapenAPI
@ -18,8 +17,10 @@ class OapenItem:
return OapenAPI.get_bitstream_text(self.bitstreams)
OapenSuggestion = ("OapenSuggestion", ["handle", "rank"])
OapenNgram = ("OapenNgram", ["handle", "ngrams"])
OapenSuggestion = Tuple[str, float]
OapenNgram = Tuple[str, List[Tuple[str, int]]]
NgramDict = Dict[str, int]
def transform_item_data(item) -> OapenItem:
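Illustration of the new aliases (values made up):

row: OapenNgram = ("20.500.12657/23095", [("sound field", 21)])
hit: OapenSuggestion = ("20.500.12657/33907", 0.82)
counts: NgramDict = {"sound field": 21, "ambisonic order": 14}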

View File

@ -5,17 +5,35 @@ def create_schema(connection) -> None:
cursor = connection.cursor()
cursor.execute(
"""
CREATE TYPE suggestion AS (handle text, rank int);
CREATE SCHEMA oapen_suggestions
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
handle text PRIMARY KEY,
name text,
suggestions suggestion[]
);
CREATE TABLE oapen_suggestions.ngrams (
handle text PRIMARY KEY,
ngrams text[]
);
CREATE SCHEMA oapen_suggestions;
CREATE TYPE oapen_suggestions.suggestion AS (handle text, similarity float);
CREATE TYPE oapen_suggestions.ngram AS (ngram text, count int);
CREATE OR REPLACE FUNCTION update_modtime()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = now();
RETURN NEW;
END;
$$ language 'plpgsql';
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
handle text PRIMARY KEY,
name text,
suggestions oapen_suggestions.suggestion[],
created_at timestamp default current_timestamp,
updated_at timestamp default current_timestamp
);
CREATE TABLE oapen_suggestions.ngrams (
handle text PRIMARY KEY,
ngrams oapen_suggestions.ngram[],
created_at timestamp default current_timestamp,
updated_at timestamp default current_timestamp
);
CREATE TRIGGER update_suggestion_modtime BEFORE UPDATE ON oapen_suggestions.suggestions FOR EACH ROW EXECUTE PROCEDURE update_modtime();
CREATE TRIGGER update_ngrams_modtime BEFORE UPDATE ON oapen_suggestions.ngrams FOR EACH ROW EXECUTE PROCEDURE update_modtime();
"""
)
@ -27,9 +45,10 @@ def drop_schema(connection) -> None:
cursor.execute(
"""
DROP SCHEMA IF EXISTS oapen_suggestions CASCADE;
DROP TABLE IF EXISTS suggestions CASCADE;
DROP TABLE IF EXISTS ngrams CASCADE;
DROP TYPE IF EXISTS suggestion CASCADE;
DROP TABLE IF EXISTS oapen_suggestions.suggestions CASCADE;
DROP TABLE IF EXISTS oapen_suggestions.ngrams CASCADE;
DROP TYPE IF EXISTS oapen_suggestions.suggestion CASCADE;
DROP TYPE IF EXISTS oapen_suggestions.ngram CASCADE;
"""
)
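A quick sanity check, not part of this commit, that the update_modtime trigger advances updated_at (assumes the connection singleton from data.connection):

cur = connection.cursor()
cur.execute(
    "UPDATE oapen_suggestions.ngrams SET ngrams = ngrams WHERE handle = %s",
    ("20.500.12657/23095",),
)
cur.execute(
    "SELECT created_at, updated_at FROM oapen_suggestions.ngrams WHERE handle = %s",
    ("20.500.12657/23095",),
)
created_at, updated_at = cur.fetchone()
assert updated_at >= created_at  # bumped by the BEFORE UPDATE trigger
cur.close()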

View File

@ -1,7 +1,7 @@
from typing import List
import data.oapen as OapenAPI
from data.connection import get_connection
from data.connection import close_connection, connection
from data.oapen_db import add_many_suggestions
from model.oapen_types import OapenItem
@ -12,15 +12,13 @@ def mock_suggestion_rows(n=10):
)
rows = []
for i in range(min(10, len(items))):
for i in range(min(30, len(items))):
rows.append((items[i].handle, items[i].name, [(items[i].handle, i)]))
return rows
rows = mock_suggestion_rows(30)
add_many_suggestions(rows)

close_connection(connection)

View File

@ -1,6 +1,6 @@
from typing import List
import data.oapen as OapenAPI
import src.data.oapen as OapenAPI
from model.oapen_types import OapenItem

View File

@ -8,7 +8,7 @@ skip=[lib/]
profile=black
[flake8]
ignore = E203, E266, E501, W503, E501
ignore = E203, E266, E501, W503, E501, W291
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4