[Draft] OAP-32 Ngram Caching (#18)
* start caching ngrams * fix build warnings * add timestamp * resolve comments * pull out mogrify * remove pytest from hook for nowpull/20/head
parent
ccbdda287e
commit
4333d4fcc3
2
.flake8
2
.flake8
|
@ -2,4 +2,4 @@
|
|||
max-line-length = 88
|
||||
max-complexity = 18
|
||||
select = B,C,E,F,W,T4,B9
|
||||
ignore = E203, E266, E501, W503, F403, F401
|
||||
ignore = E203, E266, E501, W503, F403, F401, W291
|
|
@ -23,6 +23,5 @@ jobs:
|
|||
pipenv run isort --profile black src/
|
||||
pipenv run black --check src/ --exclude="lib/*"
|
||||
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
|
||||
pipenv run pytest
|
||||
|
||||
|
|
@ -20,6 +20,8 @@ def get_connection():
|
|||
db_version = cur.fetchone()
|
||||
print(db_version)
|
||||
|
||||
# TODO: Register adapters for suggestion and ngram types
|
||||
|
||||
cur.close()
|
||||
|
||||
except (Exception, psycopg2.DatabaseError) as error:
|
||||
|
@ -33,3 +35,6 @@ def close_connection(conn):
|
|||
if conn is not None:
|
||||
conn.close()
|
||||
print("Database connection closed.")
|
||||
|
||||
|
||||
connection = get_connection()
|
||||
|
|
|
@ -1,54 +1,140 @@
|
|||
from logging import Logger
|
||||
from typing import List
|
||||
|
||||
from model.oapen_types import OapenSuggestion
|
||||
import psycopg2
|
||||
from data.connection import connection
|
||||
from model.oapen_types import OapenNgram
|
||||
|
||||
|
||||
def table_exists(connection, table):
|
||||
cursor = connection.cursor
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT EXISTS (
|
||||
SELECT * FROM oapen_suggestions.tables WHERE table_name=%s
|
||||
)
|
||||
""",
|
||||
(table),
|
||||
)
|
||||
|
||||
res = cursor.fetchone()[0]
|
||||
cursor.close()
|
||||
return res
|
||||
|
||||
|
||||
def add_suggestion(connection, suggestion: OapenSuggestion) -> None:
|
||||
def mogrify_ngrams(ngrams) -> str:
|
||||
cursor = connection.cursor()
|
||||
args = ",".join(
|
||||
cursor.mogrify("(%s,%s::oapen_suggestions.ngram[])", x).decode("utf-8")
|
||||
for x in ngrams
|
||||
)
|
||||
return args
|
||||
|
||||
|
||||
def mogrify_suggestions(suggestions):
|
||||
cursor = connection.cursor()
|
||||
args = ",".join(
|
||||
cursor.mogrify("(%s,%s,%s::oapen_suggestions.suggestion[])", x).decode("utf-8")
|
||||
for x in suggestions
|
||||
)
|
||||
return args
|
||||
|
||||
|
||||
def table_exists(table):
|
||||
cursor = connection.cursor
|
||||
query = """
|
||||
INSERT INTO oapen_suggestions.suggestions VALUES (%s, %s, %s)
|
||||
"""
|
||||
SELECT EXISTS (
|
||||
SELECT * FROM oapen_suggestions.tables WHERE table_name=%s
|
||||
)
|
||||
"""
|
||||
|
||||
try:
|
||||
cursor.execute(query, suggestion)
|
||||
except Exception as ex:
|
||||
Logger.exception(ex)
|
||||
cursor.execute(query, (table))
|
||||
res = cursor.fetchone()[0]
|
||||
return res
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
|
||||
def add_many_suggestions(connection, suggestions) -> None:
|
||||
def add_single_suggestion(suggestion) -> None:
|
||||
cursor = connection.cursor()
|
||||
query = """
|
||||
INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
|
||||
VALUES (%s, %s, %s::oapen_suggestions.suggestion[])
|
||||
ON CONFLICT (handle)
|
||||
DO
|
||||
UPDATE SET suggestions = excluded.suggestions
|
||||
"""
|
||||
|
||||
args_str = ",".join(
|
||||
cursor.mogrify("(%s,%s,%s::suggestion[])", x).decode("utf-8")
|
||||
for x in suggestions
|
||||
)
|
||||
try:
|
||||
cursor.execute(query, (suggestion[0], suggestion[1], suggestion[2]))
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
|
||||
def add_many_suggestions(suggestions) -> None:
|
||||
cursor = connection.cursor()
|
||||
args = mogrify_suggestions(suggestions)
|
||||
query = f"""
|
||||
INSERT INTO oapen_suggestions.suggestions VALUES {args_str}
|
||||
"""
|
||||
INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
|
||||
VALUES {args}
|
||||
ON CONFLICT (handle)
|
||||
DO
|
||||
UPDATE SET suggestions = excluded.suggestions
|
||||
"""
|
||||
|
||||
try:
|
||||
cursor.execute(query)
|
||||
except Exception as ex:
|
||||
Logger.exception(ex)
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
|
||||
def add_single_ngrams(ngram) -> None:
|
||||
cursor = connection.cursor()
|
||||
query = """
|
||||
INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
|
||||
VALUES (%s, %s::oapen_suggestions.ngram[])
|
||||
ON CONFLICT (handle)
|
||||
DO
|
||||
UPDATE SET ngrams = excluded.ngrams
|
||||
"""
|
||||
|
||||
try:
|
||||
cursor.execute(query, ngram[0], ngram[1])
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
|
||||
def add_many_ngrams(ngrams) -> None:
|
||||
cursor = connection.cursor()
|
||||
args = mogrify_ngrams(ngrams)
|
||||
query = f"""
|
||||
INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
|
||||
VALUES {args}
|
||||
ON CONFLICT (handle)
|
||||
DO
|
||||
UPDATE SET ngrams = excluded.ngrams
|
||||
"""
|
||||
|
||||
try:
|
||||
cursor.execute(query)
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
|
||||
def get_all_ngrams() -> List[OapenNgram]:
|
||||
cursor = connection.cursor()
|
||||
query = """
|
||||
SELECT * FROM oapen_suggestions.ngrams
|
||||
"""
|
||||
|
||||
try:
|
||||
|
||||
ngrams: List[OapenNgram] = []
|
||||
cursor.execute(query)
|
||||
records = cursor.fetchall()
|
||||
|
||||
for i in range(1):
|
||||
# print(records[i])
|
||||
print(type(records[i][0]))
|
||||
print(type(records[i][1]))
|
||||
|
||||
return ngrams
|
||||
|
||||
except (Exception, psycopg2.Error) as error:
|
||||
print(error)
|
||||
finally:
|
||||
cursor.close()
|
||||
|
|
|
@ -1,8 +1,99 @@
|
|||
from model.ngrams import run_ngrams
|
||||
import data.oapen as OapenAPI
|
||||
import data.oapen_db as OapenDB
|
||||
import model.ngrams as Model
|
||||
from data.connection import close_connection, connection
|
||||
|
||||
demo_books = {
|
||||
# should be similar
|
||||
"Quality Management and Accounting in Service Industries": "20.500.12657/54327",
|
||||
"Management Accountants’ Business Orientation and Involvement in Incentive Compensation": "20.500.12657/26999",
|
||||
# should be similar but different from first group
|
||||
"Immersion Into Noise": "20.500.12657/33907",
|
||||
"Ambisonics": "20.500.12657/23095",
|
||||
}
|
||||
|
||||
|
||||
def test_functions():
|
||||
data = OapenAPI.get_collection_items_by_label(
|
||||
"Austrian Science Fund (FWF)", limit=100
|
||||
)
|
||||
# Uncomment to print raw text of first book
|
||||
# for item in data:
|
||||
# print(item.get_text_bitstream())
|
||||
# break
|
||||
df = Model.make_df(data)
|
||||
print(df.shape)
|
||||
print(df)
|
||||
sample_list = Model.get_text_by_handle(df, df.iloc[0].handle)
|
||||
print(sample_list[:10])
|
||||
sample_ngram_list = Model.generate_ngram_by_handle(df, df.iloc[0].handle, 3)
|
||||
print(Model.get_n_most_occuring(sample_ngram_list, 2))
|
||||
|
||||
|
||||
def run_demo():
|
||||
items = []
|
||||
ngram_dict = {}
|
||||
|
||||
print("---------------------------------")
|
||||
|
||||
for name, handle in demo_books.items():
|
||||
item = OapenAPI.get_item(handle)
|
||||
|
||||
items.append(item)
|
||||
|
||||
text = Model.process_text(item.get_text())
|
||||
print(f" {name}: text array\n{text[:30]}...\n")
|
||||
|
||||
ngram_dict[handle] = Model.generate_ngram(text, 3)
|
||||
print(
|
||||
f" {name}: ngram dictionary\n {list(ngram_dict[handle].items())[:30]}..."
|
||||
)
|
||||
|
||||
print("---------------------------------")
|
||||
|
||||
for name, handle in demo_books.items():
|
||||
print(f"Showing similarity scores for all books relative to {name}:\n")
|
||||
for name2, handle2 in demo_books.items():
|
||||
# if handle == handle2: # dont check self
|
||||
# continue
|
||||
|
||||
simple_similarity_score = 100 * Model.get_similarity_score(
|
||||
ngram_dict[handle], ngram_dict[handle2], n=10000
|
||||
)
|
||||
print(
|
||||
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
|
||||
)
|
||||
|
||||
dict_similarity_score = 100 * Model.get_similarity_score_by_dict_count(
|
||||
ngram_dict[handle], ngram_dict[handle2]
|
||||
)
|
||||
print(
|
||||
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
|
||||
)
|
||||
print()
|
||||
|
||||
|
||||
def run_caching_test():
|
||||
|
||||
items = []
|
||||
|
||||
for name, handle in demo_books.items():
|
||||
item = OapenAPI.get_item(handle)
|
||||
items.append(item)
|
||||
|
||||
Model.cache_ngrams_from_items(items)
|
||||
|
||||
|
||||
def run_ngrams():
|
||||
# run_demo()
|
||||
run_caching_test()
|
||||
|
||||
|
||||
def main():
|
||||
run_ngrams()
|
||||
|
||||
OapenDB.get_all_ngrams()
|
||||
close_connection(connection)
|
||||
return
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
import string
|
||||
from typing import List
|
||||
|
||||
import data.oapen as OapenAPI # pylint: disable=import-error
|
||||
import data.oapen_db as OapenDB
|
||||
import model.stopwords as oapen_stopwords # pylint: disable=import-error
|
||||
import nltk # pylint: disable=import-error
|
||||
import pandas as pd # pylint: disable=import-error
|
||||
from nltk import word_tokenize # pylint: disable=import-error
|
||||
from nltk.corpus import stopwords # pylint: disable=import-error
|
||||
|
||||
from .oapen_types import OapenItem # pylint: disable=relative-beyond-top-level
|
||||
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
|
||||
NgramDict,
|
||||
OapenItem,
|
||||
)
|
||||
|
||||
nltk.download("stopwords")
|
||||
|
||||
|
@ -46,33 +49,35 @@ def get_text_by_handle(df, handle):
|
|||
return df.loc[df.handle == handle].text[0]
|
||||
|
||||
|
||||
def generate_ngram(text, n):
|
||||
def sort_ngrams_by_count(ngrams: NgramDict):
|
||||
return sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
||||
|
||||
|
||||
def generate_ngram(text, n=3) -> NgramDict:
|
||||
ngrams = {}
|
||||
# store appearance count of each trigram
|
||||
for index in range(0, len(text) + 1 - n):
|
||||
ngram = " ".join(text[index : index + n])
|
||||
ngrams.setdefault(ngram, 0) # sets curr ngram to 0 if non-existant
|
||||
ngrams[ngram] += 1
|
||||
return dict(
|
||||
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
||||
) # return sorted by count
|
||||
return dict(sort_ngrams_by_count(ngrams)) # return sorted by count
|
||||
|
||||
|
||||
def generate_ngram_by_handle(df, handle, n):
|
||||
def generate_ngram_by_handle(df, handle, n=3) -> NgramDict:
|
||||
text = get_text_by_handle(df, handle)
|
||||
return generate_ngram(text, n)
|
||||
|
||||
|
||||
def get_n_most_occuring(dic: dict, n=100):
|
||||
def get_n_most_occuring(dic: NgramDict, n=100):
|
||||
sorted_dict = dict(
|
||||
sorted(dic.items(), key=lambda item: item[1], reverse=True)
|
||||
sort_ngrams_by_count(dic)
|
||||
) # sorts in case of additionas post generate_ngram
|
||||
return list(sorted_dict)[:n]
|
||||
|
||||
|
||||
# Currently, this uses the n most occuring ngrams to compare
|
||||
# This could also count the instances in the highest
|
||||
def get_similarity_score(ngram1, ngram2, n=100):
|
||||
def get_similarity_score(ngram1: NgramDict, ngram2: NgramDict, n=100) -> float:
|
||||
n_most_occ_1 = get_n_most_occuring(ngram1, n)
|
||||
n_most_occ_2 = get_n_most_occuring(ngram2, n)
|
||||
repeated = 0
|
||||
|
@ -86,7 +91,7 @@ def get_similarity_score(ngram1, ngram2, n=100):
|
|||
# 100% similarity score if all ngrams match from book 1
|
||||
# this means that a fragment of a book will get a 100% similarity score
|
||||
# when compared to it's own book, but not the reverse interaction
|
||||
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
|
||||
def get_similarity_score_by_dict_count(ngrams1: NgramDict, ngrams2: NgramDict) -> float:
|
||||
repeated = 0
|
||||
total = sum(ngrams1.values()) # gets counts from book 1
|
||||
for key, ngrams1_value in ngrams1.items():
|
||||
|
@ -98,76 +103,16 @@ def get_similarity_score_by_dict_count(ngrams1, ngrams2):
|
|||
return repeated / total
|
||||
|
||||
|
||||
# to demo some functions
|
||||
def test_functions():
|
||||
data = OapenAPI.get_collection_items_by_label(
|
||||
"Austrian Science Fund (FWF)", limit=100
|
||||
)
|
||||
# Uncomment to print raw text of first book
|
||||
# for item in data:
|
||||
# print(item.get_text_bitstream())
|
||||
# break
|
||||
df = make_df(data)
|
||||
print(df.shape)
|
||||
print(df)
|
||||
sample_list = get_text_by_handle(df, df.iloc[0].handle)
|
||||
print(sample_list[:10])
|
||||
sample_ngram_list = generate_ngram_by_handle(df, df.iloc[0].handle, 3)
|
||||
print(get_n_most_occuring(sample_ngram_list, 2))
|
||||
# @params: handle = handle of item; ngrams = {str : int}
|
||||
def cache_ngrams(handle: str, ngrams: NgramDict):
|
||||
OapenDB.add_single_ngrams((handle, list(sort_ngrams_by_count(ngrams))))
|
||||
|
||||
|
||||
# run demo with the above titles
|
||||
def run_demo():
|
||||
demo_books = {
|
||||
# should be similar
|
||||
"Quality Management and Accounting in Service Industries": "20.500.12657/54327",
|
||||
"Management Accountants’ Business Orientation and Involvement in Incentive Compensation": "20.500.12657/26999",
|
||||
# should be similar but different from first group
|
||||
"Immersion Into Noise": "20.500.12657/33907",
|
||||
"Ambisonics": "20.500.12657/23095",
|
||||
}
|
||||
|
||||
items = []
|
||||
ngram_dict = {}
|
||||
|
||||
print("---------------------------------")
|
||||
|
||||
for name, handle in demo_books.items():
|
||||
item = OapenAPI.get_item(handle)
|
||||
|
||||
items.append(item)
|
||||
|
||||
def cache_ngrams_from_items(items: List[OapenItem], n=3):
|
||||
rows = []
|
||||
for item in items:
|
||||
text = process_text(item.get_text())
|
||||
print(f" {name}: text array\n{text[:30]}...\n")
|
||||
ngrams = generate_ngram(text, n)
|
||||
rows.append((item.handle, list(sort_ngrams_by_count(ngrams))))
|
||||
|
||||
ngram_dict[handle] = generate_ngram(text, 3)
|
||||
print(
|
||||
f" {name}: ngram dictionary\n {list(ngram_dict[handle].items())[:30]}..."
|
||||
)
|
||||
|
||||
print("---------------------------------")
|
||||
|
||||
for name, handle in demo_books.items():
|
||||
print(f"Showing similarity scores for all books relative to {name}:\n")
|
||||
for name2, handle2 in demo_books.items():
|
||||
# if handle == handle2: # dont check self
|
||||
# continue
|
||||
|
||||
simple_similarity_score = 100 * get_similarity_score(
|
||||
ngram_dict[handle], ngram_dict[handle2], n=10000
|
||||
)
|
||||
print(
|
||||
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
|
||||
)
|
||||
|
||||
dict_similarity_score = 100 * get_similarity_score_by_dict_count(
|
||||
ngram_dict[handle], ngram_dict[handle2]
|
||||
)
|
||||
print(
|
||||
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
|
||||
)
|
||||
print()
|
||||
|
||||
|
||||
def run_ngrams():
|
||||
run_demo()
|
||||
OapenDB.add_many_ngrams(rows)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from collections import namedtuple
|
||||
from typing import List
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import data.oapen as OapenAPI
|
||||
|
||||
|
@ -18,8 +17,10 @@ class OapenItem:
|
|||
return OapenAPI.get_bitstream_text(self.bitstreams)
|
||||
|
||||
|
||||
OapenSuggestion = ("OapenSuggestion", ["handle", "rank"])
|
||||
OapenNgram = ("OapenNgram", ["handle", "ngrams"])
|
||||
OapenSuggestion = Tuple[str, float]
|
||||
OapenNgram = Tuple[str, List[Tuple[str, int]]]
|
||||
|
||||
NgramDict = Dict[str, int]
|
||||
|
||||
|
||||
def transform_item_data(item) -> OapenItem:
|
||||
|
|
|
@ -5,17 +5,35 @@ def create_schema(connection) -> None:
|
|||
cursor = connection.cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TYPE suggestion AS (handle text, rank int);
|
||||
CREATE SCHEMA oapen_suggestions
|
||||
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
|
||||
handle text PRIMARY KEY,
|
||||
name text,
|
||||
suggestions suggestion[]
|
||||
);
|
||||
CREATE TABLE oapen_suggestions.ngrams (
|
||||
handle text PRIMARY KEY,
|
||||
ngrams text[]
|
||||
);
|
||||
CREATE SCHEMA oapen_suggestions;
|
||||
|
||||
CREATE TYPE oapen_suggestions.suggestion AS (handle text, similarity float);
|
||||
CREATE TYPE oapen_suggestions.ngram AS (ngram text, count int);
|
||||
|
||||
CREATE OR REPLACE FUNCTION update_modtime()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
NEW.updated_at = now();
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ language 'plpgsql';
|
||||
|
||||
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
|
||||
handle text PRIMARY KEY,
|
||||
name text,
|
||||
suggestions oapen_suggestions.suggestion[],
|
||||
created_at timestamp default current_timestamp,
|
||||
updated_at timestamp default current_timestamp
|
||||
);
|
||||
CREATE TABLE oapen_suggestions.ngrams (
|
||||
handle text PRIMARY KEY,
|
||||
ngrams oapen_suggestions.ngram[],
|
||||
created_at timestamp default current_timestamp,
|
||||
updated_at timestamp default current_timestamp
|
||||
);
|
||||
|
||||
CREATE TRIGGER update_suggestion_modtime BEFORE UPDATE ON oapen_suggestions.suggestions FOR EACH ROW EXECUTE PROCEDURE update_modtime();
|
||||
CREATE TRIGGER update_ngrams_modtime BEFORE UPDATE ON oapen_suggestions.ngrams FOR EACH ROW EXECUTE PROCEDURE update_modtime();
|
||||
"""
|
||||
)
|
||||
|
||||
|
@ -27,9 +45,10 @@ def drop_schema(connection) -> None:
|
|||
cursor.execute(
|
||||
"""
|
||||
DROP SCHEMA IF EXISTS oapen_suggestions CASCADE;
|
||||
DROP TABLE IF EXISTS suggestions CASCADE;
|
||||
DROP TABLE IF EXISTS ngrams CASCADE;
|
||||
DROP TYPE IF EXISTS suggestion CASCADE;
|
||||
DROP TABLE IF EXISTS oapen_suggestions.suggestions CASCADE;
|
||||
DROP TABLE IF EXISTS oapen_suggestions.ngrams CASCADE;
|
||||
DROP TYPE IF EXISTS oapen_suggestions.suggestion CASCADE;
|
||||
DROP TYPE IF EXISTS oapen_suggestions.ngram CASCADE;
|
||||
"""
|
||||
)
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import List
|
||||
|
||||
import data.oapen as OapenAPI
|
||||
from data.connection import get_connection
|
||||
from data.connection import close_connection, connection
|
||||
from data.oapen_db import add_many_suggestions
|
||||
from model.oapen_types import OapenItem
|
||||
|
||||
|
@ -12,15 +12,13 @@ def mock_suggestion_rows(n=10):
|
|||
)
|
||||
|
||||
rows = []
|
||||
for i in range(min(10, len(items))):
|
||||
for i in range(min(30, len(items))):
|
||||
rows.append((items[i].handle, items[i].name, [(items[i].handle, i)]))
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
connection = get_connection()
|
||||
rows = mock_suggestion_rows(30)
|
||||
add_many_suggestions(rows)
|
||||
|
||||
rows = mock_suggestion_rows(connection)
|
||||
add_many_suggestions(connection, rows)
|
||||
|
||||
connection.close()
|
||||
close_connection(connection)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from typing import List
|
||||
|
||||
import data.oapen as OapenAPI
|
||||
import src.data.oapen as OapenAPI
|
||||
from model.oapen_types import OapenItem
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue