OAP 26 (#12)
We are disregarding the linting job failure, as it may be an environment issue; it will be fixed in subsequent PRs.pull/15/head^2
parent
09ec61b7d7
commit
033fc1e56e
|
@ -1,18 +0,0 @@
|
||||||
setup-env:
|
|
||||||
ifeq ($(OS),Windows_NT)
|
|
||||||
py -m pip install --upgrade pip
|
|
||||||
else
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
endif
|
|
||||||
pip install pipenv
|
|
||||||
pipenv install
|
|
||||||
pipenv shell
|
|
||||||
|
|
||||||
seed_db:
|
|
||||||
cd src && pipenv run python tasks/seed.py
|
|
||||||
|
|
||||||
clean_db:
|
|
||||||
cd src && pipenv run python tasks/clean.py
|
|
||||||
|
|
||||||
run:
|
|
||||||
cd src && pipenv run python main.py
|
|
|
@ -1,14 +1,17 @@
|
||||||
import string
|
import string
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import data.oapen as OapenAPI
|
import data.oapen as OapenAPI # pylint: disable=import-error
|
||||||
import lib.stopwords as oapen_stopwords
|
import model.stopwords as oapen_stopwords # pylint: disable=import-error
|
||||||
import nltk
|
import nltk # pylint: disable=import-error
|
||||||
import pandas as pd
|
import pandas as pd # pylint: disable=import-error
|
||||||
from nltk import word_tokenize
|
from nltk import word_tokenize # pylint: disable=import-error
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords # pylint: disable=import-error
|
||||||
|
|
||||||
from .oapen_types import OapenItem, transform_item_data
|
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
|
||||||
|
OapenItem,
|
||||||
|
transform_item_data,
|
||||||
|
)
|
||||||
|
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords")
|
||||||
|
|
||||||
|
@ -26,7 +29,9 @@ def process_text(text):
|
||||||
l_text = text.lower()
|
l_text = text.lower()
|
||||||
p_text = "".join([c for c in l_text if c not in string.punctuation])
|
p_text = "".join([c for c in l_text if c not in string.punctuation])
|
||||||
words = word_tokenize(p_text)
|
words = word_tokenize(p_text)
|
||||||
filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
|
filtered_words = list(
|
||||||
|
filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
|
||||||
|
) # added isalpha to check that it contains only letters
|
||||||
|
|
||||||
return filtered_words
|
return filtered_words
|
||||||
|
|
||||||
|
@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def run_ngrams():
|
def get_text_by_uuid(df, uuid):
|
||||||
|
return df.loc[df.uuid == uuid].text[0]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ngram(text, n):
|
||||||
|
ngrams = {}
|
||||||
|
# store appearance count of each trigram
|
||||||
|
for index in range(0, len(text) + 1 - n):
|
||||||
|
ngram = " ".join(text[index: index + n])
|
||||||
|
ngrams.setdefault(ngram, 0) # sets curr ngram to 0 if non-existant
|
||||||
|
ngrams[ngram] += 1
|
||||||
|
return dict(
|
||||||
|
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
) # return sorted by count
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ngram_by_uuid(df, uuid, n):
|
||||||
|
text = get_text_by_uuid(df, uuid)
|
||||||
|
return generate_ngram(text, n)
|
||||||
|
|
||||||
|
|
||||||
|
def get_n_most_occuring(dic: dict, n=100):
|
||||||
|
sorted_dict = dict(
|
||||||
|
sorted(dic.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
) # sorts in case of additionas post generate_ngram
|
||||||
|
return list(sorted_dict)[:n]
|
||||||
|
|
||||||
|
|
||||||
|
# Currently, this uses the n most occuring ngrams to compare
|
||||||
|
# This could also count the instances in the highest
|
||||||
|
def get_similarity_score(ngram1, ngram2, n=100):
|
||||||
|
n_most_occ_1 = get_n_most_occuring(ngram1, n)
|
||||||
|
n_most_occ_2 = get_n_most_occuring(ngram2, n)
|
||||||
|
repeated = 0
|
||||||
|
for n_gram in n_most_occ_1:
|
||||||
|
if n_gram in n_most_occ_2:
|
||||||
|
repeated += 1
|
||||||
|
return repeated / n
|
||||||
|
|
||||||
|
|
||||||
|
# this treats ngrams1 as primary ngrams, since we want a
|
||||||
|
# 100% similarity score if all ngrams match from book 1
|
||||||
|
# this means that a fragment of a book will get a 100% similarity score
|
||||||
|
# when compared to it's own book, but not the reverse interaction
|
||||||
|
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
|
||||||
|
repeated = 0
|
||||||
|
total = sum(ngrams1.values()) # gets counts from book 1
|
||||||
|
for key, ngrams1_value in ngrams1.items():
|
||||||
|
repeated += min(
|
||||||
|
ngrams1_value, ngrams2.get(key, 0)
|
||||||
|
) # adds min value, or 0 by default if key not found
|
||||||
|
# if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
|
||||||
|
# print(key)
|
||||||
|
return repeated / total
|
||||||
|
|
||||||
|
|
||||||
|
# to demo some functions
|
||||||
|
def test_functions():
|
||||||
data = get_data()
|
data = get_data()
|
||||||
|
# Uncomment to print raw text of first book
|
||||||
|
# for item in data:
|
||||||
|
# print(item.get_text_bitstream())
|
||||||
|
# break
|
||||||
df = make_df(data)
|
df = make_df(data)
|
||||||
print(df.shape)
|
print(df.shape)
|
||||||
print(df[:10])
|
print(df)
|
||||||
|
sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
|
||||||
|
print(sample_list[:10])
|
||||||
|
sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
|
||||||
|
print(get_n_most_occuring(sample_ngram_list, 2))
|
||||||
|
|
||||||
|
|
||||||
|
# run demo with the above titles
|
||||||
|
def run_demo():
|
||||||
|
demo_books = {
|
||||||
|
# should be similar
|
||||||
|
"Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
|
||||||
|
"Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
|
||||||
|
# should be similar but different from first group
|
||||||
|
"Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
|
||||||
|
# "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
|
||||||
|
}
|
||||||
|
|
||||||
|
items = []
|
||||||
|
ngram_dict = {}
|
||||||
|
|
||||||
|
print("---------------------------------")
|
||||||
|
|
||||||
|
for name, uuid in demo_books.items():
|
||||||
|
book_item = OapenAPI.get_item(uuid)
|
||||||
|
print(book_item)
|
||||||
|
|
||||||
|
item = transform_item_data(book_item)
|
||||||
|
items.append(item)
|
||||||
|
|
||||||
|
text = process_text(item.get_text_bitstream())
|
||||||
|
print(f" {name}: text array\n{text[:30]}...\n")
|
||||||
|
|
||||||
|
ngram_dict[uuid] = generate_ngram(text, 3)
|
||||||
|
print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")
|
||||||
|
|
||||||
|
print("---------------------------------")
|
||||||
|
|
||||||
|
for name, uuid in demo_books.items():
|
||||||
|
print(f"Showing similarity scores for all books relative to {name}:\n")
|
||||||
|
for name2, uuid2 in demo_books.items():
|
||||||
|
if uuid == uuid2: # dont check self
|
||||||
|
continue
|
||||||
|
|
||||||
|
simple_similarity_score = 100 * get_similarity_score(ngram_dict[uuid], ngram_dict[uuid2], n=10000)
|
||||||
|
print(
|
||||||
|
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
|
||||||
|
)
|
||||||
|
|
||||||
|
dict_similarity_score = 100 * get_similarity_score_by_dict_count(ngram_dict[uuid], ngram_dict[uuid2])
|
||||||
|
print(
|
||||||
|
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
|
||||||
|
)
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
def run_ngrams():
|
||||||
|
run_demo()
|
||||||
|
|
Loading…
Reference in New Issue