OAP 26 (#12)
We are disregarding the linting job failure, as it may be an environment issue; it will be fixed in subsequent PRs.pull/15/head^2
parent
09ec61b7d7
commit
033fc1e56e
|
@ -1,18 +0,0 @@
|
||||||
setup-env:
|
|
||||||
ifeq ($(OS),Windows_NT)
|
|
||||||
py -m pip install --upgrade pip
|
|
||||||
else
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
endif
|
|
||||||
pip install pipenv
|
|
||||||
pipenv install
|
|
||||||
pipenv shell
|
|
||||||
|
|
||||||
seed_db:
|
|
||||||
cd src && pipenv run python tasks/seed.py
|
|
||||||
|
|
||||||
clean_db:
|
|
||||||
cd src && pipenv run python tasks/clean.py
|
|
||||||
|
|
||||||
run:
|
|
||||||
cd src && pipenv run python main.py
|
|
|
@ -1,14 +1,17 @@
|
||||||
import string
|
import string
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import data.oapen as OapenAPI
|
import data.oapen as OapenAPI # pylint: disable=import-error
|
||||||
import lib.stopwords as oapen_stopwords
|
import model.stopwords as oapen_stopwords # pylint: disable=import-error
|
||||||
import nltk
|
import nltk # pylint: disable=import-error
|
||||||
import pandas as pd
|
import pandas as pd # pylint: disable=import-error
|
||||||
from nltk import word_tokenize
|
from nltk import word_tokenize # pylint: disable=import-error
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords # pylint: disable=import-error
|
||||||
|
|
||||||
from .oapen_types import OapenItem, transform_item_data
|
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
|
||||||
|
OapenItem,
|
||||||
|
transform_item_data,
|
||||||
|
)
|
||||||
|
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords")
|
||||||
|
|
||||||
|
@ -26,7 +29,9 @@ def process_text(text):
|
||||||
l_text = text.lower()
|
l_text = text.lower()
|
||||||
p_text = "".join([c for c in l_text if c not in string.punctuation])
|
p_text = "".join([c for c in l_text if c not in string.punctuation])
|
||||||
words = word_tokenize(p_text)
|
words = word_tokenize(p_text)
|
||||||
filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
|
filtered_words = list(
|
||||||
|
filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
|
||||||
|
) # added isalpha to check that it contains only letters
|
||||||
|
|
||||||
return filtered_words
|
return filtered_words
|
||||||
|
|
||||||
|
@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def run_ngrams():
|
def get_text_by_uuid(df, uuid):
|
||||||
|
return df.loc[df.uuid == uuid].text[0]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ngram(text, n):
|
||||||
|
ngrams = {}
|
||||||
|
# store appearance count of each trigram
|
||||||
|
for index in range(0, len(text) + 1 - n):
|
||||||
|
ngram = " ".join(text[index: index + n])
|
||||||
|
ngrams.setdefault(ngram, 0) # sets curr ngram to 0 if non-existant
|
||||||
|
ngrams[ngram] += 1
|
||||||
|
return dict(
|
||||||
|
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
) # return sorted by count
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ngram_by_uuid(df, uuid, n):
|
||||||
|
text = get_text_by_uuid(df, uuid)
|
||||||
|
return generate_ngram(text, n)
|
||||||
|
|
||||||
|
|
||||||
|
def get_n_most_occuring(dic: dict, n=100):
|
||||||
|
sorted_dict = dict(
|
||||||
|
sorted(dic.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
) # sorts in case of additionas post generate_ngram
|
||||||
|
return list(sorted_dict)[:n]
|
||||||
|
|
||||||
|
|
||||||
|
# Currently, this uses the n most occuring ngrams to compare
|
||||||
|
# This could also count the instances in the highest
|
||||||
|
def get_similarity_score(ngram1, ngram2, n=100):
|
||||||
|
n_most_occ_1 = get_n_most_occuring(ngram1, n)
|
||||||
|
n_most_occ_2 = get_n_most_occuring(ngram2, n)
|
||||||
|
repeated = 0
|
||||||
|
for n_gram in n_most_occ_1:
|
||||||
|
if n_gram in n_most_occ_2:
|
||||||
|
repeated += 1
|
||||||
|
return repeated / n
|
||||||
|
|
||||||
|
|
||||||
|
# this treats ngrams1 as primary ngrams, since we want a
|
||||||
|
# 100% similarity score if all ngrams match from book 1
|
||||||
|
# this means that a fragment of a book will get a 100% similarity score
|
||||||
|
# when compared to it's own book, but not the reverse interaction
|
||||||
|
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
|
||||||
|
repeated = 0
|
||||||
|
total = sum(ngrams1.values()) # gets counts from book 1
|
||||||
|
for key, ngrams1_value in ngrams1.items():
|
||||||
|
repeated += min(
|
||||||
|
ngrams1_value, ngrams2.get(key, 0)
|
||||||
|
) # adds min value, or 0 by default if key not found
|
||||||
|
# if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
|
||||||
|
# print(key)
|
||||||
|
return repeated / total
|
||||||
|
|
||||||
|
|
||||||
|
# to demo some functions
|
||||||
|
def test_functions():
|
||||||
data = get_data()
|
data = get_data()
|
||||||
|
# Uncomment to print raw text of first book
|
||||||
|
# for item in data:
|
||||||
|
# print(item.get_text_bitstream())
|
||||||
|
# break
|
||||||
df = make_df(data)
|
df = make_df(data)
|
||||||
print(df.shape)
|
print(df.shape)
|
||||||
print(df[:10])
|
print(df)
|
||||||
|
sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
|
||||||
|
print(sample_list[:10])
|
||||||
|
sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
|
||||||
|
print(get_n_most_occuring(sample_ngram_list, 2))
|
||||||
|
|
||||||
|
|
||||||
|
# run demo with the above titles
|
||||||
|
def run_demo():
|
||||||
|
demo_books = {
|
||||||
|
# should be similar
|
||||||
|
"Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
|
||||||
|
"Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
|
||||||
|
# should be similar but different from first group
|
||||||
|
"Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
|
||||||
|
# "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
|
||||||
|
}
|
||||||
|
|
||||||
|
items = []
|
||||||
|
ngram_dict = {}
|
||||||
|
|
||||||
|
print("---------------------------------")
|
||||||
|
|
||||||
|
for name, uuid in demo_books.items():
|
||||||
|
book_item = OapenAPI.get_item(uuid)
|
||||||
|
print(book_item)
|
||||||
|
|
||||||
|
item = transform_item_data(book_item)
|
||||||
|
items.append(item)
|
||||||
|
|
||||||
|
text = process_text(item.get_text_bitstream())
|
||||||
|
print(f" {name}: text array\n{text[:30]}...\n")
|
||||||
|
|
||||||
|
ngram_dict[uuid] = generate_ngram(text, 3)
|
||||||
|
print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")
|
||||||
|
|
||||||
|
print("---------------------------------")
|
||||||
|
|
||||||
|
for name, uuid in demo_books.items():
|
||||||
|
print(f"Showing similarity scores for all books relative to {name}:\n")
|
||||||
|
for name2, uuid2 in demo_books.items():
|
||||||
|
if uuid == uuid2: # dont check self
|
||||||
|
continue
|
||||||
|
|
||||||
|
simple_similarity_score = 100 * get_similarity_score(ngram_dict[uuid], ngram_dict[uuid2], n=10000)
|
||||||
|
print(
|
||||||
|
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
|
||||||
|
)
|
||||||
|
|
||||||
|
dict_similarity_score = 100 * get_similarity_score_by_dict_count(ngram_dict[uuid], ngram_dict[uuid2])
|
||||||
|
print(
|
||||||
|
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
|
||||||
|
)
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
def run_ngrams():
|
||||||
|
run_demo()
|
||||||
|
|
Loading…
Reference in New Issue