Celina Peralta 2022-10-23 19:52:52 -04:00
commit ee45695fb6
9 changed files with 186 additions and 64 deletions

5
.flake8 Normal file
View File

@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9
ignore = E203, E266, E501, W503, F403, F401

View File

@@ -22,7 +22,7 @@ jobs:
pipenv install --deploy --dev
pipenv run isort --profile black src/
pipenv run black --check src/ --exclude="lib/*"
pipenv run flake8 src/ --ignore="lib/* W"
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
pipenv run pytest

9
.isort.cfg Normal file
View File

@@ -0,0 +1,9 @@
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/, bin/]
profile=black
known_third_party = data,lib,model,nltk,pandas,psycopg2,requests

View File

@@ -1,18 +1,14 @@
repos:
- repo: https://github.com/pycqa/isort
rev: 5.10.1
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
hooks:
- id: isort
args: [--profile, black, --filter-files, oapen-engine/src]
name: isort (python)
- repo: https://github.com/psf/black
rev: stable
hooks:
- id: black
language_version: python3.10
args: [oapen-engine/src]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
hooks:
- id: flake8
args: [oapen-engine/src]
- id: isort
- repo: https://github.com/ambv/black
rev: 22.10.0
hooks:
- id: black
language_version: python3.10
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: flake8

View File

@@ -161,4 +161,7 @@ cython_debug/
database.ini
lib/python3.7/
lib/python3.10/
lib/python3.10/
bin/
.pytest_cache
lib/

View File

@@ -1,18 +0,0 @@
setup-env:
ifeq ($(OS),Windows_NT)
py -m pip install --upgrade pip
else
python -m pip install --upgrade pip
endif
pip install pipenv
pipenv install
pipenv shell
seed_db:
cd src && pipenv run python tasks/seed.py
clean_db:
cd src && pipenv run python tasks/clean.py
run:
cd src && pipenv run python main.py

View File

@@ -1,14 +1,17 @@
import string
from typing import List
import data.oapen as OapenAPI
import lib.stopwords as oapen_stopwords
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import data.oapen as OapenAPI # pylint: disable=import-error
import model.stopwords as oapen_stopwords # pylint: disable=import-error
import nltk # pylint: disable=import-error
import pandas as pd # pylint: disable=import-error
from nltk import word_tokenize # pylint: disable=import-error
from nltk.corpus import stopwords # pylint: disable=import-error
from .oapen_types import OapenItem, transform_item_data
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
OapenItem,
transform_item_data,
)
nltk.download("stopwords")
@@ -26,7 +29,9 @@ def process_text(text):
l_text = text.lower()
p_text = "".join([c for c in l_text if c not in string.punctuation])
words = word_tokenize(p_text)
filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
filtered_words = list(
filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
) # added isalpha to check that it contains only letters
return filtered_words
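# Hedged usage sketch (not part of the original commit): assuming nltk's
# "punkt" tokenizer data is also available locally, process_text drops
# punctuation, stopwords, and non-alphabetic tokens from a hypothetical string.
example_tokens = process_text("The 3 dogs barked, loudly!")
print(example_tokens)  # expected roughly: ['dogs', 'barked', 'loudly']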
@@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
return df
def run_ngrams():
def get_text_by_uuid(df, uuid):
return df.loc[df.uuid == uuid].text.iloc[0]  # iloc: first match regardless of index label
def generate_ngram(text, n):
ngrams = {}
# count how many times each ngram appears
for index in range(0, len(text) + 1 - n):
ngram = " ".join(text[index: index + n])
ngrams.setdefault(ngram, 0) # sets the current ngram's count to 0 if it is not present yet
ngrams[ngram] += 1
return dict(
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
) # return sorted by count
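# Hedged usage sketch (not part of the original commit): trigram counts over a
# short, made-up token list, sorted by how often each trigram appears.
demo_tokens = ["the", "cat", "sat", "on", "the", "cat", "sat"]
print(generate_ngram(demo_tokens, 3))
# expected: {'the cat sat': 2, 'cat sat on': 1, 'sat on the': 1, 'on the cat': 1}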
def generate_ngram_by_uuid(df, uuid, n):
text = get_text_by_uuid(df, uuid)
return generate_ngram(text, n)
def get_n_most_occuring(dic: dict, n=100):
sorted_dict = dict(
sorted(dic.items(), key=lambda item: item[1], reverse=True)
) # re-sorts in case of additions made after generate_ngram
return list(sorted_dict)[:n]
# Currently, this compares the n most occurring ngrams from each book.
# It could also weight matches by how often each of those ngrams occurs.
def get_similarity_score(ngram1, ngram2, n=100):
n_most_occ_1 = get_n_most_occuring(ngram1, n)
n_most_occ_2 = get_n_most_occuring(ngram2, n)
repeated = 0
for n_gram in n_most_occ_1:
if n_gram in n_most_occ_2:
repeated += 1
return repeated / n
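# Hedged sketch of the overlap-based score (not part of the original commit):
# the top-2 ngrams of each made-up dict are compared, and 1 of the 2 overlaps.
ngrams_a = {"open access books": 3, "access books are": 2, "books are free": 1}
ngrams_b = {"open access books": 5, "digital library hosts": 2}
print(get_similarity_score(ngrams_a, ngrams_b, n=2))  # expected: 0.5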
# this treats ngrams1 as the primary ngrams, since we want a
# 100% similarity score if all ngrams match from book 1
# this means that a fragment of a book will get a 100% similarity score
# when compared to its own book, but not in the reverse direction
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
repeated = 0
total = sum(ngrams1.values()) # gets counts from book 1
for key, ngrams1_value in ngrams1.items():
repeated += min(
ngrams1_value, ngrams2.get(key, 0)
) # adds min value, or 0 by default if key not found
# if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
# print(key)
return repeated / total
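# Hedged sketch of the count-based score and its asymmetry (not part of the
# original commit): every ngram of the made-up "fragment" occurs at least as
# often in the "book", so the fragment scores 1.0 against the book, while the
# book scores only 3/10 against the fragment.
fragment_ngrams = {"open access books": 2, "access books are": 1}
book_ngrams = {"open access books": 4, "access books are": 2, "books are free": 4}
print(get_similarity_score_by_dict_count(fragment_ngrams, book_ngrams))  # 1.0
print(get_similarity_score_by_dict_count(book_ngrams, fragment_ngrams))  # 0.3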
# to demo some functions
def test_functions():
data = get_data()
# Uncomment to print raw text of first book
# for item in data:
# print(item.get_text_bitstream())
# break
df = make_df(data)
print(df.shape)
print(df[:10])
print(df)
sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
print(sample_list[:10])
sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
print(get_n_most_occuring(sample_ngram_list, 2))
# run a demo with the sample titles defined below
def run_demo():
demo_books = {
# should be similar
"Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
"Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
# should be similar but different from first group
"Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
# "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
}
items = []
ngram_dict = {}
print("---------------------------------")
for name, uuid in demo_books.items():
book_item = OapenAPI.get_item(uuid)
print(book_item)
item = transform_item_data(book_item)
items.append(item)
text = process_text(item.get_text_bitstream())
print(f" {name}: text array\n{text[:30]}...\n")
ngram_dict[uuid] = generate_ngram(text, 3)
print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")
print("---------------------------------")
for name, uuid in demo_books.items():
print(f"Showing similarity scores for all books relative to {name}:\n")
for name2, uuid2 in demo_books.items():
if uuid == uuid2: # don't compare a book with itself
continue
simple_similarity_score = 100 * get_similarity_score(
ngram_dict[uuid], ngram_dict[uuid2], n=10000
)
print(
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
)
dict_similarity_score = 100 * get_similarity_score_by_dict_count(
ngram_dict[uuid], ngram_dict[uuid2]
)
print(
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
)
print()
def run_ngrams():
run_demo()

22
pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[tool.black]
line-length = 88
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
| lib
| ./oapen-engine/lib
| __pycache__
| bin
| ./oapen-engine/bin
)/
'''

View File

@@ -1,18 +0,0 @@
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/]
profile=black
[flake8]
ignore = E203, E266, E501, W503, E501
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4
exclude=.git,lib,__pycache__
[tool:pytest]
testpaths=src/test