Merge branch 'main' of https://github.com/EbookFoundation/oapen-suggestion-service into main
commit ee45695fb6

@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 88
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
+ignore = E203, E266, E501, W503, F403, F401
@@ -22,7 +22,7 @@ jobs:
 pipenv install --deploy --dev
 pipenv run isort --profile black src/
 pipenv run black --check src/ --exclude="lib/*"
-pipenv run flake8 src/ --ignore="lib/* W"
+pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
 pipenv run pytest

@@ -0,0 +1,9 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
+skip=[lib/, bin/]
+profile=black
+known_third_party = data,lib,model,nltk,pandas,psycopg2,requests
@@ -1,18 +1,14 @@
 repos:
-  - repo: https://github.com/pycqa/isort
-    rev: 5.10.1
-    hooks:
-      - id: isort
-        args: [--profile, black, --filter-files, oapen-engine/src]
-        name: isort (python)
-  - repo: https://github.com/psf/black
-    rev: stable
-    hooks:
-      - id: black
-        language_version: python3.10
-        args: [oapen-engine/src]
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v1.2.3
-    hooks:
-      - id: flake8
-        args: [oapen-engine/src]
+  - repo: https://github.com/pre-commit/mirrors-isort
+    rev: v5.10.1
+    hooks:
+      - id: isort
+  - repo: https://github.com/ambv/black
+    rev: 22.10.0
+    hooks:
+      - id: black
+        language_version: python3.10
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: flake8
@@ -161,4 +161,7 @@ cython_debug/

 database.ini
 lib/python3.7/
 lib/python3.10/
+bin/
+.pytest_cache
+lib/
@@ -1,18 +0,0 @@
-setup-env:
-ifeq ($(OS),Windows_NT)
-	py -m pip install --upgrade pip
-else
-	python -m pip install --upgrade pip
-endif
-	pip install pipenv
-	pipenv install
-	pipenv shell
-
-seed_db:
-	cd src && pipenv run python tasks/seed.py
-
-clean_db:
-	cd src && pipenv run python tasks/clean.py
-
-run:
-	cd src && pipenv run python main.py
@@ -1,14 +1,17 @@
 import string
 from typing import List

-import data.oapen as OapenAPI
-import lib.stopwords as oapen_stopwords
-import nltk
-import pandas as pd
-from nltk import word_tokenize
-from nltk.corpus import stopwords
+import data.oapen as OapenAPI  # pylint: disable=import-error
+import model.stopwords as oapen_stopwords  # pylint: disable=import-error
+import nltk  # pylint: disable=import-error
+import pandas as pd  # pylint: disable=import-error
+from nltk import word_tokenize  # pylint: disable=import-error
+from nltk.corpus import stopwords  # pylint: disable=import-error

-from .oapen_types import OapenItem, transform_item_data
+from .oapen_types import (  # pylint: disable=relative-beyond-top-level
+    OapenItem,
+    transform_item_data,
+)

 nltk.download("stopwords")

@@ -26,7 +29,9 @@ def process_text(text):
     l_text = text.lower()
     p_text = "".join([c for c in l_text if c not in string.punctuation])
     words = word_tokenize(p_text)
-    filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
+    filtered_words = list(
+        filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
+    )  # added isalpha to check that it contains only letters

     return filtered_words

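For reference (not part of the commit): a minimal, self-contained sketch of what the stricter filter above does. The tiny STOPWORDS set and the use of str.split() in place of nltk.word_tokenize are assumptions made only so the snippet runs on its own.

import string

# stand-in for nltk.corpus.stopwords; invented for this example
STOPWORDS = {"the", "of", "and"}


def process_text(text):
    l_text = text.lower()
    p_text = "".join(c for c in l_text if c not in string.punctuation)
    words = p_text.split()  # word_tokenize in the real module
    # x.isalpha() drops tokens with digits or other non-letters ("2nd", "1860")
    return [x for x in words if x not in STOPWORDS and x.isalpha()]


print(process_text("The Origin of Species, 2nd edition (1860)"))
# ['origin', 'species', 'edition']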
@@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
     return df


-def run_ngrams():
+def get_text_by_uuid(df, uuid):
+    return df.loc[df.uuid == uuid].text[0]
+
+
+def generate_ngram(text, n):
+    ngrams = {}
+    # store appearance count of each ngram
+    for index in range(0, len(text) + 1 - n):
+        ngram = " ".join(text[index: index + n])
+        ngrams.setdefault(ngram, 0)  # sets curr ngram to 0 if non-existent
+        ngrams[ngram] += 1
+    return dict(
+        sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
+    )  # return sorted by count
+
+
+def generate_ngram_by_uuid(df, uuid, n):
+    text = get_text_by_uuid(df, uuid)
+    return generate_ngram(text, n)
+
+
+def get_n_most_occuring(dic: dict, n=100):
+    sorted_dict = dict(
+        sorted(dic.items(), key=lambda item: item[1], reverse=True)
+    )  # sorts in case of additions post generate_ngram
+    return list(sorted_dict)[:n]
+
+
+# Currently, this uses the n most occurring ngrams to compare
+# This could also count the instances in the highest
+def get_similarity_score(ngram1, ngram2, n=100):
+    n_most_occ_1 = get_n_most_occuring(ngram1, n)
+    n_most_occ_2 = get_n_most_occuring(ngram2, n)
+    repeated = 0
+    for n_gram in n_most_occ_1:
+        if n_gram in n_most_occ_2:
+            repeated += 1
+    return repeated / n
+
+
+# this treats ngrams1 as primary ngrams, since we want a
+# 100% similarity score if all ngrams match from book 1
+# this means that a fragment of a book will get a 100% similarity score
+# when compared to its own book, but not the reverse interaction
+def get_similarity_score_by_dict_count(ngrams1, ngrams2):
+    repeated = 0
+    total = sum(ngrams1.values())  # gets counts from book 1
+    for key, ngrams1_value in ngrams1.items():
+        repeated += min(
+            ngrams1_value, ngrams2.get(key, 0)
+        )  # adds min value, or 0 by default if key not found
+        # if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
+        #     print(key)
+    return repeated / total
+
+
+# to demo some functions
+def test_functions():
     data = get_data()
+    # Uncomment to print raw text of first book
+    # for item in data:
+    #     print(item.get_text_bitstream())
+    #     break
     df = make_df(data)
     print(df.shape)
-    print(df[:10])
+    print(df)
+    sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
+    print(sample_list[:10])
+    sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
+    print(get_n_most_occuring(sample_ngram_list, 2))
+
+
+# run demo with the above titles
+def run_demo():
+    demo_books = {
+        # should be similar
+        "Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
+        "Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
+        # should be similar but different from first group
+        "Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
+        # "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
+    }
+
+    items = []
+    ngram_dict = {}
+
+    print("---------------------------------")
+
+    for name, uuid in demo_books.items():
+        book_item = OapenAPI.get_item(uuid)
+        print(book_item)
+
+        item = transform_item_data(book_item)
+        items.append(item)
+
+        text = process_text(item.get_text_bitstream())
+        print(f" {name}: text array\n{text[:30]}...\n")
+
+        ngram_dict[uuid] = generate_ngram(text, 3)
+        print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")
+
+    print("---------------------------------")
+
+    for name, uuid in demo_books.items():
+        print(f"Showing similarity scores for all books relative to {name}:\n")
+        for name2, uuid2 in demo_books.items():
+            if uuid == uuid2:  # don't check self
+                continue
+
+            simple_similarity_score = 100 * get_similarity_score(ngram_dict[uuid], ngram_dict[uuid2], n=10000)
+            print(
+                f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
+            )
+
+            dict_similarity_score = 100 * get_similarity_score_by_dict_count(ngram_dict[uuid], ngram_dict[uuid2])
+            print(
+                f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
+            )
+            print()
+
+
+def run_ngrams():
+    run_demo()
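For reference (not part of the commit): a small self-contained run of the n-gram scoring introduced above. generate_ngram and get_similarity_score_by_dict_count are restated from the diff; the two toy token lists are invented for illustration.

def generate_ngram(text, n):
    ngrams = {}
    for index in range(0, len(text) + 1 - n):
        ngram = " ".join(text[index: index + n])
        ngrams.setdefault(ngram, 0)
        ngrams[ngram] += 1
    return dict(sorted(ngrams.items(), key=lambda item: item[1], reverse=True))


def get_similarity_score_by_dict_count(ngrams1, ngrams2):
    repeated = 0
    total = sum(ngrams1.values())  # denominator: all of book 1's ngram counts
    for key, ngrams1_value in ngrams1.items():
        repeated += min(ngrams1_value, ngrams2.get(key, 0))
    return repeated / total


book1 = ["open", "access", "books", "need", "open", "access", "metadata"]
book2 = ["open", "access", "metadata", "for", "open", "access", "books"]

ngrams1 = generate_ngram(book1, 3)  # 5 trigrams, each seen once
ngrams2 = generate_ngram(book2, 3)

print(get_similarity_score_by_dict_count(ngrams1, ngrams2))
# 0.4 -> 2 of book1's 5 trigrams ("open access books", "open access metadata")
#        also occur in book2; the score is asymmetric by design.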
@@ -0,0 +1,22 @@
+[tool.black]
+line-length = 88
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.git
+  | \.hg
+  | \.mypy_cache
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+  | lib
+  | ./oapen-engine/lib
+  | __pycache__
+  | bin
+  | ./oapen-engine/bin
+
+)/
+'''
setup.cfg
@@ -1,18 +0,0 @@
-[isort]
-multi_line_output=3
-include_trailing_comma=True
-force_grid_wrap=0
-use_parentheses=True
-line_length=88
-skip=[lib/]
-profile=black
-
-[flake8]
-ignore = E203, E266, E501, W503, E501
-max-line-length = 88
-max-complexity = 18
-select = B,C,E,F,W,T4
-exclude=.git,lib,__pycache__
-
-[tool:pytest]
-testpaths=src/test