Merge branch 'main' of https://github.com/EbookFoundation/oapen-suggestion-service into main
commit ee45695fb6
@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9
ignore = E203, E266, E501, W503, F403, F401
@@ -22,7 +22,7 @@ jobs:
pipenv install --deploy --dev
pipenv run isort --profile black src/
pipenv run black --check src/ --exclude="lib/*"
pipenv run flake8 src/ --ignore="lib/* W"
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
pipenv run pytest
@@ -0,0 +1,9 @@
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/, bin/]
profile=black
known_third_party = data,lib,model,nltk,pandas,psycopg2,requests
@@ -1,18 +1,14 @@
repos:
  - repo: https://github.com/pycqa/isort
    rev: 5.10.1
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
        args: [--profile, black, --filter-files, oapen-engine/src]
        name: isort (python)
  - repo: https://github.com/psf/black
    rev: stable
    hooks:
      - id: black
        language_version: python3.10
        args: [oapen-engine/src]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v1.2.3
    hooks:
      - id: flake8
        args: [oapen-engine/src]
      - id: isort
  - repo: https://github.com/ambv/black
    rev: 22.10.0
    hooks:
      - id: black
        language_version: python3.10
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: flake8
@@ -161,4 +161,7 @@ cython_debug/

database.ini
lib/python3.7/
lib/python3.10/
bin/
.pytest_cache
lib/
@@ -1,18 +0,0 @@
setup-env:
ifeq ($(OS),Windows_NT)
	py -m pip install --upgrade pip
else
	python -m pip install --upgrade pip
endif
	pip install pipenv
	pipenv install
	pipenv shell

seed_db:
	cd src && pipenv run python tasks/seed.py

clean_db:
	cd src && pipenv run python tasks/clean.py

run:
	cd src && pipenv run python main.py
@@ -1,14 +1,17 @@
import string
from typing import List

import data.oapen as OapenAPI
import lib.stopwords as oapen_stopwords
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import data.oapen as OapenAPI  # pylint: disable=import-error
import model.stopwords as oapen_stopwords  # pylint: disable=import-error
import nltk  # pylint: disable=import-error
import pandas as pd  # pylint: disable=import-error
from nltk import word_tokenize  # pylint: disable=import-error
from nltk.corpus import stopwords  # pylint: disable=import-error

from .oapen_types import OapenItem, transform_item_data
from .oapen_types import (  # pylint: disable=relative-beyond-top-level
    OapenItem,
    transform_item_data,
)

nltk.download("stopwords")
@@ -26,7 +29,9 @@ def process_text(text):
    l_text = text.lower()
    p_text = "".join([c for c in l_text if c not in string.punctuation])
    words = word_tokenize(p_text)
    filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
    filtered_words = list(
        filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
    )  # added isalpha to check that it contains only letters

    return filtered_words
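A minimal, self-contained sketch of what the added x.isalpha() check does (illustrative only, not part of this commit; the token list and stopword set below are made up):

STOPWORDS = {"the", "on"}  # made-up stand-in for the nltk stopword set used above
words = ["the", "cat", "sat", "on", "the", "mat", "42", "2nd"]
filtered_words = list(filter(lambda x: x not in STOPWORDS and x.isalpha(), words))
print(filtered_words)  # ['cat', 'sat', 'mat'] -- numeric tokens are dropped as well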
@@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
    return df


def run_ngrams():
def get_text_by_uuid(df, uuid):
    return df.loc[df.uuid == uuid].text[0]


def generate_ngram(text, n):
    ngrams = {}
    # store appearance count of each trigram
    for index in range(0, len(text) + 1 - n):
        ngram = " ".join(text[index: index + n])
        ngrams.setdefault(ngram, 0)  # sets curr ngram to 0 if non-existent
        ngrams[ngram] += 1
    return dict(
        sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
    )  # return sorted by count
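
# A minimal sketch (not part of this commit) of generate_ngram() on a tiny,
# made-up token list: with n=2 it counts bigrams and sorts them by frequency.
_demo_tokens = ["the", "cat", "sat", "on", "the", "cat"]
print(generate_ngram(_demo_tokens, 2))
# {'the cat': 2, 'cat sat': 1, 'sat on': 1, 'on the': 1}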

def generate_ngram_by_uuid(df, uuid, n):
    text = get_text_by_uuid(df, uuid)
    return generate_ngram(text, n)


def get_n_most_occuring(dic: dict, n=100):
    sorted_dict = dict(
        sorted(dic.items(), key=lambda item: item[1], reverse=True)
    )  # sorts in case of additions post generate_ngram
    return list(sorted_dict)[:n]


# Currently, this uses the n most occuring ngrams to compare
# This could also count the instances in the highest
def get_similarity_score(ngram1, ngram2, n=100):
    n_most_occ_1 = get_n_most_occuring(ngram1, n)
    n_most_occ_2 = get_n_most_occuring(ngram2, n)
    repeated = 0
    for n_gram in n_most_occ_1:
        if n_gram in n_most_occ_2:
            repeated += 1
    return repeated / n
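
# A minimal sketch (not part of this commit) of get_similarity_score() on two
# made-up ngram-count dicts: with n=3, two of the three most occurring ngrams
# overlap, so the score is 2/3.
_a = {"the cat": 5, "cat sat": 3, "sat on": 1}
_b = {"the cat": 4, "cat sat": 2, "on mat": 2}
print(get_similarity_score(_a, _b, n=3))  # 0.666...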

# this treats ngrams1 as primary ngrams, since we want a
# 100% similarity score if all ngrams match from book 1
# this means that a fragment of a book will get a 100% similarity score
# when compared to its own book, but not the reverse interaction
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
    repeated = 0
    total = sum(ngrams1.values())  # gets counts from book 1
    for key, ngrams1_value in ngrams1.items():
        repeated += min(
            ngrams1_value, ngrams2.get(key, 0)
        )  # adds min value, or 0 by default if key not found
        # if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
        #     print(key)
    return repeated / total
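
# A minimal sketch (not part of this commit) illustrating the asymmetry noted
# above, with made-up counts: a "fragment" whose ngram counts are a subset of
# the full book scores 1.0 against the book, but not the other way around.
_fragment = {"the cat": 1, "cat sat": 1}
_full_book = {"the cat": 3, "cat sat": 2, "sat on": 4}
print(get_similarity_score_by_dict_count(_fragment, _full_book))  # 2/2 = 1.0
print(get_similarity_score_by_dict_count(_full_book, _fragment))  # 2/9 ≈ 0.22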

# to demo some functions
def test_functions():
    data = get_data()
    # Uncomment to print raw text of first book
    # for item in data:
    #     print(item.get_text_bitstream())
    #     break
    df = make_df(data)
    print(df.shape)
    print(df[:10])
    print(df)
    sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
    print(sample_list[:10])
    sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
    print(get_n_most_occuring(sample_ngram_list, 2))


# run demo with the above titles
def run_demo():
    demo_books = {
        # should be similar
        "Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
        "Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
        # should be similar but different from first group
        "Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
        # "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
    }

    items = []
    ngram_dict = {}

    print("---------------------------------")

    for name, uuid in demo_books.items():
        book_item = OapenAPI.get_item(uuid)
        print(book_item)

        item = transform_item_data(book_item)
        items.append(item)

        text = process_text(item.get_text_bitstream())
        print(f" {name}: text array\n{text[:30]}...\n")

        ngram_dict[uuid] = generate_ngram(text, 3)
        print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")

    print("---------------------------------")

    for name, uuid in demo_books.items():
        print(f"Showing similarity scores for all books relative to {name}:\n")
        for name2, uuid2 in demo_books.items():
            if uuid == uuid2:  # don't check self
                continue

            simple_similarity_score = 100 * get_similarity_score(ngram_dict[uuid], ngram_dict[uuid2], n=10000)
            print(
                f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
            )

            dict_similarity_score = 100 * get_similarity_score_by_dict_count(ngram_dict[uuid], ngram_dict[uuid2])
            print(
                f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
            )
        print()


def run_ngrams():
    run_demo()
@@ -0,0 +1,22 @@
[tool.black]
line-length = 88
include = '\.pyi?$'
exclude = '''
/(
    \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | buck-out
  | build
  | dist
  | lib
  | ./oapen-engine/lib
  | __pycache__
  | bin
  | ./oapen-engine/bin

)/
'''
setup.cfg
@@ -1,18 +0,0 @@
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/]
profile=black

[flake8]
ignore = E203, E266, E501, W503, E501
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4
exclude=.git,lib,__pycache__

[tool:pytest]
testpaths=src/test