Celina Peralta 2022-10-23 19:52:52 -04:00
commit ee45695fb6
9 changed files with 186 additions and 64 deletions

5
.flake8 Normal file
View File

@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9
ignore = E203, E266, E501, W503, F403, F401

View File

@@ -22,7 +22,7 @@ jobs:
pipenv install --deploy --dev
pipenv run isort --profile black src/
pipenv run black --check src/ --exclude="lib/*"
pipenv run flake8 src/ --ignore="lib/* W"
pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
pipenv run pytest

9
.isort.cfg Normal file
View File

@@ -0,0 +1,9 @@
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/, bin/]
profile=black
known_third_party = data,lib,model,nltk,pandas,psycopg2,requests

View File

@@ -1,18 +1,14 @@
repos:
- repo: https://github.com/pycqa/isort
rev: 5.10.1
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
hooks:
- id: isort
args: [--profile, black, --filter-files, oapen-engine/src]
name: isort (python)
- repo: https://github.com/psf/black
rev: stable
hooks:
- id: black
language_version: python3.10
args: [oapen-engine/src]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
hooks:
- id: flake8
args: [oapen-engine/src]
- id: isort
- repo: https://github.com/ambv/black
rev: 22.10.0
hooks:
- id: black
language_version: python3.10
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: flake8

View File

@@ -161,4 +161,7 @@ cython_debug/
database.ini
lib/python3.7/
lib/python3.10/
lib/python3.10/
bin/
.pytest_cache
lib/

View File

@@ -1,18 +0,0 @@
setup-env:
ifeq ($(OS),Windows_NT)
py -m pip install --upgrade pip
else
python -m pip install --upgrade pip
endif
pip install pipenv
pipenv install
pipenv shell
seed_db:
cd src && pipenv run python tasks/seed.py
clean_db:
cd src && pipenv run python tasks/clean.py
run:
cd src && pipenv run python main.py

View File

@@ -1,14 +1,17 @@
import string
from typing import List
import data.oapen as OapenAPI
import lib.stopwords as oapen_stopwords
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import data.oapen as OapenAPI # pylint: disable=import-error
import model.stopwords as oapen_stopwords # pylint: disable=import-error
import nltk # pylint: disable=import-error
import pandas as pd # pylint: disable=import-error
from nltk import word_tokenize # pylint: disable=import-error
from nltk.corpus import stopwords # pylint: disable=import-error
from .oapen_types import OapenItem, transform_item_data
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
OapenItem,
transform_item_data,
)
nltk.download("stopwords")
@@ -26,7 +29,9 @@ def process_text(text):
l_text = text.lower()
p_text = "".join([c for c in l_text if c not in string.punctuation])
words = word_tokenize(p_text)
filtered_words = list(filter(lambda x: x not in STOPWORDS, words))
filtered_words = list(
filter(lambda x: x not in STOPWORDS and x.isalpha(), words)
) # added isalpha to check that it contains only letters
return filtered_words
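# Hedged usage sketch (not part of the original commit): assuming nltk's
# "punkt" tokenizer data is also available locally, process_text drops
# punctuation, stopwords, and non-alphabetic tokens from a hypothetical string.
example_tokens = process_text("The 3 dogs barked, loudly!")
print(example_tokens)  # expected roughly: ['dogs', 'barked', 'loudly']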
@@ -57,8 +62,126 @@ def make_df(data: List[OapenItem]):
return df
def run_ngrams():
def get_text_by_uuid(df, uuid):
return df.loc[df.uuid == uuid].text.iloc[0]  # iloc: first match regardless of index label
def generate_ngram(text, n):
ngrams = {}
# count how many times each ngram appears
for index in range(0, len(text) + 1 - n):
ngram = " ".join(text[index: index + n])
ngrams.setdefault(ngram, 0) # sets the current ngram's count to 0 if it is not present yet
ngrams[ngram] += 1
return dict(
sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
) # return sorted by count
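# Hedged usage sketch (not part of the original commit): trigram counts over a
# short, made-up token list, sorted by how often each trigram appears.
demo_tokens = ["the", "cat", "sat", "on", "the", "cat", "sat"]
print(generate_ngram(demo_tokens, 3))
# expected: {'the cat sat': 2, 'cat sat on': 1, 'sat on the': 1, 'on the cat': 1}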
def generate_ngram_by_uuid(df, uuid, n):
text = get_text_by_uuid(df, uuid)
return generate_ngram(text, n)
def get_n_most_occuring(dic: dict, n=100):
sorted_dict = dict(
sorted(dic.items(), key=lambda item: item[1], reverse=True)
) # re-sorts in case of additions made after generate_ngram
return list(sorted_dict)[:n]
# Currently, this compares the n most occurring ngrams from each book.
# It could also weight matches by how often each of those ngrams occurs.
def get_similarity_score(ngram1, ngram2, n=100):
n_most_occ_1 = get_n_most_occuring(ngram1, n)
n_most_occ_2 = get_n_most_occuring(ngram2, n)
repeated = 0
for n_gram in n_most_occ_1:
if n_gram in n_most_occ_2:
repeated += 1
return repeated / n
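# Hedged sketch of the overlap-based score (not part of the original commit):
# the top-2 ngrams of each made-up dict are compared, and 1 of the 2 overlaps.
ngrams_a = {"open access books": 3, "access books are": 2, "books are free": 1}
ngrams_b = {"open access books": 5, "digital library hosts": 2}
print(get_similarity_score(ngrams_a, ngrams_b, n=2))  # expected: 0.5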
# this treats ngrams1 as the primary ngrams, since we want a
# 100% similarity score if all ngrams match from book 1
# this means that a fragment of a book will get a 100% similarity score
# when compared to its own book, but not in the reverse direction
def get_similarity_score_by_dict_count(ngrams1, ngrams2):
repeated = 0
total = sum(ngrams1.values()) # gets counts from book 1
for key, ngrams1_value in ngrams1.items():
repeated += min(
ngrams1_value, ngrams2.get(key, 0)
) # adds min value, or 0 by default if key not found
# if(min(ngrams1_value, ngrams2.get(key, 0)) != 0):
# print(key)
return repeated / total
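# Hedged sketch of the count-based score and its asymmetry (not part of the
# original commit): every ngram of the made-up "fragment" occurs at least as
# often in the "book", so the fragment scores 1.0 against the book, while the
# book scores only 3/10 against the fragment.
fragment_ngrams = {"open access books": 2, "access books are": 1}
book_ngrams = {"open access books": 4, "access books are": 2, "books are free": 4}
print(get_similarity_score_by_dict_count(fragment_ngrams, book_ngrams))  # 1.0
print(get_similarity_score_by_dict_count(book_ngrams, fragment_ngrams))  # 0.3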
# to demo some functions
def test_functions():
data = get_data()
# Uncomment to print raw text of first book
# for item in data:
# print(item.get_text_bitstream())
# break
df = make_df(data)
print(df.shape)
print(df[:10])
print(df)
sample_list = get_text_by_uuid(df, df.iloc[0].uuid)
print(sample_list[:10])
sample_ngram_list = generate_ngram_by_uuid(df, df.iloc[0].uuid, 3)
print(get_n_most_occuring(sample_ngram_list, 2))
# run a demo with the sample titles defined below
def run_demo():
demo_books = {
# should be similar
"Domestic...": "01d59c45-78b8-4710-9805-584c72866c32",
"Local Leadership ...": "00fc2a5a-6540-4176-ac76-c35ddba4cceb",
# should be similar but different from first group
"Repurposing Music...": "02445c92-5c12-47e3-bde7-5764ef6c0434",
# "An Experimental..." : "00fa7fba-0343-4db9-b18b-7c9d430a1131"
}
items = []
ngram_dict = {}
print("---------------------------------")
for name, uuid in demo_books.items():
book_item = OapenAPI.get_item(uuid)
print(book_item)
item = transform_item_data(book_item)
items.append(item)
text = process_text(item.get_text_bitstream())
print(f" {name}: text array\n{text[:30]}...\n")
ngram_dict[uuid] = generate_ngram(text, 3)
print(f" {name}: ngram dictionary\n {list(ngram_dict[uuid].items())[:30]}...")
print("---------------------------------")
for name, uuid in demo_books.items():
print(f"Showing similarity scores for all books relative to {name}:\n")
for name2, uuid2 in demo_books.items():
if uuid == uuid2: # don't compare a book with itself
continue
simple_similarity_score = 100 * get_similarity_score(
ngram_dict[uuid], ngram_dict[uuid2], n=10000
)
print(
f" Similarity score by simple count for title {name2}: {simple_similarity_score}%"
)
dict_similarity_score = 100 * get_similarity_score_by_dict_count(
ngram_dict[uuid], ngram_dict[uuid2]
)
print(
f" Similarity score by dict count for title {name2}: {dict_similarity_score}%"
)
print()
def run_ngrams():
run_demo()

22
pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[tool.black]
line-length = 88
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
| lib
| ./oapen-engine/lib
| __pycache__
| bin
| ./oapen-engine/bin
)/
'''

View File

@@ -1,18 +0,0 @@
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
skip=[lib/]
profile=black
[flake8]
ignore = E203, E266, E501, W503, E501
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4
exclude=.git,lib,__pycache__
[tool:pytest]
testpaths=src/test