Basic testing (#45)

* finished changes to stopwords and languages * final changes to stopwords * basic testing * add tests * Remove formatter for now * fix merge * cd * touch __init__ * Relative path issue \? * run tests before app * Move tests to inside docker * exit when any command fails --------- Co-authored-by: Max Zaremba <max.zaremba@gmail.com>
2023-03-22 14:52:38 -04:00 · 2023-03-22 14:52:38 -04:00 · 376545450d
parent 884872cf60
commit 376545450d
21 changed files with 224 additions and 59 deletions
--- a/.github/workflows/engine.yml
+++ b/.github/workflows/engine.yml
@ -1,27 +0,0 @@
-name: OAPEN Engine
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    env:
-      working-directory: ./oapen-engine
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-        
-      - name: Install dependencies with pipenv
-        working-directory: ${{env.working-directory}}
-        run: |
-          pip install pipenv
-          pipenv install --deploy --dev
-          pipenv run isort --profile black src/
-          pipenv run black --check src/ --exclude="lib/*"
-          pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
-        
-      
--- a/.github/workflows/lint-web.yml
+++ b/.github/workflows/lint-web.yml
@ -1,4 +1,4 @@
-name: Web lint checker
+name: Build and test web
 on: push
 jobs:
  test:
--- a/.github/workflows/test-containers.yml
+++ b/.github/workflows/test-containers.yml
@ -1,4 +1,4 @@
-name: Test Containers
+name: Build and test containers

 on: push

--- a/oapen-engine/Dockerfile
+++ b/oapen-engine/Dockerfile
@ -48,4 +48,4 @@ RUN chmod -R +x scripts
 USER appuser

 # Run the application
-ENTRYPOINT ["./scripts/run.sh"]
+ENTRYPOINT ["./scripts/test-and-run.sh"]
--- a/oapen-engine/Makefile
+++ b/oapen-engine/Makefile
@ -0,0 +1,42 @@
+# Developer task runner for oapen-engine.
+# Every target below is a command rather than a file, so all of them are
+# declared phony; otherwise a file or directory that happens to share a
+# target's name (e.g. `run`) would make that target look up to date.
+
+# Interpreter used to drive pip/pipenv. Override with e.g.
+# `make PYTHONEX=python3 run-tests`.
+PYTHONEX ?= python
+# Module search path handed to Python for every command. The value is kept
+# unquoted here; the one pair of quotes is added where it is used in PYTHON,
+# so the shell sees a single quoted word even if the path contains spaces.
+PYTHONPATH := $(CURDIR)/src
+PYTHON := PYTHONPATH="$(PYTHONPATH)" $(PYTHONEX)
+
+.PHONY: setup-env seed_db clean_db clean_and_seed generate_suggestions
+.PHONY: run run-tests refresh-items run-daemon run-unit-tests
+
+# Upgrade pip, install pipenv and the project dependencies, then open a
+# pipenv shell.
+setup-env:
+ifeq ($(OS),Windows_NT)
+	py -m pip install --upgrade pip
+else
+	$(PYTHON) -m pip install --upgrade pip
+endif
+	$(PYTHON) -m pip install pipenv
+	$(PYTHON) -m pipenv install --skip-lock
+	$(PYTHON) -m pipenv shell
+
+# Run tasks/seed.py inside the pipenv environment.
+seed_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/seed.py
+
+# Run tasks/clean.py inside the pipenv environment.
+clean_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/clean.py
+
+# clean_db followed by seed_db. Sub-makes are used (instead of prerequisites)
+# so the order stays fixed even under `make -j`.
+clean_and_seed:
+	$(MAKE) clean_db
+	$(MAKE) seed_db
+
+# Run tasks/generate_suggestions.py inside the pipenv environment.
+generate_suggestions:
+	cd src && $(PYTHON) -m pipenv run python tasks/generate_suggestions.py
+
+# clean_and_seed followed by generate_suggestions, in that order.
+run:
+	$(MAKE) clean_and_seed
+	$(MAKE) generate_suggestions
+
+# Run pytest from src/.
+run-tests:
+	cd src && $(PYTHON) -m pipenv run pytest
+
+# Run tasks/refresh_items.py inside the pipenv environment.
+refresh-items:
+	cd src && $(PYTHON) -m pipenv run python tasks/refresh_items.py
+
+# Run tasks/daemon.py inside the pipenv environment.
+run-daemon:
+	cd src && $(PYTHON) -m pipenv run python tasks/daemon.py
+
+# Run the standalone test runner script (test/data/run_tests.py).
+run-unit-tests:
+	cd src && $(PYTHON) -m pipenv run python test/data/run_tests.py
--- a/oapen-engine/Pipfile
+++ b/oapen-engine/Pipfile
@ -10,6 +10,9 @@ psycopg2 = "2.9.3"
 pandas = "*"
 scikit-learn = "*"
 lxml = "*"
+charset_normalizer = "*"
+idna = "*"
+certifi = "*"

 [dev-packages]
 pytest = "*"
--- a/oapen-engine/scripts/run.sh
+++ b/oapen-engine/scripts/run.sh
@ -1,3 +0,0 @@
-#!/bin/sh
-
-python src/tasks/daemon.py
--- a/oapen-engine/scripts/test-and-run.sh
+++ b/oapen-engine/scripts/test-and-run.sh
@ -0,0 +1,9 @@
+#!/bin/sh
+
+# exit when any command fails
+set -e
+
+echo "Running tests..." && \
+python src/test/data/run_tests.py && \
+echo "Running app" && \
+python src/tasks/daemon.py
--- a/oapen-engine/src/data/init.py
+++ b/oapen-engine/src/data/init.py
--- a/oapen-engine/src/model/init.py
+++ b/oapen-engine/src/model/init.py
--- a/oapen-engine/src/model/ngrams.py
+++ b/oapen-engine/src/model/ngrams.py
@ -1,37 +1,19 @@
 import string
 from typing import List

-import pandas as pd  # pylint: disable=import-error
-from nltk import word_tokenize  # pylint: disable=import-error
-from nltk.corpus import stopwords  # pylint: disable=import-error
+import nltk 
+from nltk import word_tokenize 
+from .stopwords_processor import STOPWORDS
+import pandas as pd  

-from .oapen_types import (  # pylint: disable=relative-beyond-top-level
+nltk.download('punkt')
+
+from .oapen_types import (
    NgramDict,
    NgramRowWithoutDate,
    OapenItem,
 )

-stopword_paths = [
-    "src/model/stopwords_broken.txt",
-    "src/model/stopwords_dutch.txt",
-    "src/model/stopwords_filter.txt",
-    "src/model/stopwords_publisher.txt",
-]
-
-stopwords_list = []
-
-for p in stopword_paths:
-    with open(p, "r") as f:
-        stopwords_list += [line.rstrip() for line in f]
-
-STOPWORDS = (
-    stopwords.words("english")
-    + stopwords.words("german")
-    + stopwords.words("dutch")
-    + stopwords_list
-)
-
-
 def process_text(text):
    l_text = text.lower()
    p_text = "".join([c for c in l_text if c not in string.punctuation])
--- a/oapen-engine/src/model/stopwords/broken.txt
+++ b/oapen-engine/src/model/stopwords/broken.txt
--- a/oapen-engine/src/model/stopwords/dutch.txt
+++ b/oapen-engine/src/model/stopwords/dutch.txt
--- a/oapen-engine/src/model/stopwords/filter.txt
+++ b/oapen-engine/src/model/stopwords/filter.txt
--- a/oapen-engine/src/model/stopwords/publisher.txt
+++ b/oapen-engine/src/model/stopwords/publisher.txt
--- a/oapen-engine/src/model/stopwords/stopwords_full_list.py
+++ b/oapen-engine/src/model/stopwords/stopwords_full_list.py
--- a/oapen-engine/src/model/stopwords_processor.py
+++ b/oapen-engine/src/model/stopwords_processor.py
@ -0,0 +1,44 @@
+import nltk
+from nltk.corpus import stopwords
+from functools import reduce
+import os
+
+# This is run as a precaution in case of the error "NLTK stop words not found",
+# which makes sure to download the stop words after installing nltk
+nltk.download("stopwords")
+
+# Custom stopword lists are .txt files in the stopwords/ folder next to this
+# module. To add one, put the file there and add its base name (without
+# ".txt") to custom_stopwords_in_use below.
+# print working directory
+print("Working directory: " + os.getcwd())
+
+# Resolve the folder relative to this file, not the working directory, so the
+# lists are found regardless of where the process was started.
+current_dir = os.path.realpath(os.path.dirname(__file__))
+print("Local script directory: " + current_dir)
+
+custom_lists_folder = current_dir + "/stopwords/"
+custom_stopwords_in_use = [
+    "broken",
+    "dutch",
+    "filter",
+    "publisher",
+]
+
+# Language names passed to nltk.corpus.stopwords.words() below.
+# NOTE(review): the original pointer for available languages was
+# https://pypi.org/project/stop-words/, which is a different package from the
+# NLTK corpus used here - confirm valid names against NLTK (stopwords.fileids()).
+enabled_languages = [
+    "english",
+    "german",
+    "dutch"
+]
+
+# the combined stopwords of all enabled languages
+nltk_stopwords = []
+for language in enabled_languages:
+    nltk_stopwords += stopwords.words(language)
+
+# get the custom lists: one stopword per line, trailing whitespace stripped
+custom_stopwords = []
+for custom_list in custom_stopwords_in_use:
+    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # <folder>/<name>.txt
+        custom_stopwords += [line.rstrip() for line in file]
+
+# Final stopwords list: NLTK language lists first, then the custom lists.
+# Duplicates are not removed.
+STOPWORDS = (nltk_stopwords + custom_stopwords)
--- a/oapen-engine/src/test/data/run_tests.py
+++ b/oapen-engine/src/test/data/run_tests.py
@ -0,0 +1,41 @@
+import test_oapen
+import test_stopwords
+import test_ngrams
+
+def run_test(run_msg, func):
+    """Print ``run_msg``, call ``func`` with no arguments, then print "OK".
+
+    Any exception raised by ``func`` propagates to the caller; in that case
+    "OK" is never printed.
+    """
+    print(run_msg, end = " ")
+    func()
+    print("OK")  # only reached if func() did not raise
+
+def main():
+    print("Testing connection to OAPEN.")
+    try:
+        run_test("Attempting to get item [Embodying Contagion]:", test_oapen.test_get_item)
+        run_test("Attempting to get null item:", test_oapen.test_get_item_404)
+        run_test("Attempting to get collection limit by label [Knowledge Unlatched (KU)]:",
+            test_oapen.test_get_collection_limit)
+        run_test("Attempting to get null collection:", test_oapen.test_get_collection_404)
+    except Exception as e:
+        print("\nFailed:")
+        print(e)
+
+    print("\nTesting stopwords generation.")
+    try:
+        run_test("Testing stopwords correctly generated:", 
+            test_stopwords.test_stopwords_contains_all)
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+    print("\nTesting ngrams functionality.")
+    try:
+        run_test("Testing process_text:", test_ngrams.test_process_text)
+        run_test("Testing ngram generation:", test_ngrams.test_generate_ngram)
+        run_test("Testing similarity score:", test_ngrams.test_similarity_score)
+        
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+if __name__ == "__main__":
+    main()
--- a/oapen-engine/src/test/data/test_ngrams.py
+++ b/oapen-engine/src/test/data/test_ngrams.py
@ -0,0 +1,51 @@
+import model.ngrams as ngrams
+
+# Raw input sentences fed to ngrams.process_text in test_process_text.
+test_text1 = "Foxes are cunning animals. There was a quick, red fox known to avoid crossing roads during the day, doing so only at night."
+test_text2 = "The quick red fox jumped over the lazy brown dog. It had a fantastic time doing so, as it felt finally free. The fox had been in the zoo for far too long, held in captivity."
+
+# Expected token lists from process_text for the texts above; also the input
+# to ngrams.generate_ngram in test_generate_ngram.
+processed_text1 = ['foxes', 'cunning', 'animals', 'quick', 'red', 'fox', 'known', 'avoid', 'crossing', 'roads', 'day', 'night']
+processed_text2 = ['quick', 'red', 'fox', 'jumped', 'lazy', 'brown', 'dog', 'fantastic', 'time', 'felt', 'finally', 'free', 'fox', 'zoo', 'far', 'long', 'held', 'captivity']
+
+# Expected generate_ngram results: each key is three consecutive tokens joined
+# by spaces, each value is that trigram's count. The two dicts have exactly
+# one key in common ('quick red fox').
+ngrams1 = {
+    'foxes cunning animals': 1, 
+    'cunning animals quick': 1, 
+    'animals quick red': 1, 
+    'quick red fox': 1, 
+    'red fox known': 1, 
+    'fox known avoid': 1, 
+    'known avoid crossing': 1, 
+    'avoid crossing roads': 1, 
+    'crossing roads day': 1, 
+    'roads day night': 1
+}
+ngrams2 = {
+    'quick red fox': 1, 
+    'red fox jumped': 1, 
+    'fox jumped lazy': 1, 
+    'jumped lazy brown': 1, 
+    'lazy brown dog': 1, 
+    'brown dog fantastic': 1, 
+    'dog fantastic time': 1, 
+    'fantastic time felt': 1, 
+    'time felt finally': 1, 
+    'felt finally free': 1, 
+    'finally free fox': 1, 
+    'free fox zoo': 1, 
+    'fox zoo far': 1, 
+    'zoo far long': 1, 
+    'far long held': 1, 
+    'long held captivity': 1
+}
+
+def test_process_text():
+    assert(ngrams.process_text(test_text1) == processed_text1)
+    assert(ngrams.process_text(test_text2) == processed_text2)
+
+def test_generate_ngram():
+    assert(ngrams.generate_ngram(processed_text1) == ngrams1)
+    assert(ngrams.generate_ngram(processed_text2) == ngrams2)
+
+def test_similarity_score():
+    assert(ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=False) == 1)
+    assert(ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=True) == 0.2)
+    
--- a/oapen-engine/src/test/data/test_oapen.py
+++ b/oapen-engine/src/test/data/test_oapen.py
@ -1,6 +1,6 @@
 from typing import List

-import src.data.oapen as OapenAPI
+import data.oapen as OapenAPI
 from model.oapen_types import OapenItem


--- a/oapen-engine/src/test/data/test_stopwords.py
+++ b/oapen-engine/src/test/data/test_stopwords.py
@ -0,0 +1,23 @@
+from model.stopwords_processor import STOPWORDS
+import model.stopwords.stopwords_full_list as stopwords_full_list
+# currently contains stopwords_filter, stopwords_publisher, stopwords_broken, stopwords_dutch_extra
+
+# tests all at once
+def test_stopwords_contains_all():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra))
+
+# individual tests provided if needed
+def test_stopwords_contains_stopwords_filter():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter))
+    
+def test_stopwords_contains_stopwords_publisher():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher))
+
+def test_stopwords_contains_stopwords_broken():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken))
+
+def test_stopwords_contains_stopwords_dutch_extra():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra))