Basic testing (#45)
* Finished changes to stopwords and languages
* Final changes to stopwords
* Basic testing
* Add tests
* Remove formatter for now
* Fix merge
* cd
* touch __init__
* Relative path issue?
* Run tests before app
* Move tests to inside Docker
* Exit when any command fails

Co-authored-by: Max Zaremba <max.zaremba@gmail.com>
parent 884872cf60
commit 376545450d
@@ -1,27 +0,0 @@
-name: OAPEN Engine
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    env:
-      working-directory: ./oapen-engine
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - name: Setup Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.10'
-
-    - name: Install dependencies with pipenv
-      working-directory: ${{env.working-directory}}
-      run: |
-        pip install pipenv
-        pipenv install --deploy --dev
-        pipenv run isort --profile black src/
-        pipenv run black --check src/ --exclude="lib/*"
-        pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"
-
@@ -1,4 +1,4 @@
-name: Web lint checker
+name: Build and test web
 on: push
 jobs:
   test:
@@ -1,4 +1,4 @@
-name: Test Containers
+name: Build and test containers

 on: push

@@ -48,4 +48,4 @@ RUN chmod -R +x scripts
 USER appuser

 # Run the application
-ENTRYPOINT ["./scripts/run.sh"]
+ENTRYPOINT ["./scripts/test-and-run.sh"]
@@ -0,0 +1,42 @@
+PYTHONEX ?= "python"
+PYTHONPATH = "$(CURDIR)/src"
+PYTHON = PYTHONPATH="$(PYTHONPATH)" $(PYTHONEX)
+
+setup-env:
+ifeq ($(OS),Windows_NT)
+	py -m pip install --upgrade pip
+else
+	$(PYTHON) -m pip install --upgrade pip
+endif
+	$(PYTHON) -m pip install pipenv
+	$(PYTHON) -m pipenv install --skip-lock
+	$(PYTHON) -m pipenv shell
+
+seed_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/seed.py
+
+clean_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/clean.py
+
+clean_and_seed:
+	$(MAKE) clean_db
+	$(MAKE) seed_db
+
+generate_suggestions:
+	cd src && $(PYTHON) -m pipenv run python tasks/generate_suggestions.py
+
+run:
+	$(MAKE) clean_and_seed
+	$(MAKE) generate_suggestions
+
+run-tests:
+	cd src && $(PYTHON) -m pipenv run pytest
+
+refresh-items:
+	cd src && $(PYTHON) -m pipenv run python tasks/refresh_items.py
+
+run-daemon:
+	cd src && $(PYTHON) -m pipenv run python tasks/daemon.py
+
+run-unit-tests:
+	cd src && $(PYTHON) -m pipenv run python test/data/run_tests.py
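Note: every task recipe first changes into src/ (cd src && ...), so the task scripts resolve their relative paths from there; this appears to be the "Relative path issue" fix mentioned in the commit message. run-tests invokes pytest, while run-unit-tests runs the standalone test/data/run_tests.py runner added below.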
@@ -10,6 +10,9 @@ psycopg2 = "2.9.3"
 pandas = "*"
 scikit-learn = "*"
 lxml = "*"
+charset_normalizer = "*"
+idna = "*"
+certifi = "*"

 [dev-packages]
 pytest = "*"
@@ -1,3 +0,0 @@
-#!/bin/sh
-
-python src/tasks/daemon.py
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# exit when any command fails
+set -e
+
+echo "Running tests..." && \
+python src/test/data/run_tests.py && \
+echo "Running app" && \
+python src/tasks/daemon.py
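Note: with set -e in effect the shell already aborts on the first failing command, so the && chaining is belt-and-braces; either mechanism alone keeps the daemon from starting when a test fails.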
@@ -1,37 +1,19 @@
 import string
 from typing import List

-import pandas as pd  # pylint: disable=import-error
-from nltk import word_tokenize  # pylint: disable=import-error
-from nltk.corpus import stopwords  # pylint: disable=import-error
+import nltk
+from nltk import word_tokenize
+from .stopwords_processor import STOPWORDS
+import pandas as pd

-from .oapen_types import (  # pylint: disable=relative-beyond-top-level
+nltk.download('punkt')
+
+from .oapen_types import (
     NgramDict,
     NgramRowWithoutDate,
     OapenItem,
 )

-stopword_paths = [
-    "src/model/stopwords_broken.txt",
-    "src/model/stopwords_dutch.txt",
-    "src/model/stopwords_filter.txt",
-    "src/model/stopwords_publisher.txt",
-]
-
-stopwords_list = []
-
-for p in stopword_paths:
-    with open(p, "r") as f:
-        stopwords_list += [line.rstrip() for line in f]
-
-STOPWORDS = (
-    stopwords.words("english")
-    + stopwords.words("german")
-    + stopwords.words("dutch")
-    + stopwords_list
-)
-

 def process_text(text):
     l_text = text.lower()
     p_text = "".join([c for c in l_text if c not in string.punctuation])
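Note: the hunk above is truncated mid-function, so the tokenizing and stopword filtering are not visible. A minimal sketch of how process_text presumably continues, given the word_tokenize and STOPWORDS imports and the token lists asserted in test_ngrams.py below; this body is an assumption, not part of the commit:

    def process_text(text):
        # lowercase, strip punctuation, tokenize, then drop stopwords
        l_text = text.lower()
        p_text = "".join([c for c in l_text if c not in string.punctuation])
        words = word_tokenize(p_text)
        return [w for w in words if w not in STOPWORDS]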
@@ -0,0 +1,44 @@
+import nltk
+from nltk.corpus import stopwords
+from functools import reduce
+import os
+
+# This is run as a precaution in case of the error "NLTK stop words not found",
+# which makes sure to download the stop words after installing nltk
+nltk.download("stopwords")
+
+# add additional custom stopwords to ./custom_lists/ folder and update the reference here
+# print working directory
+print("Working directory: " + os.getcwd())
+
+current_dir = os.path.realpath(os.path.dirname(__file__))
+print("Local script directory: " + current_dir)
+
+custom_lists_folder = current_dir + "/stopwords/"
+custom_stopwords_in_use = [
+    "broken",
+    "dutch",
+    "filter",
+    "publisher",
+]
+
+# For reference on available languages, see https://pypi.org/project/stop-words/
+enabled_languages = [
+    "english",
+    "german",
+    "dutch"
+]
+
+# the combined stopwords of all enabled languages
+nltk_stopwords = []
+for language in enabled_languages:
+    nltk_stopwords += stopwords.words(language)
+
+# get the custom lists
+custom_stopwords = []
+for custom_list in custom_stopwords_in_use:
+    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # specify folder name
+        custom_stopwords += [line.rstrip() for line in file]
+
+# add languages and custom stopwords for final stopwords var
+STOPWORDS = (nltk_stopwords + custom_stopwords)
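Note: a quick usage sketch of the new module (the token list is made up; the import path matches the one the tests below use):

    from model.stopwords_processor import STOPWORDS

    tokens = ["the", "quick", "red", "fox"]   # hypothetical input
    content_words = [t for t in tokens if t not in STOPWORDS]
    print(content_words)                      # -> ['quick', 'red', 'fox']; "the" is an English stopword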
@@ -0,0 +1,41 @@
+import test_oapen
+import test_stopwords
+import test_ngrams
+
+def run_test(run_msg, func):
+    print(run_msg, end=" ")
+    func()  # will throw on fail
+    print("OK")
+
+def main():
+    print("Testing connection to OAPEN.")
+    try:
+        run_test("Attempting to get item [Embodying Contagion]:", test_oapen.test_get_item)
+        run_test("Attempting to get null item:", test_oapen.test_get_item_404)
+        run_test("Attempting to get collection limit by label [Knowledge Unlatched (KU)]:",
+                 test_oapen.test_get_collection_limit)
+        run_test("Attempting to get null collection:", test_oapen.test_get_collection_404)
+    except Exception as e:
+        print("\nFailed:")
+        print(e)
+
+    print("\nTesting stopwords generation.")
+    try:
+        run_test("Testing stopwords correctly generated:",
+                 test_stopwords.test_stopwords_contains_all)
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+    print("\nTesting ngrams functionality.")
+    try:
+        run_test("Testing process_text:", test_ngrams.test_process_text)
+        run_test("Testing ngram generation:", test_ngrams.test_generate_ngram)
+        run_test("Testing similarity score:", test_ngrams.test_similarity_score)
+
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,51 @@
+import model.ngrams as ngrams
+
+test_text1 = "Foxes are cunning animals. There was a quick, red fox known to avoid crossing roads during the day, doing so only at night."
+test_text2 = "The quick red fox jumped over the lazy brown dog. It had a fantastic time doing so, as it felt finally free. The fox had been in the zoo for far too long, held in captivity."
+
+processed_text1 = ['foxes', 'cunning', 'animals', 'quick', 'red', 'fox', 'known', 'avoid', 'crossing', 'roads', 'day', 'night']
+processed_text2 = ['quick', 'red', 'fox', 'jumped', 'lazy', 'brown', 'dog', 'fantastic', 'time', 'felt', 'finally', 'free', 'fox', 'zoo', 'far', 'long', 'held', 'captivity']
+
+ngrams1 = {
+    'foxes cunning animals': 1,
+    'cunning animals quick': 1,
+    'animals quick red': 1,
+    'quick red fox': 1,
+    'red fox known': 1,
+    'fox known avoid': 1,
+    'known avoid crossing': 1,
+    'avoid crossing roads': 1,
+    'crossing roads day': 1,
+    'roads day night': 1
+}
+ngrams2 = {
+    'quick red fox': 1,
+    'red fox jumped': 1,
+    'fox jumped lazy': 1,
+    'jumped lazy brown': 1,
+    'lazy brown dog': 1,
+    'brown dog fantastic': 1,
+    'dog fantastic time': 1,
+    'fantastic time felt': 1,
+    'time felt finally': 1,
+    'felt finally free': 1,
+    'finally free fox': 1,
+    'free fox zoo': 1,
+    'fox zoo far': 1,
+    'zoo far long': 1,
+    'far long held': 1,
+    'long held captivity': 1
+}
+
+def test_process_text():
+    assert ngrams.process_text(test_text1) == processed_text1
+    assert ngrams.process_text(test_text2) == processed_text2
+
+def test_generate_ngram():
+    assert ngrams.generate_ngram(processed_text1) == ngrams1
+    assert ngrams.generate_ngram(processed_text2) == ngrams2
+
+def test_similarity_score():
+    assert ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=False) == 1
+    assert ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=True) == 0.2
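Note: generate_ngram and get_similarity_score themselves are not part of this commit. A minimal sketch consistent with the fixtures above; the trigram default and the "overlap of the top-n ngrams" reading of n=5 are assumptions inferred from the 0.2 assertion:

    def generate_ngram(tokens, n=3):
        # count each run of n consecutive tokens, joined by spaces
        counts = {}
        for i in range(len(tokens) - n + 1):
            gram = " ".join(tokens[i:i + n])
            counts[gram] = counts.get(gram, 0) + 1
        return counts

    def get_similarity_score(ngrams1, ngrams2, n=None, as_percent=False):
        # compare only the n highest-count ngrams of each dict;
        # sorted() is stable, so equal counts keep insertion order
        top1 = set(sorted(ngrams1, key=ngrams1.get, reverse=True)[:n])
        top2 = set(sorted(ngrams2, key=ngrams2.get, reverse=True)[:n])
        matches = len(top1 & top2)
        return matches / n if as_percent else matches

With the fixtures above, the only shared top-5 trigram is 'quick red fox', giving 1 raw match and 1/5 = 0.2 as a percentage.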
@@ -1,6 +1,6 @@
 from typing import List

-import src.data.oapen as OapenAPI
+import data.oapen as OapenAPI
 from model.oapen_types import OapenItem

@@ -0,0 +1,23 @@
+from model.stopwords_processor import STOPWORDS
+import model.stopwords.stopwords_full_list as stopwords_full_list
+# currently contains stopwords_filter, stopwords_publisher, stopwords_broken, stopwords_dutch_extra
+
+# tests all at once
+def test_stopwords_contains_all():
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter)
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher)
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken)
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra)
+
+# individual tests provided if needed
+def test_stopwords_contains_stopwords_filter():
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter)
+
+def test_stopwords_contains_stopwords_publisher():
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher)
+
+def test_stopwords_contains_stopwords_broken():
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken)
+
+def test_stopwords_contains_stopwords_dutch_extra():
+    assert all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra)
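Note: stopwords_full_list is imported here but not added in this commit. The shape these tests assume is one list of strings per custom file; the entries below are placeholders, not real data:

    # model/stopwords/stopwords_full_list.py (hypothetical contents)
    stopwords_filter = ["filter-word"]
    stopwords_publisher = ["publisher-name"]
    stopwords_broken = ["broken-token"]
    stopwords_dutch_extra = ["dutch-word"]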