From bd82fc9a33f9405d8cdc9badac29c6319c9213c0 Mon Sep 17 00:00:00 2001
From: Max Zaremba <max.zaremba@gmail.com>
Date: Fri, 10 Feb 2023 00:33:51 -0500
Subject: [PATCH] final changes to stopwords

---
 oapen-engine/oapen-suggestion-service         |  1 +
 oapen-engine/src/model/__init__.py            |  0
 oapen-engine/src/model/ngrams.py              | 37 +------------------
 .../stopwords_full_list.py}                   |  0
 oapen-engine/src/model/stopwords_processor.py | 37 +++++++++++++++++++
 5 files changed, 39 insertions(+), 36 deletions(-)
 create mode 160000 oapen-engine/oapen-suggestion-service
 create mode 100644 oapen-engine/src/model/__init__.py
 rename oapen-engine/src/model/{stopwords.py => stopwords/stopwords_full_list.py} (100%)
 create mode 100644 oapen-engine/src/model/stopwords_processor.py

diff --git a/oapen-engine/oapen-suggestion-service b/oapen-engine/oapen-suggestion-service
new file mode 160000
index 0000000..d09adeb
--- /dev/null
+++ b/oapen-engine/oapen-suggestion-service
@@ -0,0 +1 @@
+Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50
diff --git a/oapen-engine/src/model/__init__.py b/oapen-engine/src/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/oapen-engine/src/model/ngrams.py b/oapen-engine/src/model/ngrams.py
index a3ce8c2..3448ef9 100644
--- a/oapen-engine/src/model/ngrams.py
+++ b/oapen-engine/src/model/ngrams.py
@@ -5,6 +5,7 @@ import nltk
 import pandas as pd  
 from nltk import word_tokenize  
 from nltk.corpus import stopwords  
+from .stopwords_processor import STOPWORDS
 
 nltk.download('punkt')
 
@@ -14,42 +15,6 @@ from .oapen_types import (
     OapenItem,
 )
 
-# define nltk package preset stopwords langauges
-# for more options, see: https://pypi.org/project/stop-words/
-stopwords_languages = [
-    "english",
-    "german",
-    "dutch"
-]
-
-# define paths for additionally defined stopwords
-stopword_paths = [
-    "model/stopwords/broken.txt",
-    "model/stopwords/dutch.txt",
-    "model/stopwords/filter.txt",
-    "model/stopwords/publisher.txt",
-]
-
-# collect the words into a list, pull from each file
-custom_stopwords = []
-for p in stopword_paths:
-    with open(p, "r") as f:
-        custom_stopwords += [line.rstrip() for line in f]
-
-# confirm stopwords package, download if not present
-try:
-    stopwords.words("english")
-except LookupError:
-    nltk.download("stopwords")
-
-# add languages
-nltk_stopwords = []
-for language in stopwords_languages:
-    nltk_stopwords += stopwords.words(language)
-
-# add languages and custom stopwords for final stopwords var
-STOPWORDS = (nltk_stopwords + custom_stopwords)
-
 def process_text(text):
     l_text = text.lower()
     p_text = "".join([c for c in l_text if c not in string.punctuation])
diff --git a/oapen-engine/src/model/stopwords.py b/oapen-engine/src/model/stopwords/stopwords_full_list.py
similarity index 100%
rename from oapen-engine/src/model/stopwords.py
rename to oapen-engine/src/model/stopwords/stopwords_full_list.py
diff --git a/oapen-engine/src/model/stopwords_processor.py b/oapen-engine/src/model/stopwords_processor.py
new file mode 100644
index 0000000..fb61ed0
--- /dev/null
+++ b/oapen-engine/src/model/stopwords_processor.py
@@ -0,0 +1,37 @@
+import nltk
+from nltk.corpus import stopwords
+from functools import reduce
+
+# Download the NLTK stopwords corpus up front so that the stopwords.words()
+# calls below do not raise LookupError when the corpus is not installed yet.
+nltk.download("stopwords")
+
+# To add a custom stopword list, put <name>.txt in the folder below and add <name> to the list here
+custom_lists_folder = "model/stopwords/"
+custom_stopwords_in_use = [
+    "broken",
+    "dutch",
+    "filter",
+    "publisher",
+]
+
+# For reference on available languages, see https://pypi.org/project/stop-words/
+enabled_languages = [
+    "english",
+    "german",
+    "dutch"
+]
+
+# the combined stopwords of all enabled languages
+nltk_stopwords = []
+for language in enabled_languages:
+    nltk_stopwords += stopwords.words(language)
+
+# get the custom lists
+custom_stopwords = []
+for custom_list in custom_stopwords_in_use:
+    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # specify folder name
+        custom_stopwords += [line.rstrip() for line in file]
+
+# combine the NLTK language stopwords and the custom stopwords into the final STOPWORDS list
+STOPWORDS = (nltk_stopwords + custom_stopwords)
\ No newline at end of file