final changes to stopwords

2023-02-10 00:33:51 -05:00 · 2023-02-10 00:33:51 -05:00 · bd82fc9a33
parent 7e9f955150
commit bd82fc9a33
5 changed files with 39 additions and 36 deletions
--- a/oapen-engine/oapen-suggestion-service
+++ b/oapen-engine/oapen-suggestion-service
@@ -0,0 +1 @@
+Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50
--- a/oapen-engine/src/model/init.py
+++ b/oapen-engine/src/model/init.py
--- a/oapen-engine/src/model/ngrams.py
+++ b/oapen-engine/src/model/ngrams.py
@@ -5,6 +5,7 @@ import nltk
 import pandas as pd  
 from nltk import word_tokenize  
 from nltk.corpus import stopwords  
+from .stopwords_processor import STOPWORDS

 nltk.download('punkt')

@@ -14,42 +15,6 @@ from .oapen_types import (
    OapenItem,
 )

-# define nltk package preset stopwords langauges
-# for more options, see: https://pypi.org/project/stop-words/
-stopwords_languages = [
-    "english",
-    "german",
-    "dutch"
-]
-
-# define paths for additionally defined stopwords
-stopword_paths = [
-    "model/stopwords/broken.txt",
-    "model/stopwords/dutch.txt",
-    "model/stopwords/filter.txt",
-    "model/stopwords/publisher.txt",
-]
-
-# collect the words into a list, pull from each file
-custom_stopwords = []
-for p in stopword_paths:
-    with open(p, "r") as f:
-        custom_stopwords += [line.rstrip() for line in f]
-
-# confirm stopwords package, download if not present
-try:
-    stopwords.words("english")
-except LookupError:
-    nltk.download("stopwords")
-
-# add languages
-nltk_stopwords = []
-for language in stopwords_languages:
-    nltk_stopwords += stopwords.words(language)
-
-# add languages and custom stopwords for final stopwords var
-STOPWORDS = (nltk_stopwords + custom_stopwords)
-
 def process_text(text):
    l_text = text.lower()
    p_text = "".join([c for c in l_text if c not in string.punctuation])
--- a/oapen-engine/src/model/stopwords/stopwords_full_list.py
+++ b/oapen-engine/src/model/stopwords/stopwords_full_list.py
--- a/oapen-engine/src/model/stopwords_processor.py
+++ b/oapen-engine/src/model/stopwords_processor.py
@@ -0,0 +1,37 @@
+import nltk
+from nltk.corpus import stopwords
+from functools import reduce
+
+# Download the NLTK stopwords corpus up front, as a precaution against the
+# "NLTK stop words not found" error that can occur after installing nltk.
+nltk.download("stopwords")
+
+# to add custom stopwords, put a <name>.txt file (one word per line) in the folder below and list <name> here
+custom_lists_folder = "model/stopwords/"
+custom_stopwords_in_use = [
+    "broken",
+    "dutch",
+    "filter",
+    "publisher",
+]
+
+# For the available languages, see https://pypi.org/project/stop-words/
+enabled_languages = [
+    "english",
+    "german",
+    "dutch"
+]
+
+# the combined stopwords of all enabled languages
+nltk_stopwords = []
+for language in enabled_languages:
+    nltk_stopwords += stopwords.words(language)
+
+# get the custom lists
+custom_stopwords = []
+for custom_list in custom_stopwords_in_use:
+    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # specify folder name
+        custom_stopwords += [line.rstrip() for line in file]
+
+# add languages and custom stopwords for final stopwords var
+STOPWORDS = (nltk_stopwords + custom_stopwords)
				`@ -0,0 +1 @@`
				`Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50`