From bd82fc9a33f9405d8cdc9badac29c6319c9213c0 Mon Sep 17 00:00:00 2001 From: Max Zaremba Date: Fri, 10 Feb 2023 00:33:51 -0500 Subject: [PATCH] final changes to stopwords --- oapen-engine/oapen-suggestion-service | 1 + oapen-engine/src/model/__init__.py | 0 oapen-engine/src/model/ngrams.py | 37 +------------------ .../stopwords_full_list.py} | 0 oapen-engine/src/model/stopwords_processor.py | 37 +++++++++++++++++++ 5 files changed, 39 insertions(+), 36 deletions(-) create mode 160000 oapen-engine/oapen-suggestion-service create mode 100644 oapen-engine/src/model/__init__.py rename oapen-engine/src/model/{stopwords.py => stopwords/stopwords_full_list.py} (100%) create mode 100644 oapen-engine/src/model/stopwords_processor.py diff --git a/oapen-engine/oapen-suggestion-service b/oapen-engine/oapen-suggestion-service new file mode 160000 index 0000000..d09adeb --- /dev/null +++ b/oapen-engine/oapen-suggestion-service @@ -0,0 +1 @@ +Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50 diff --git a/oapen-engine/src/model/__init__.py b/oapen-engine/src/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/oapen-engine/src/model/ngrams.py b/oapen-engine/src/model/ngrams.py index a3ce8c2..3448ef9 100644 --- a/oapen-engine/src/model/ngrams.py +++ b/oapen-engine/src/model/ngrams.py @@ -5,6 +5,7 @@ import nltk import pandas as pd from nltk import word_tokenize from nltk.corpus import stopwords +from .stopwords_processor import STOPWORDS nltk.download('punkt') @@ -14,42 +15,6 @@ from .oapen_types import ( OapenItem, ) -# define nltk package preset stopwords langauges -# for more options, see: https://pypi.org/project/stop-words/ -stopwords_languages = [ - "english", - "german", - "dutch" -] - -# define paths for additionally defined stopwords -stopword_paths = [ - "model/stopwords/broken.txt", - "model/stopwords/dutch.txt", - "model/stopwords/filter.txt", - "model/stopwords/publisher.txt", -] - -# collect the words into a list, pull from each file 
-custom_stopwords = [] -for p in stopword_paths: - with open(p, "r") as f: - custom_stopwords += [line.rstrip() for line in f] - -# confirm stopwords package, download if not present -try: - stopwords.words("english") -except LookupError: - nltk.download("stopwords") - -# add languages -nltk_stopwords = [] -for language in stopwords_languages: - nltk_stopwords += stopwords.words(language) - -# add languages and custom stopwords for final stopwords var -STOPWORDS = (nltk_stopwords + custom_stopwords) - def process_text(text): l_text = text.lower() p_text = "".join([c for c in l_text if c not in string.punctuation]) diff --git a/oapen-engine/src/model/stopwords.py b/oapen-engine/src/model/stopwords/stopwords_full_list.py similarity index 100% rename from oapen-engine/src/model/stopwords.py rename to oapen-engine/src/model/stopwords/stopwords_full_list.py diff --git a/oapen-engine/src/model/stopwords_processor.py b/oapen-engine/src/model/stopwords_processor.py new file mode 100644 index 0000000..fb61ed0 --- /dev/null +++ b/oapen-engine/src/model/stopwords_processor.py @@ -0,0 +1,37 @@ +import nltk +from nltk.corpus import stopwords +from functools import reduce + +# This is run as a precaution in case of the error "NLTK stop words not found", +# which makes sure to download the stop words after installing nltk +nltk.download("stopwords") + +# add additional custom stopwords to the model/stopwords/ folder and update the reference here +custom_lists_folder = "model/stopwords/" +custom_stopwords_in_use = [ + "broken", + "dutch", + "filter", + "publisher", +] + +# For the available languages, see https://pypi.org/project/stop-words/ +enabled_languages = [ + "english", + "german", + "dutch" +] + +# the combined stopwords of all enabled languages +nltk_stopwords = [] +for language in enabled_languages: + nltk_stopwords += stopwords.words(language) + +# get the custom lists +custom_stopwords = [] +for custom_list in custom_stopwords_in_use: + with 
open(custom_lists_folder + custom_list + ".txt", "r") as file: # specify folder name + custom_stopwords += [line.rstrip() for line in file] + +# add languages and custom stopwords for final stopwords var +STOPWORDS = (nltk_stopwords + custom_stopwords) \ No newline at end of file