final changes to stopwords

stopwords
Max Zaremba 2023-02-10 00:33:51 -05:00
parent 7e9f955150
commit bd82fc9a33
5 changed files with 39 additions and 36 deletions

@ -0,0 +1 @@
Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50

View File

View File

@ -5,6 +5,7 @@ import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from .stopwords_processor import STOPWORDS
nltk.download('punkt')
@ -14,42 +15,6 @@ from .oapen_types import (
OapenItem,
)
# NLTK stopword corpora to load, identified by language name
# for more options, see: https://pypi.org/project/stop-words/
stopwords_languages = ["english", "german", "dutch"]

# files holding the additionally defined stopwords, one entry per line
stopword_paths = [
    "model/stopwords/broken.txt",
    "model/stopwords/dutch.txt",
    "model/stopwords/filter.txt",
    "model/stopwords/publisher.txt",
]

# read each file in order and keep every line with trailing whitespace removed
custom_stopwords = []
for path in stopword_paths:
    with open(path, "r") as handle:
        custom_stopwords.extend(line.rstrip() for line in handle)

# probe the stopwords corpus; download it only when the lookup fails
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

# gather the NLTK stopwords of every configured language
nltk_stopwords = []
for lang in stopwords_languages:
    nltk_stopwords.extend(stopwords.words(lang))

# final stopwords list: NLTK words first, followed by the custom ones
STOPWORDS = nltk_stopwords + custom_stopwords
def process_text(text):
l_text = text.lower()
p_text = "".join([c for c in l_text if c not in string.punctuation])

View File

@ -0,0 +1,37 @@
import nltk
from nltk.corpus import stopwords
from functools import reduce
# This is run as a precaution in case of the error "NLTK stop words not found":
# it makes sure the stop words are downloaded after installing nltk, before the
# stopwords.words() calls below are made.
nltk.download("stopwords")

# Custom stopword lists are read from this folder as "<name>.txt", one entry
# per line. To add a list, put the file in the folder and add its name here.
custom_lists_folder = "model/stopwords/"
custom_stopwords_in_use = [
    "broken",
    "dutch",
    "filter",
    "publisher",
]

# NLTK stopword languages to enable.
# For reference on available languages, please reference https://pypi.org/project/stop-words/
enabled_languages = [
    "english",
    "german",
    "dutch",
]

# the combined stopwords of all enabled languages
nltk_stopwords = []
for language in enabled_languages:
    nltk_stopwords += stopwords.words(language)

# get the custom lists
custom_stopwords = []
for custom_list in custom_stopwords_in_use:
    list_path = f"{custom_lists_folder}{custom_list}.txt"
    # Decode explicitly as UTF-8 so non-ASCII entries are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(list_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.rstrip() for line in file)
        # Skip blank lines so the empty string never becomes a stopword.
        custom_stopwords += [word for word in stripped_lines if word]

# add languages and custom stopwords for final stopwords var
# (NLTK words first, then the custom entries)
STOPWORDS = nltk_stopwords + custom_stopwords