final changes to stopwords
parent
7e9f955150
commit
bd82fc9a33
|
@ -0,0 +1 @@
|
|||
Subproject commit d09adeb83f0d878f53e36b3dcf19df6ab2bffa50
|
|
@ -5,6 +5,7 @@ import nltk
|
|||
import pandas as pd
|
||||
from nltk import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from .stopwords_processor import STOPWORDS
|
||||
|
||||
nltk.download('punkt')
|
||||
|
||||
|
@ -14,42 +15,6 @@ from .oapen_types import (
|
|||
OapenItem,
|
||||
)
|
||||
|
||||
# Preset stopword languages shipped with the nltk package.
# For more options, see: https://pypi.org/project/stop-words/
stopwords_languages = [
    "english",
    "german",
    "dutch"
]

# Files holding additional, manually curated stopwords.
stopword_paths = [
    "model/stopwords/broken.txt",
    "model/stopwords/dutch.txt",
    "model/stopwords/filter.txt",
    "model/stopwords/publisher.txt",
]

# Gather every custom stopword: one word per line in each file.
custom_stopwords = []
for path in stopword_paths:
    with open(path, "r") as handle:
        custom_stopwords.extend(line.rstrip() for line in handle)

# Make sure the nltk stopwords corpus is available; fetch it on first use.
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

# Combine the preset stopwords of every enabled language.
nltk_stopwords = []
for lang in stopwords_languages:
    nltk_stopwords.extend(stopwords.words(lang))

# Final stopword list: nltk presets plus the custom additions.
STOPWORDS = nltk_stopwords + custom_stopwords
|
||||
|
||||
def process_text(text):
|
||||
l_text = text.lower()
|
||||
p_text = "".join([c for c in l_text if c not in string.punctuation])
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
import nltk
from nltk.corpus import stopwords
# NOTE(review): `reduce` appears unused in this module — confirm before removing.
from functools import reduce

# Guard against the "NLTK stop words not found" error: download the corpus
# only when it is actually missing, instead of unconditionally hitting the
# network on every import of this module.
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

# Add additional custom stopword lists to ./custom_lists/ and register them here.
custom_lists_folder = "model/stopwords/"
custom_stopwords_in_use = [
    "broken",
    "dutch",
    "filter",
    "publisher",
]

# For reference on available languages, please reference https://pypi.org/project/stop-words/
enabled_languages = [
    "english",
    "german",
    "dutch"
]

# The combined stopwords of all enabled languages.
nltk_stopwords = []
for language in enabled_languages:
    nltk_stopwords += stopwords.words(language)

# Read the custom lists: each file holds one stopword per line.
custom_stopwords = []
for custom_list in custom_stopwords_in_use:
    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # specify folder name
        custom_stopwords += [line.rstrip() for line in file]

# Final stopwords: nltk presets for every enabled language plus the custom lists.
STOPWORDS = (nltk_stopwords + custom_stopwords)
|
Loading…
Reference in New Issue