in sourcecode/scoring/topic_model.py [0:0]
def _get_stop_words(self, texts: np.ndarray) -> List[str]:
  """Identify tokens in the extracted vocabulary that contain seed terms.

  Any token containing a seed term will be treated as a stop word (i.e. removed
  from the extracted features).  This prevents the model from training on the
  same tokens used to label the data.

  Args:
    texts: array containing strings for topic assignment

  Returns:
    List specifying which tokens to exclude from the features.
  """
  # Extract vocabulary from the raw text.
  cv = CountVectorizer(strip_accents="unicode")
  cv.fit(texts)
  rawVocabulary = cv.vocabulary_.keys()
  logger.info(f" Initial vocabulary length: {len(rawVocabulary)}")
  # Normalize seed terms: strip surrounding whitespace and remove any escaped
  # characters (e.g. regex escapes such as "\b") so the remaining plain text
  # can be matched against vocabulary tokens by substring containment.
  blockedTokens = set()
  for terms in self._seedTerms.values():
    blockedTokens |= {re.sub(r"\\.", "", t.strip()) for t in terms}
  # Guard against empty normalized terms: "" is a substring of every token, so
  # a single empty entry (e.g. a term that was only whitespace or escapes)
  # would otherwise mark the ENTIRE vocabulary as stop words.
  blockedTokens.discard("")
  logger.info(f" Total tokens to filter: {len(blockedTokens)}")
  # Any vocabulary token containing a blocked substring becomes a stop word.
  stopWords = [v for v in rawVocabulary if any(t in v for t in blockedTokens)]
  logger.info(f" Total identified stopwords: {len(stopWords)}")
  return stopWords