@@ -82,9 +82,8 @@ def token_and_stem(text):
     for token in tokens:
         if re.search('[а-яА-Я]', token):
             filtered_tokens.append(token)
-    stems = [stemmer.stem(t) for t in
-filtered_tokens]
+    stems = [stemmer.stem(t) for t in filtered_tokens]
     return stems
 
 def token_only(text):
     tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
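
For context, here is a self-contained sketch of how both helpers might look after this change. The hunk shows only the lines above, so the imports, the `filtered_tokens` initialisation, the first line of `token_and_stem`, the remainder of `token_only`, and the `SnowballStemmer("russian")` instance are assumptions filled in to make the sketch runnable, not code confirmed by the patch.

import re

import nltk
from nltk.stem.snowball import SnowballStemmer

# Assumed module-level stemmer; the diff does not show how it is created.
stemmer = SnowballStemmer("russian")


def token_and_stem(text):
    # Split the text into sentences, then into word tokens.
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Keep only tokens containing at least one Cyrillic letter,
    # dropping punctuation, numbers, and Latin-only strings.
    filtered_tokens = []
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    # Reduce each surviving token to its stem (the line joined by this diff).
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def token_only(text):
    # Same tokenisation and Cyrillic filter, but lowercased and unstemmed.
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    return filtered_tokens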