import re

import nltk
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer for Russian: collapses inflected word forms to a common stem
stemmer = SnowballStemmer("russian")

# nltk.download()  # uncomment on first run to fetch the tokenizer data ('punkt')
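A quick sanity check of the stemmer: the outputs in the comments are what the Russian Snowball rules should produce for two inflected forms of «запрос», so both collapse to the same stem.

print(stemmer.stem('запросы'))  # запрос
print(stemmer.stem('запроса'))  # запрос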
def token_and_stem(text):
    # Split the text into sentences, then each sentence into word tokens
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # Keep only tokens that contain at least one Cyrillic letter
    # (note that «ё» falls outside the а-я range)
    filtered_tokens = []
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    # Reduce each remaining token to its stem
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
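On a short query the function behaves roughly as below (a sketch: the phrase is an illustrative example of mine, not taken from the dataset, and it assumes the NLTK tokenizer data has already been downloaded):

print(token_and_stem('купить входные двери'))
# ['куп', 'входн', 'двер']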
def token_only(text):
    # Same tokenization and Cyrillic filter, lowercased, but without stemming
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    return filtered_tokens
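On the same sample phrase, token_only keeps the full lowercased word forms instead of stems, which presumably is why both a stemmed and an unstemmed vocabulary are kept:

print(token_only('купить входные двери'))
# ['купить', 'входные', 'двери']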
# Create dictionaries (arrays) from the obtained stems and tokens
totalvocab_stem = []
totalvocab_token = []
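These lists are then presumably filled by running both helpers over every loaded query. A minimal sketch, assuming all_wall from the loading step above is a list of query strings:

for text in all_wall:
    totalvocab_stem.extend(token_and_stem(text))
    totalvocab_token.extend(token_only(text))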