From 97964a79b45a63600d3b3cf6b358453aef4f071a Mon Sep 17 00:00:00 2001 From: Vladimir Protsenko Date: Tue, 7 Sep 2021 10:56:07 +0000 Subject: [PATCH] Update lab2.md --- lab2.md | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lab2.md b/lab2.md index 4edce66..93f4540 100644 --- a/lab2.md +++ b/lab2.md @@ -77,22 +77,21 @@ stemmer = SnowballStemmer("russian") #nltk.download() def token_and_stem(text): - tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] - filtered_tokens = [] - for token in tokens: - if re.search('[а-яА-Я]', token): - filtered_tokens.append(token) - stems = [stemmer.stem(t) for t in -filtered_tokens] -return stems + tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] + filtered_tokens = [] + for token in tokens: + if re.search('[а-яА-Я]', token): + filtered_tokens.append(token) + stems = [stemmer.stem(t) for t in filtered_tokens] + return stems def token_only(text): - tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] - filtered_tokens = [] - for token in tokens: - if re.search('[а-яА-Я]', token): - filtered_tokens.append(token) - return filtered_tokens + tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] + filtered_tokens = [] + for token in tokens: + if re.search('[а-яА-Я]', token): + filtered_tokens.append(token) + return filtered_tokens #Создаем словари (массивы) из полученных основ totalvocab_stem = []