diff --git a/lab2.md b/lab2.md
index 4edce66..93f4540 100644
--- a/lab2.md
+++ b/lab2.md
@@ -77,22 +77,21 @@ stemmer = SnowballStemmer("russian")
 #nltk.download()
 
 def token_and_stem(text):
-    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
-    filtered_tokens = []
-    for token in tokens:
-        if re.search('[а-яА-Я]', token):
-            filtered_tokens.append(token)
-    stems = [stemmer.stem(t) for t in
-filtered_tokens]
-return stems
+    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
+    filtered_tokens = []
+    for token in tokens:
+        if re.search('[а-яА-Я]', token):
+            filtered_tokens.append(token)
+    stems = [stemmer.stem(t) for t in filtered_tokens]
+    return stems
 
 def token_only(text):
-    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
-    filtered_tokens = []
-    for token in tokens:
-        if re.search('[а-яА-Я]', token):
-            filtered_tokens.append(token)
-    return filtered_tokens
+    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
+    filtered_tokens = []
+    for token in tokens:
+        if re.search('[а-яА-Я]', token):
+            filtered_tokens.append(token)
+    return filtered_tokens
 
 #Создаем словари (массивы) из полученных основ
 totalvocab_stem = []
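
For reference, a quick way to sanity-check the patched helpers. This is a minimal sketch, not part of the lab itself: it assumes NLTK and its "punkt" tokenizer data are installed (e.g. via `nltk.download('punkt')`), and it uses condensed equivalents of the two corrected functions; the sample sentence and expected output are illustrative only.

```python
# Condensed equivalents of the patched lab2.md helpers, for a quick sanity check.
# Assumes NLTK's "punkt" tokenizer data has been downloaded.
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("russian")

def token_and_stem(text):
    # Split into sentences, then words; keep only tokens containing Cyrillic letters.
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = [t for t in tokens if re.search('[а-яА-Я]', t)]
    # Reduce each remaining token to its Snowball stem.
    return [stemmer.stem(t) for t in filtered_tokens]

def token_only(text):
    # Same filtering, but lowercase the tokens and skip stemming.
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    return [t for t in tokens if re.search('[а-яА-Я]', t)]

sample = "Мама мыла раму."           # hypothetical sample sentence
print(token_only(sample))            # Cyrillic tokens, lowercased, punctuation dropped
print(token_and_stem(sample))        # the same tokens reduced to stems, roughly ['мам', 'мыл', 'рам']
```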