diff --git a/lab2.md b/lab2.md index 645815e..4edce66 100644 --- a/lab2.md +++ b/lab2.md @@ -75,10 +75,9 @@ print(str(len(all_wall)) + ' запросов считано') from nltk.stem.snowball import SnowballStemmer stemmer = SnowballStemmer("russian") #nltk.download() + def token_and_stem(text): - tokens = [word for sent in -nltk.sent_tokenize(text) for word in -nltk.word_tokenize(sent)] + tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] filtered_tokens = [] for token in tokens: if re.search('[а-яА-Я]', token): @@ -88,14 +87,13 @@ filtered_tokens] return stems def token_only(text): - tokens = [word.lower() for sent in -nltk.sent_tokenize(text) for word in -nltk.word_tokenize(sent)] + tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] filtered_tokens = [] for token in tokens: if re.search('[а-яА-Я]', token): filtered_tokens.append(token) return filtered_tokens + #Создаем словари (массивы) из полученных основ totalvocab_stem = [] totalvocab_token = []