Update lab2.md

master
Vladimir Protsenko 3 years ago
parent 4f6868cddd
commit 97964a79b4

@ -77,22 +77,21 @@ stemmer = SnowballStemmer("russian")
#nltk.download()
def token_and_stem(text):
    """Tokenize *text* and return the stems of its Cyrillic-bearing tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens that contain no
    character in the ranges а-я / А-Я (punctuation, numbers, Latin-only
    words) are dropped; the remaining tokens are stemmed with the
    module-level ``stemmer``.

    Args:
        text: The raw text to process.

    Returns:
        A list of stems, in the order their tokens appear in ``text``.
    """
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # Keep only tokens with at least one Cyrillic letter before stemming.
    # NOTE(review): ё/Ё lie outside the а-я/А-Я code point ranges, so a token
    # whose only Cyrillic letters are ё would be filtered out.
    return [stemmer.stem(t) for t in tokens if re.search('[а-яА-Я]', t)]
def token_only(text):
    """Tokenize *text* and return its lowercased Cyrillic-bearing tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``; every word is
    lowercased. Tokens that contain no character in the ranges а-я / А-Я
    (punctuation, numbers, Latin-only words) are dropped. No stemming is
    applied.

    Args:
        text: The raw text to process.

    Returns:
        A list of lowercased tokens, in the order they appear in ``text``.
    """
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # Keep only tokens with at least one Cyrillic letter.
    # NOTE(review): ё/Ё lie outside the а-я/А-Я code point ranges, so a token
    # whose only Cyrillic letters are ё would be filtered out.
    return [t for t in tokens if re.search('[а-яА-Я]', t)]
# Build the vocabularies (lists) from the resulting stems
totalvocab_stem = []

Loading…
Cancel
Save