Update lab2.md

master
Vladimir Protsenko 3 years ago
parent 7d99eec9e3
commit 4f6868cddd

@ -75,10 +75,9 @@ print(str(len(all_wall)) + ' запросов считано')
from nltk.stem.snowball import SnowballStemmer from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("russian") stemmer = SnowballStemmer("russian")
#nltk.download() #nltk.download()
def token_and_stem(text): def token_and_stem(text):
tokens = [word for sent in tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
nltk.sent_tokenize(text) for word in
nltk.word_tokenize(sent)]
filtered_tokens = [] filtered_tokens = []
for token in tokens: for token in tokens:
if re.search('[а-яА-Я]', token): if re.search('[а-яА-Я]', token):
@ -88,14 +87,13 @@ filtered_tokens]
return stems return stems
def token_only(text): def token_only(text):
tokens = [word.lower() for sent in tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
nltk.sent_tokenize(text) for word in
nltk.word_tokenize(sent)]
filtered_tokens = [] filtered_tokens = []
for token in tokens: for token in tokens:
if re.search('[а-яА-Я]', token): if re.search('[а-яА-Я]', token):
filtered_tokens.append(token) filtered_tokens.append(token)
return filtered_tokens return filtered_tokens
#Создаем словари (массивы) из полученных основ #Создаем словари (массивы) из полученных основ
totalvocab_stem = [] totalvocab_stem = []
totalvocab_token = [] totalvocab_token = []

Loading…
Cancel
Save