Чтобы токенизировать строку и подсчитать частоту слов с помощью класса Counter из модуля collections в Python, можно использовать несколько способов. Вот некоторые из них:
Метод 1: использование метода split() и collections.Counter
from collections import Counter
def tokenize_string(string: str) -> Counter:
    """Count how often each whitespace-separated token occurs in ``string``.

    Tokens come from ``str.split()`` called without arguments, so any run of
    whitespace acts as a separator and an empty or all-whitespace string
    yields an empty counter.  No normalisation is applied: case is preserved
    and punctuation stays attached to its word, so ``"words,"`` and
    ``"words"`` are counted as different tokens.

    Args:
        string: The text to tokenize.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences.
    """
    # Counter tallies the token list in a single pass.
    return Counter(string.split())
# Example usage: count the tokens of a sample sentence and print the result.
# The tokenizer above splits on whitespace only, so "string." and "words,"
# keep their punctuation and "This"/"It" keep their capital letters.
text = "This is a sample string. It contains several words, some of which may repeat."
word_frequency = tokenize_string(text)
print(word_frequency)
Метод 2: использование регулярных выражений и collections.Counter
import re
from collections import Counter
def tokenize_string(string: str) -> Counter:
    r"""Count how often each word occurs in ``string``, ignoring case.

    The text is lower-cased first and then scanned with the regular
    expression ``\w+``, so a token is a maximal run of word characters
    (letters, digits and underscore).  Everything else, including whitespace
    and punctuation, only separates tokens and is never counted.  As a
    consequence an apostrophe splits a word: ``"don't"`` yields ``"don"``
    and ``"t"``.

    Args:
        string: The text to tokenize.

    Returns:
        A ``collections.Counter`` mapping each lower-cased word to its
        number of occurrences; empty when ``string`` contains no word
        characters.
    """
    # Lower-case before matching so "This" and "this" share one count.
    words = re.findall(r'\w+', string.lower())
    return Counter(words)
# Example usage: count the words of a sample sentence and print the result.
# The tokenizer above lower-cases the text and keeps only runs of word
# characters, so punctuation does not appear in the printed counts.
text = "This is a sample string. It contains several words, some of which may repeat."
word_frequency = tokenize_string(text)
print(word_frequency)
Метод 3: использование библиотеки nltk и collections.Counter
import nltk
from collections import Counter
def tokenize_string(string: str) -> Counter:
    """Count how often each NLTK token occurs in ``string``, ignoring case.

    The text is lower-cased and then handed to ``nltk.word_tokenize``; the
    resulting tokens are tallied as-is, so whatever the NLTK tokenizer emits
    as a separate token (including punctuation marks) gets its own count.

    Note:
        ``nltk`` is a third-party package, and ``nltk.word_tokenize`` relies
        on NLTK's Punkt tokenizer data being installed.  If that resource is
        missing, NLTK raises ``LookupError``; download it with
        ``nltk.download(...)`` using the resource name given in the NLTK
        documentation for the installed version.

    Args:
        string: The text to tokenize.

    Returns:
        A ``collections.Counter`` mapping each lower-cased token to its
        number of occurrences.
    """
    # Lower-case before tokenizing so "This" and "this" share one count.
    tokens = nltk.word_tokenize(string.lower())
    return Counter(tokens)
# Example usage: tokenize a sample sentence with nltk and print the counts.
# NOTE(review): nltk.word_tokenize is expected to emit punctuation marks as
# separate tokens, so "." and "," would show up in the output; confirm.
text = "This is a sample string. It contains several words, some of which may repeat."
word_frequency = tokenize_string(text)
print(word_frequency)