Чтобы создать представление набора слов с помощью scikit-learn в Python, вы можете использовать несколько методов. Вот несколько примеров:
Метод 1: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: four short sentences used to demonstrate the bag-of-words model.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of CountVectorizer (raw per-document term counts).
vectorizer = CountVectorizer()

# Learn the vocabulary and transform the corpus into a sparse document-term matrix.
X = vectorizer.fit_transform(corpus)

# Densify for display: one row per document, one column per vocabulary term.
bag_of_words = X.toarray()

# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the current API (available since 0.24).
feature_names = vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

# Print the bag-of-words representation
print("Bag-of-words representation:")
print(bag_of_words)
Метод 2: TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: four short sentences used to demonstrate the TF-IDF model.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of TfidfVectorizer (TF-IDF-weighted bag-of-words).
vectorizer = TfidfVectorizer()

# Learn vocabulary + IDF weights and transform the corpus into a sparse matrix.
X = vectorizer.fit_transform(corpus)

# Densify for display: one row per document, one column per vocabulary term.
bag_of_words = X.toarray()

# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the current API (available since 0.24).
feature_names = vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

# Print the bag-of-words representation
print("Bag-of-words representation:")
print(bag_of_words)
Метод 3: HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Demo corpus: four short sentences.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# HashingVectorizer maps tokens to a fixed number of columns via a hash
# function, so it is stateless: no fit step and no stored vocabulary.
vectorizer = HashingVectorizer(n_features=10)
X = vectorizer.transform(corpus)

# Densify the sparse matrix for display: one row per document,
# ten hashed-feature columns.
bag_of_words = X.toarray()

print("Bag-of-words representation:")
print(bag_of_words)
Это всего лишь несколько примеров создания представления «мешка слов» с помощью scikit-learn. Другие методы включают использование специальных методов токенизации, предварительной обработки и извлечения признаков.