Чтобы создать представление набора слов с помощью scikit-learn в Python, вы можете использовать несколько методов. Вот несколько примеров:
Метод 1: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: four short sentences used to demonstrate the bag-of-words model.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of CountVectorizer (raw per-document term counts).
vectorizer = CountVectorizer()

# Learn the vocabulary and transform the corpus into a sparse document-term matrix.
X = vectorizer.fit_transform(corpus)

# Densify for display: one row per document, one column per vocabulary term.
bag_of_words = X.toarray()

# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the current API (available since 0.24).
feature_names = vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

# Print the bag-of-words representation
print("Bag-of-words representation:")
print(bag_of_words)
Метод 2: TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: four short sentences used to demonstrate the TF-IDF model.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of TfidfVectorizer (TF-IDF-weighted bag-of-words).
vectorizer = TfidfVectorizer()

# Learn vocabulary + IDF weights and transform the corpus into a sparse matrix.
X = vectorizer.fit_transform(corpus)

# Densify for display: one row per document, one column per vocabulary term.
bag_of_words = X.toarray()

# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the current API (available since 0.24).
feature_names = vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

# Print the bag-of-words representation
print("Bag-of-words representation:")
print(bag_of_words)
Метод 3: HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Demo corpus: four short sentences.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# HashingVectorizer maps tokens to a fixed number of columns via a hash
# function, so it is stateless: no fit step and no stored vocabulary.
vectorizer = HashingVectorizer(n_features=10)
X = vectorizer.transform(corpus)

# Densify the sparse matrix for display: one row per document,
# ten hashed-feature columns.
bag_of_words = X.toarray()

print("Bag-of-words representation:")
print(bag_of_words)
Это всего лишь несколько примеров создания представления «мешка слов» с помощью scikit-learn. Другие методы включают использование специальных методов токенизации, предварительной обработки и извлечения признаков.