from sklearn.feature_extraction.text import CountVectorizer
# Sample corpus: five short titles that share a common topic.
titles = [
    "How to Train Your Dog",
    "Tips for Dog Training",
    "Dog Training Basics",
    "Advanced Dog Training Techniques",
    "Effective Dog Training Methods",
]

# Bag-of-words: learn the vocabulary first, then encode every title as a
# sparse row of raw token counts (one column per vocabulary term).
vectorizer = CountVectorizer()
vectorizer.fit(titles)
title_vectors = vectorizer.transform(titles)

# Next step (not shown here): run a clustering algorithm on title_vectors
# (e.g., K-means, DBSCAN) to group similar titles together.
from gensim.models import Word2Vec
titles = [
    "How to Train Your Dog",
    "Tips for Dog Training",
    "Dog Training Basics",
    "Advanced Dog Training Techniques",
    "Effective Dog Training Methods",
]

# Whitespace-tokenise each title once; the same token lists are used both
# for training and for the similarity look-ups below.
tokenized_titles = [title.split() for title in titles]

# Train a Word2Vec model on the titles themselves. min_count=1 keeps every
# token in the vocabulary, which the look-ups below rely on.
# NOTE: five short titles are far too little data for meaningful
# embeddings; for real use, train on a large corpus or load pretrained
# vectors instead.
model = Word2Vec(tokenized_titles, min_count=1)

# Calculate similarity between titles. The model's vocabulary holds
# individual words, not whole titles, so looking a full title up as a key
# raises KeyError. n_similarity instead compares two token lists via the
# cosine similarity of their mean word vectors.
similarity_matrix = [
    [model.wv.n_similarity(tokens1, tokens2) for tokens2 in tokenized_titles]
    for tokens1 in tokenized_titles
]
# Apply clustering algorithms to the similarity matrix
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample corpus: five short titles that share a common topic.
titles = [
    "How to Train Your Dog",
    "Tips for Dog Training",
    "Dog Training Basics",
    "Advanced Dog Training Techniques",
    "Effective Dog Training Methods",
]

# TF-IDF: fit the vocabulary and IDF weights on the titles, then encode
# each title as a sparse row of TF-IDF weights.
vectorizer = TfidfVectorizer().fit(titles)
title_vectors = vectorizer.transform(titles)

# Next step (not shown here): apply a clustering algorithm to
# title_vectors to group similar titles together.
Метод 4. Иерархическая кластеризация.
Иерархическая кластеризация строит иерархию кластеров, последовательно объединяя или разделяя их. Это особенно полезно, когда количество кластеров заранее неизвестно. Ниже приведён пример на Python с использованием библиотеки SciPy:
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

titles = [
    "How to Train Your Dog",
    "Tips for Dog Training",
    "Dog Training Basics",
    "Advanced Dog Training Techniques",
    "Effective Dog Training Methods",
]

# Convert titles to a numerical representation. TF-IDF is used here; it is
# computed inside this snippet so the example does not depend on a
# title_vectors variable left over from earlier code.
title_vectors = TfidfVectorizer().fit_transform(titles)

# Compute the linkage matrix. linkage() expects a dense array of
# observations, hence toarray(); Ward linkage works on Euclidean distances.
linkage_matrix = linkage(title_vectors.toarray(), method='ward')

# Plot the dendrogram with one leaf per title, growing left to right so the
# title labels stay readable.
dendrogram(linkage_matrix, labels=titles, orientation='right')
plt.show()