«KGF 2 стал вторым по кассовым сборам фильмом».
Метод 1: использование библиотек обработки естественного языка
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
def extract_keywords(text: str, num_keywords: int) -> list:
    """Return the most frequent stemmed, non-stopword tokens of ``text``.

    The text is tokenized with NLTK, English stopwords and punctuation-only
    tokens are discarded, the remaining tokens are stemmed with the Porter
    stemmer, and the stems are ranked by frequency.

    Note: the NLTK tokenizer models and the ``stopwords`` corpus must already
    be available locally (see ``nltk.download``).

    Args:
        text: English text to extract keywords from.
        num_keywords: Number of top-ranked stems to return.

    Returns:
        A list of stems ordered from most to least frequent. Because the
        tokens are stemmed, entries may not be dictionary words
        (e.g. "became" -> "becam").
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stemmed_tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        # The tokenizer emits punctuation such as "." and "," as separate
        # tokens; without this check they would be counted as keywords.
        # Hyphenated words and numbers contain alphanumerics and are kept.
        if word.casefold() not in stop_words
        and any(char.isalnum() for char in word)
    ]

    word_frequency = nltk.FreqDist(stemmed_tokens)
    return [word for word, _ in word_frequency.most_common(num_keywords)]
# Demo: run the frequency-based extractor on a sample sentence
# and print the five top-ranked stems.
text = "KGF 2 became the second highest-grossing film."
num_keywords = 5
keywords = extract_keywords(text=text, num_keywords=num_keywords)
print("Keywords:", keywords)
Метод 2: использование TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords(text: str, num_keywords: int) -> list:
    """Return the vocabulary terms a TfidfVectorizer learns from ``text``.

    The vectorizer is fitted on ``text`` as a single-document corpus with
    ``max_features=num_keywords``, and the resulting feature names are
    returned.

    Args:
        text: Text to extract keywords from.
        num_keywords: Passed to ``TfidfVectorizer`` as ``max_features`` to
            cap the vocabulary size.

    Returns:
        A list of the feature names (terms) kept by the vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=num_keywords)
    # Only the learned vocabulary is needed; the TF-IDF matrix was never used.
    vectorizer.fit([text])

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on old releases that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        # .tolist() converts the ndarray to a plain list of str, matching
        # the return type of the legacy method.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()
# Demo: run the TF-IDF based extractor on a sample sentence
# with the vocabulary capped at five terms, then print the result.
text = "KGF 2 became the second highest-grossing film."
num_keywords = 5
keywords = extract_keywords(text=text, num_keywords=num_keywords)
print("Keywords:", keywords)
Метод 3: использование RAKE (Rapid Automatic Keyword Extraction — быстрое автоматическое извлечение ключевых слов)
from rake_nltk import Rake
def extract_keywords(text, num_keywords):
    """Return the leading RAKE key phrases extracted from ``text``.

    Args:
        text: Text to extract key phrases from.
        num_keywords: Number of leading phrases to keep (applied as a slice).

    Returns:
        The first ``num_keywords`` phrases, in the order produced by
        ``Rake.get_ranked_phrases()``.
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)
    ranked_phrases = rake.get_ranked_phrases()
    return ranked_phrases[:num_keywords]
# Demo: run the RAKE-based extractor on a sample sentence
# and print at most five key phrases.
text = "KGF 2 became the second highest-grossing film."
num_keywords = 5
keywords = extract_keywords(text=text, num_keywords=num_keywords)
print("Keywords:", keywords)