Метод 1: анализ текста и извлечение ключевых слов
Пример кода:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def extract_keywords(text, num_keywords):
    """Return the most frequent non-stopword tokens in *text*.

    The text is split with NLTK's ``word_tokenize``. English stopwords
    (matched case-insensitively) and tokens that contain no letter or digit
    (for example "." or ",") are discarded, and the remaining tokens are
    ranked by how often they occur.

    NOTE: the NLTK tokenizer models and the stopword corpus must already be
    available locally (typically installed via ``nltk.download``).

    Args:
        text: English text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``num_keywords`` tokens, most frequent first.
        Tokens keep their original casing, so "Cup" and "cup" are counted
        as different keywords.
    """
    stop_words = set(stopwords.words('english'))
    candidates = [
        token
        for token in word_tokenize(text)
        # Pure-punctuation tokens would otherwise be ranked as keywords.
        if token.lower() not in stop_words
        and any(char.isalnum() for char in token)
    ]
    freq_dist = nltk.FreqDist(candidates)
    return [token for token, _count in freq_dist.most_common(num_keywords)]
# Example usage: ask for the five most frequent keywords of a sample sentence.
text = "FIFA World Cup 2022 is an upcoming international soccer tournament."
num_keywords = 5
keywords = extract_keywords(text=text, num_keywords=num_keywords)
print(keywords)
Вывод:
['FIFA', 'World', 'Cup', '2022', 'upcoming']
Метод 2: TF-IDF (частота термина — обратная частота документа)
Пример кода:
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords_tfidf(text, num_keywords):
    """Return the vocabulary terms kept by a ``TfidfVectorizer`` for *text*.

    A vectorizer limited to ``num_keywords`` features is fitted on a corpus
    consisting of the single document *text*, and the feature names it
    retained are returned.

    Args:
        text: Text to analyse.
        num_keywords: Passed to the vectorizer as ``max_features``, i.e. the
            maximum number of terms to keep.

    Returns:
        A list of the retained terms as strings.
    """
    vectorizer = TfidfVectorizer(max_features=num_keywords)
    # Only the fitted vocabulary is needed, so the TF-IDF matrix is not kept.
    vectorizer.fit([text])
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        # The newer API returns a NumPy array; callers expect a plain list.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()
# Example usage: keep at most five TF-IDF vocabulary terms of a sample sentence.
text = "FIFA World Cup 2022 is an upcoming international soccer tournament."
num_keywords = 5
keywords = extract_keywords_tfidf(text=text, num_keywords=num_keywords)
print(keywords)
Вывод:
['cup', 'fifa', 'international', 'soccer', 'tournament']
Метод 3: извлечение именных групп (noun phrases)
Пример кода:
import functools

import spacy
@functools.lru_cache(maxsize=None)
def _load_english_model():
    """Load spaCy's ``en_core_web_sm`` pipeline once and reuse it afterwards."""
    return spacy.load('en_core_web_sm')


def extract_noun_phrases(text, num_phrases):
    """Return the first *num_phrases* noun chunks found in *text*.

    The text is parsed with spaCy's ``en_core_web_sm`` pipeline and the text
    of each noun chunk is collected in document order.

    Args:
        text: English text to parse.
        num_phrases: Number of leading noun chunks to keep; fewer are
            returned when the text contains fewer chunks.

    Returns:
        A list of noun-phrase strings.
    """
    # Loading the model is expensive, so it is cached instead of being
    # reloaded on every call.
    doc = _load_english_model()(text)
    return [chunk.text for chunk in doc.noun_chunks][:num_phrases]
# Example usage: request up to three noun phrases from a sample sentence.
text = "FIFA World Cup 2022 is an upcoming international soccer tournament."
num_phrases = 3
noun_phrases = extract_noun_phrases(text=text, num_phrases=num_phrases)
print(noun_phrases)
Вывод:
['FIFA World Cup', 'an upcoming international soccer tournament']