Чемпионат мира по футболу 2022 года: предстоящий международный футбольный турнир

Метод 1: анализ текста и извлечение ключевых слов
Пример кода:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def extract_keywords(text, num_keywords):
    """Return the most frequent non-stopword tokens of ``text``.

    The text is split with NLTK's ``word_tokenize``. English stopwords
    (matched case-insensitively) and tokens that contain no letter or digit
    (for example ``.`` or ``,``) are discarded before counting.

    NOTE: NLTK needs its tokenizer models and the stopwords corpus to be
    installed (see ``nltk.download``) for this function to run.

    Args:
        text: The text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``num_keywords`` tokens, most frequent first.
        Tokens keep their original casing, so differently-cased spellings
        of a word are counted separately.
    """
    stop_words = set(stopwords.words('english'))

    # Punctuation-only tokens would otherwise be counted like words and can
    # outrank real keywords in any text longer than a sentence or two.
    candidates = [
        token
        for token in word_tokenize(text)
        if token.lower() not in stop_words
        and any(ch.isalnum() for ch in token)
    ]

    freq_dist = nltk.FreqDist(candidates)
    return [token for token, _ in freq_dist.most_common(num_keywords)]
# Demo: print the five most frequent keywords of a sample sentence
text, num_keywords = (
    "FIFA World Cup 2022 is an upcoming international soccer tournament.",
    5,
)
keywords = extract_keywords(text=text, num_keywords=num_keywords)
print(keywords)

Вывод:

['FIFA', 'World', 'Cup', '2022', 'upcoming']

Метод 2: TF-IDF (частота термина — обратная частота документа)
Пример кода:

from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords_tfidf(text, num_keywords):
    """Return the vocabulary a ``TfidfVectorizer`` keeps for ``text``.

    A vectorizer limited to ``num_keywords`` features is fitted on a corpus
    consisting of the single document ``text``; the terms that remain in its
    vocabulary are returned.

    Args:
        text: The document to analyse.
        num_keywords: Passed to the vectorizer as ``max_features``, i.e. the
            maximum vocabulary size.

    Returns:
        A list of at most ``num_keywords`` term strings. The order is that
        of the vectorizer's vocabulary, not a ranking by TF-IDF score.
    """
    vectorizer = TfidfVectorizer(max_features=num_keywords)

    # Only the fitted vocabulary is needed; the TF-IDF matrix itself is not.
    vectorizer.fit([text])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        # The new API returns an ndarray; keep returning a plain list of str.
        return [str(name) for name in vectorizer.get_feature_names_out()]
    return vectorizer.get_feature_names()
# Demo: print up to five vocabulary terms chosen by the TF-IDF vectorizer
text, num_keywords = (
    "FIFA World Cup 2022 is an upcoming international soccer tournament.",
    5,
)
keywords = extract_keywords_tfidf(text=text, num_keywords=num_keywords)
print(keywords)

Вывод:

['cup', 'fifa', 'international', 'soccer', 'tournament']

Метод 3: извлечение именных групп
Пример кода:

import spacy
def extract_noun_phrases(text, num_phrases):
    """Return the first ``num_phrases`` noun chunks spaCy finds in ``text``.

    The ``en_core_web_sm`` pipeline is loaded on every call, the text is
    parsed with it, and the text of each noun chunk is collected in document
    order before the list is truncated.

    Args:
        text: The text to parse.
        num_phrases: Upper bound used to slice the list of noun chunks.

    Returns:
        A list of noun-chunk strings, sliced with ``[:num_phrases]``.
    """
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(text)

    phrases = []
    for chunk in parsed.noun_chunks:
        phrases.append(chunk.text)

    return phrases[:num_phrases]
# Demo: print up to three noun phrases found in a sample sentence
text, num_phrases = (
    "FIFA World Cup 2022 is an upcoming international soccer tournament.",
    3,
)
noun_phrases = extract_noun_phrases(text=text, num_phrases=num_phrases)
print(noun_phrases)

Вывод:

['FIFA World Cup', 'an upcoming international soccer tournament']