Мартин Лютер Кинг: выдающийся деятель движения за гражданские права - Fcodenotes

Извлечение ключевых слов с использованием обработки естественного языка (NLP):

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
def extract_keywords(text):
    """Return the lowercased alphabetic, non-stopword tokens of ``text``.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not
    purely alphabetic (``str.isalpha``) or whose lowercase form is in NLTK's
    English stopword list are dropped. Order of appearance and duplicates
    are preserved.

    Args:
        text: English text to extract keywords from.

    Returns:
        A list of lowercase keyword strings.
    """
    stop_words = set(stopwords.words('english'))
    # No part-of-speech tagging: the tags were never used for filtering, so
    # tagging only added run time and a dependency on the NLTK tagger model.
    keywords = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            keywords.append(lowered)
    return keywords
# Example usage: extract keywords from one English sentence and print them.
# The stopwords "was" and "a" and the final period are filtered out.
text = "Martin Luther King was a prominent civil rights activist."
keywords = extract_keywords(text)
print(keywords)  # Output: ['martin', 'luther', 'king', 'prominent', 'civil', 'rights', 'activist']

Распознавание именованных сущностей (NER):

import spacy
# Loaded spaCy pipelines keyed by model name, so each model is read from disk
# at most once per process instead of on every call.
_NLP_MODELS = {}


def _load_nlp(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    nlp = _NLP_MODELS.get(model_name)
    if nlp is None:
        nlp = _NLP_MODELS[model_name] = spacy.load(model_name)
    return nlp


def extract_named_entities(text):
    """Return the text of every PERSON or ORG entity spaCy finds in ``text``.

    Uses the ``en_core_web_sm`` pipeline, which must be installed. The
    pipeline is cached after the first call because loading it is far more
    expensive than processing a short text.

    Args:
        text: English text to analyse.

    Returns:
        A list of entity strings in document order, restricted to entities
        labelled ``PERSON`` or ``ORG``.
    """
    nlp = _load_nlp('en_core_web_sm')
    doc = nlp(text)
    wanted_labels = ('PERSON', 'ORG')
    return [entity.text for entity in doc.ents if entity.label_ in wanted_labels]
# Example usage: pull PERSON/ORG entities out of one sentence and print them.
# NOTE(review): the exact entities depend on the installed spaCy model
# version; the output below is the expected result, not a guaranteed one.
text = "Martin Luther King Jr. was born on January 15, 1929."
named_entities = extract_named_entities(text)
print(named_entities)  # Output: ['Martin Luther King Jr.']

TF-IDF (частота термина — обратная частота документа):

from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords_tfidf(texts):
    """Return the vocabulary that a default ``TfidfVectorizer`` learns from ``texts``.

    Only the learned feature names are returned; the TF-IDF weights are not
    used, so the vectorizer is fitted without building the document-term
    matrix.

    Args:
        texts: An iterable of document strings.

    Returns:
        A list of feature-name strings, in the vectorizer's feature order.
    """
    vectorizer = TfidfVectorizer()
    vectorizer.fit(texts)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old releases.
    get_names_out = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names_out is None:
        return vectorizer.get_feature_names()
    # get_feature_names_out() returns a NumPy array; convert it so callers
    # keep receiving a plain list of str, as before.
    return get_names_out().tolist()
# Example usage: learn the TF-IDF vocabulary of two sentences and print it.
# The default tokenizer lowercases and keeps only tokens of two or more
# characters, so "a" is absent while stopwords such as "the" remain.
texts = [
   "Martin Luther King was a leader in the American civil rights movement.",
   "He is best known for his role in advancing civil rights using nonviolent civil disobedience."
]
keywords = extract_keywords_tfidf(texts)
print(keywords)  # Output: ['advancing', 'american', 'best', 'civil', 'disobedience', 'for', 'he', 'his', 'in', 'is', 'known', 'leader', 'luther', 'martin', 'movement', 'nonviolent', 'rights', 'role', 'the', 'using', 'was']