-
Извлечение ключевых слов с использованием обработки естественного языка (NLP):
# NOTE: pos_tag is no longer used by extract_keywords; the import is kept
# in case other code in this file relies on it.
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords


def extract_keywords(text):
    """Return the lowercased alphabetic, non-stopword tokens of *text*.

    The text is split with NLTK's ``word_tokenize``; tokens that are not
    purely alphabetic (punctuation, numbers, mixed tokens) or that appear in
    NLTK's English stopword list are discarded. Token order is preserved and
    duplicates are kept.

    The NLTK tokenizer and stopwords data must already be installed
    (see ``nltk.download``).

    Args:
        text: The input string to analyse.

    Returns:
        A list of lowercase keyword strings, in order of appearance.
    """
    # A set gives O(1) membership tests while filtering.
    stop_words = set(stopwords.words('english'))

    # Part-of-speech tagging was previously run here, but the tags were never
    # consulted, so it only added cost and an extra model dependency.
    lowered = (
        token.lower() for token in word_tokenize(text) if token.isalpha()
    )
    return [word for word in lowered if word not in stop_words]


# Example usage
text = "Martin Luther King was a prominent civil rights activist."
keywords = extract_keywords(text)
print(keywords)
# Output: ['martin', 'luther', 'king', 'prominent', 'civil', 'rights', 'activist']
Распознавание именованных сущностей (NER):
from functools import lru_cache

import spacy

# Entity labels that extract_named_entities keeps (people and organisations).
_ENTITY_LABELS = frozenset({'PERSON', 'ORG'})


@lru_cache(maxsize=None)
def _load_model(name):
    """Load the spaCy pipeline *name* once and reuse it on later calls."""
    return spacy.load(name)


def extract_named_entities(text):
    """Return the text of every PERSON or ORG entity spaCy finds in *text*.

    The ``en_core_web_sm`` pipeline is loaded on first use and cached, so
    repeated calls do not reload the model each time.

    Args:
        text: The input string to analyse.

    Returns:
        A list of entity strings, in the order they occur in the document.
        Entities with any other label (dates, for instance) are skipped.
    """
    nlp = _load_model('en_core_web_sm')
    doc = nlp(text)
    return [
        entity.text for entity in doc.ents if entity.label_ in _ENTITY_LABELS
    ]


# Example usage
text = "Martin Luther King Jr. was born on January 15, 1929."
named_entities = extract_named_entities(text)
print(named_entities)
# Output: ['Martin Luther King Jr.']
TF-IDF (частота термина — обратная частота документа):
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords_tfidf(texts):
    """Return the vocabulary a ``TfidfVectorizer`` learns from *texts*.

    Note that this returns every term the vectorizer extracted, in the
    vectorizer's own feature order (alphabetical in the example below); the
    TF-IDF weights themselves are not used to rank or filter the terms.

    Args:
        texts: An iterable of document strings.

    Returns:
        A list of the feature names (terms) found across all documents.
    """
    vectorizer = TfidfVectorizer()
    # Only the learned vocabulary is needed, so fitting is sufficient; the
    # TF-IDF matrix itself is never used.
    vectorizer.fit(texts)

    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        # Older scikit-learn releases only provide get_feature_names();
        # newer ones removed it in favour of get_feature_names_out().
        feature_names = vectorizer.get_feature_names()

    # get_feature_names_out() yields an array; callers expect a plain list.
    return list(feature_names)


# Example usage
texts = [
    "Martin Luther King was a leader in the American civil rights movement.",
    "He is best known for his role in advancing civil rights using nonviolent civil disobedience.",
]
keywords = extract_keywords_tfidf(texts)
print(keywords)
# Output: ['advancing', 'american', 'best', 'civil', 'disobedience', 'for',
#          'he', 'his', 'in', 'is', 'king', 'known', 'leader', 'luther',
#          'martin', 'movement', 'nonviolent', 'rights', 'role', 'the',
#          'using', 'was']