Реализация TextRank на Python для экстрактивного суммирования

TextRank — популярный алгоритм экстрактивного суммирования текста и извлечения ключевых слов. Он основан на алгоритме PageRank, который поисковые системы используют для ранжирования веб-страниц. Ниже приведена реализация TextRank на Python с использованием библиотек Natural Language Toolkit (NLTK) и scikit-learn:

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data this script relies on: tokenizer models, the stop-word
# list and the WordNet database used by the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer tables from 'punkt_tab'
# rather than 'punkt'; request both so tokenization works on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
def textrank(text, num_sentences=3):
    """Build an extractive summary of ``text`` from its best-scoring sentences.

    Each sentence is tokenized, lower-cased, stripped of English stop words and
    lemmatized. The normalized sentences are converted to TF-IDF vectors,
    compared pairwise with cosine similarity, and the resulting similarity
    matrix is scored by ``pagerank``.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep in the summary.

    Returns:
        The selected original sentences joined by single spaces, highest score
        first. An empty string is returned when ``text`` contains no sentences
        or ``num_sentences`` is not positive.
    """
    if num_sentences <= 0:
        return ''

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        return ''

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Build one normalized document per sentence so that the stop-word removal
    # and lemmatization actually influence the similarity computation.
    documents = []
    for sentence in sentences:
        tokens = (token.lower() for token in word_tokenize(sentence))
        documents.append(' '.join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        ))

    # Calculate TF-IDF scores
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # Raised when no sentence contains a usable term (empty vocabulary),
        # e.g. the text is only stop words or punctuation. Ranking is
        # impossible then, so fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    # Calculate cosine similarity matrix and rank the sentences
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    scores = pagerank(similarity_matrix)

    # Sort by score (ties fall back to comparing the sentence text)
    ranked_sentences = sorted(zip(scores, sentences), reverse=True)

    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])
def pagerank(similarity_matrix, damping=0.85, max_iterations=100, epsilon=1e-4):
    """Score the nodes of a weighted similarity graph with the TextRank update.

    Every node starts with a score of 1.0. On each sweep a node's score becomes
    ``(1 - damping) + damping * sum(share_j * score_j)``, where ``share_j`` is
    the weight of the edge from node ``j`` divided by the total weight of all
    edges leaving ``j``. Normalizing the outgoing weights keeps the iteration
    bounded; using the raw similarities (including each sentence's similarity
    to itself) lets the scores grow without limit.

    Args:
        similarity_matrix: Square 2-D array indexable as ``matrix[i, j]`` and
            exposing ``shape``; entry ``[i, j]`` is the edge weight between
            nodes ``i`` and ``j``. The diagonal is ignored.
        damping: Damping factor of the update.
        max_iterations: Upper bound on the number of sweeps.
        epsilon: Stop once no score changes by this much or more in a sweep.

    Returns:
        A list with one float score per node (empty for an empty matrix).
    """
    num_sentences = similarity_matrix.shape[0]
    if num_sentences == 0:
        return []

    # Edge weights with self-loops removed: a sentence must not vote for itself.
    weights = [
        [0.0 if i == j else float(similarity_matrix[i, j])
         for j in range(num_sentences)]
        for i in range(num_sentences)
    ]
    out_totals = [sum(row) for row in weights]

    # incoming[i][j] is the fraction of node j's score that flows to node i.
    # Nodes without outgoing weight pass nothing on.
    incoming = [
        [weights[j][i] / out_totals[j] if out_totals[j] > 0 else 0.0
         for j in range(num_sentences)]
        for i in range(num_sentences)
    ]

    scores = [1.0] * num_sentences
    for _ in range(max_iterations):
        largest_change = 0.0
        # Scores are updated in place, so later nodes in the same sweep
        # already see the refreshed values of earlier ones.
        for i, shares in enumerate(incoming):
            updated = (1 - damping) + damping * sum(
                share * score for share, score in zip(shares, scores)
            )
            largest_change = max(largest_change, abs(updated - scores[i]))
            scores[i] = updated
        if largest_change < epsilon:
            break

    return scores
def has_converged(scores, old_scores, epsilon):
    """Return True when no score differs from its previous value by ``epsilon`` or more.

    Args:
        scores: Current scores.
        old_scores: Scores from the previous iteration, compared pairwise with
            ``scores``; callers are expected to pass sequences of equal length.
        epsilon: Largest absolute change that still counts as converged
            (exclusive).

    Returns:
        True if the largest absolute pairwise difference is below ``epsilon``.
        Empty inputs have nothing left to change and count as converged.
    """
    largest_change = max(
        (abs(new - old) for new, old in zip(scores, old_scores)),
        default=0.0,
    )
    return largest_change < epsilon
# Demo: summarize a short description of TextRank down to its two
# highest-ranked sentences and print the result.
text = (
    "TextRank is a graph-based algorithm that extracts key phrases and sentences from text. "
    "It works by treating sentences as nodes in a graph, and edges between the nodes represent the strength of their semantic similarity. "
    "The algorithm then calculates the importance of each sentence using iterative calculations similar to the PageRank algorithm. "
    "Finally, the top-ranked sentences are selected as the summary of the text."
)
summary = textrank(text, num_sentences=2)
print(summary)

Этот код реализует алгоритм TextRank для экстрактивного суммирования. Он принимает на вход фрагмент текста и возвращает сводку, состоящую из предложений с наивысшим рейтингом.