# Source code for TextGraphics.Applications.summary

# -*- coding: utf-8 -*-
#	Python implementation of the LexRank algorithm.
#	Reference - LexRank: Graph-based Centrality as Salience in Text Summarization
#	Reference URL - http://tangra.si.umich.edu/~radev/lexrank/lexrank.pdf

#	Author - Janu Verma
#	email - jv367@cornell.edu
#	http://januverma.wordpress.com/
#	@januverma


import sys
import os
import operator
import networkx as nx
from TextGraphics.src.graph import TextGraph
from TextGraphics import Data




class LexRank:
    """
    Constructs a summary of the input document by extracting
    most informative sentences.

    Arguments:
        directory - A corpus of text files to be summarized.
    """

    def __init__(self, directory):
        # TextGraph builds the graph representation of the corpus; the
        # sentence graph itself is only requested later, in summary().
        self.graph = TextGraph(directory)

    def lexR(self, graph):
        """
        Compute the LexRank of the sentences.

        LexRank of a sentence in the sentence graph is the PageRank of
        the node representing the sentence. It is a measure of the
        importance and influence of the sentence in the corpus.

        Arguments:
            graph - A networkx graph or digraph.

        Returns:
            A dictionary of all the nodes with their PageRank scores.
        """
        # pagerank_numpy was deprecated in networkx 2.6 and removed in 3.0.
        # Keep using it where it still exists (identical results on old
        # installs) and fall back to the maintained nx.pagerank otherwise.
        pagerank = getattr(nx, "pagerank_numpy", None)
        if pagerank is None:
            pagerank = nx.pagerank
        return pagerank(graph, alpha=0.85)

    def summary(self, compression=0.25):
        """
        Builds the summary based on the LexRank scores of the sentences.

        Arguments:
            compression : A number in [0,1] which is equal to the
                    fraction of total sentences to be included in the
                    summary. Values outside that range are clamped, so
                    at most every sentence (and at least none) is used.
                    Default value is 0.25

        Returns:
            Summary of the input document: the selected sentences,
            highest score first, concatenated without a separator.
        """
        g = self.graph.sentenceGraph()
        total_sentences = len(g.nodes())

        # Truncate towards zero like the original, then clamp so that an
        # out-of-range compression cannot index past the ranked list or
        # turn into a negative slice bound.
        n_sentences = int(total_sentences * compression)
        n_sentences = max(0, min(total_sentences, n_sentences))

        rankings = self.lexR(g)
        # dict.items() works on both Python 2 and 3 (iteritems() is
        # Python 2 only). sorted() is stable, so ties keep the order in
        # which the scores dictionary yields them.
        ranked_sentences = sorted(
            rankings.items(), key=operator.itemgetter(1), reverse=True
        )

        top_sentences = ranked_sentences[:n_sentences]
        return "".join(sentence for sentence, _score in top_sentences)