# Source code for TextGraphics.Applications.summary

# -*- coding: utf-8 -*-
#	Python implementation of the LexRank algorithm.
#	Reference - LexRank: Graph-based Centrality as Salience in Text Summarization
#	Reference URL - http://tangra.si.umich.edu/~radev/lexrank/lexrank.pdf

#	Author - Janu Verma
#	email - jv367@cornell.edu
#	http://januverma.wordpress.com/
#	@januverma


import sys
import os
import operator
import networkx as nx
from TextGraphics.src.graph import TextGraph
from TextGraphics import Data




class LexRank:
    """
    Constructs a summary of the input document by extracting the most
    informative sentences.

    Arguments:
        directory - A corpus of text files to be summarized.
    """

    def __init__(self, directory):
        # TextGraph is expected to expose sentenceGraph(); see summary().
        self.graph = TextGraph(directory)

    def lexR(self, graph):
        """
        Compute the LexRank of the sentences.
        LexRank of a sentence in the sentence graph is the PageRank of the node
        representing the sentence. It is a measure of the importance and
        influence of the sentence in the corpus.

        Arguments:
            graph - A networkx graph or digraph.

        Returns:
            A dictionary of all the nodes with their PageRank scores
            (damping factor alpha = 0.85).
        """
        # pagerank_numpy was removed in NetworkX 3.0; keep using it where it
        # still exists and fall back to the maintained nx.pagerank otherwise.
        pagerank = getattr(nx, "pagerank_numpy", None)
        if pagerank is None:
            pagerank = nx.pagerank
        return pagerank(graph, alpha=0.85)

    def summary(self, compression=0.25):
        """
        Builds the summary based on the LexRank scores of the sentences.

        Arguments:
            compression : A number in [0,1] which is equal to the fraction of
            total sentences to be included in the summary. The resulting
            sentence count is truncated towards zero; values outside [0,1]
            are clamped so that at most every sentence (and at least none)
            is returned.
            Default value is 0.25

        Returns:
            Summary of the input document: the top-ranked sentences,
            highest score first, concatenated without any separator.
        """
        sentence_graph = self.graph.sentenceGraph()
        total_sentences = len(sentence_graph.nodes())

        # Clamp so an out-of-range compression can neither index past the end
        # of the ranking nor turn into a negative slice bound.
        n_sentences = int(total_sentences * compression)
        n_sentences = max(0, min(n_sentences, total_sentences))

        rankings = self.lexR(sentence_graph)
        # Plain dict iteration works on both Python 2 and 3 (iteritems() is
        # Python 2 only). sorted() is stable, so ties keep the dict's order.
        ranked_sentences = sorted(rankings, key=rankings.get, reverse=True)

        # NOTE(review): assumes graph nodes are the sentence strings
        # themselves, as the original string concatenation implied.
        return "".join(ranked_sentences[:n_sentences])
# Navigation
#
# Source code for TextGraphics.Applications.summary
#
# Quick search
#
# Navigation