Source code for InfoR.VectorSpaceModels
# -*- coding: utf-8 -*-
# A search engine based on the Vector Space Model of information retrieval.
# Also has the option to do Latent Semantic Indexing of the term-document matrix.
# Author - Janu Verma
# email - jv367@cornell.edu
# http://januverma.wordpress.com/
# @januverma
import sys
from pydoc import help
import os
import numpy as np
try:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.preprocessing import Normalizer
except ImportError:
    print("Error : Requires scikit-learn.")
    sys.exit()
class VSM:
    """
    Implements a vector space search engine. Each document is represented by a vector in a high-dimensional
    vector space where there is a dimension corresponding to each unique word in the corpus.
    The entries of the vector are the frequencies or tf-idf scores of the terms.
    Latent Semantic Analysis (LSA) of the term-document matrix is performed by Singular Value Decomposition (SVD).
    A is the term-document matrix, stored with documents as rows and terms as columns.
    The entry a_ij of the matrix contains the tf-idf score of term j in document i.
    The SVD maps each document from the term space to the (lower-dimensional) concept space.
    """
    def __init__(self, directory):
        """
        Arguments:
            directory : Directory of documents to be searched.
        """
        self.corpus = os.listdir(directory)
        self.text = []
        for f in self.corpus:
            f = os.path.join(directory, f)
            with open(f) as doc:
                info = doc.read()
                self.text.append(info)
    def search(self, q, n_docs, tf_idf=False, LSA=False, n_comp=None):
        """
        Returns the documents which are most relevant to the query.
        Ranking is done by decreasing cosine similarity with the query.

        Arguments:
            String q : Search query.
            Integer n_docs : Number of matching documents retrieved.
            Boolean tf_idf : If True, the vector features will be tf-idf scores.
            Boolean LSA : If True, the vectors will be mapped to a lower-dimensional concept space.
            Integer n_comp : Number of components for the LSA, i.e. the dimension of the concept space.

        Returns:
            A list of length n_docs containing the documents most relevant to the search query.
            The list is sorted in descending order of relevance.
        """
        ## By default the vector entries are the raw frequencies of occurrence of the terms.
        vectorizer = CountVectorizer(min_df=0, stop_words='english')
        ## Vectors with entries as tf-idf scores.
        if tf_idf:
            vectorizer = TfidfVectorizer(min_df=0, stop_words='english')
        X = vectorizer.fit_transform(self.text)
        X = X.toarray()
        ## Vectorize the query with the same vocabulary.
        query = [q]
        query = vectorizer.transform(query)
        query = query.toarray()
        ## Reduce the vectors to an n_comp-dimensional concept space.
        if LSA and n_comp is not None:
            lsa = TruncatedSVD(n_components=n_comp)
            X = lsa.fit_transform(X)
            X = Normalizer(copy=False).fit_transform(X)
            query = lsa.transform(query)
        ## Rank documents by decreasing cosine similarity with the query.
        ranking = cosine_similarity(X, query)
        doc_id = np.argsort(ranking, axis=0)
        doc_id = doc_id[::-1]
        ranked_docs = [self.corpus[doc_id[i][0]] for i in range(n_docs)]
        return ranked_docs
    def help(self):
        """
        Description of the class and the methods.
        """
        return help(VSM)
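
# A minimal usage sketch, assuming a hypothetical directory "corpus/" of
# plain-text documents; the query string and parameter values are arbitrary
# illustrations, not part of the module itself.
if __name__ == "__main__":
    engine = VSM("corpus/")
    results = engine.search("singular value decomposition", n_docs=5,
                            tf_idf=True, LSA=True, n_comp=10)
    for name in results:
        print(name)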