Source code for InfoR.VectorSpaceModels
# -*- coding: utf-8 -*-
# A search engine based on the Vector Space Model of information retrieval.
# Also has the option to do Latent Semantic Indexing of the term-document matrix.
# Author - Janu Verma
# email - jv367@cornell.edu
# http://januverma.wordpress.com/
# @januverma
import sys
from pydoc import help
import os
import numpy as np
try:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.preprocessing import Normalizer
except ImportError:
    print("Error : Requires scikit-learn.")
    sys.exit()
class VSM:
    """
    Implements a vector space search engine. Each document is represented by a vector in a high-dimensional
    vector space where there is a dimension corresponding to each unique word in the corpus.
    The entries of the vector are the frequencies or tf-idf scores of the terms.
    Latent Semantic Analysis (LSA) of the term-document matrix is performed by Singular Value Decomposition (SVD).
    A is the term-document matrix, stored with documents as rows and terms as columns.
    The entry a_ij of the matrix contains the tf-idf score of term j in document i.
    The SVD maps each document from the term space to the (lower-dimensional) concept space.
    """
    def __init__(self, directory):
        """
        Arguments:
            directory : Directory of documents to be searched.
        """
        self.corpus = os.listdir(directory)
        self.text = []
        for f in self.corpus:
            f = os.path.join(directory, f)
            with open(f) as doc:
                info = doc.read()
                self.text.append(info)
    def search(self, q, n_docs, tf_idf=False, LSA=False, n_comp=None):
        """
        Returns the documents which are most relevant to the query.
        Ranking is done by decreasing cosine similarity with the query.

        Arguments:
            String q : Search query.
            Integer n_docs : Number of matching documents retrieved.
            Boolean tf_idf : If True, the vector features will be tf-idf scores.
            Boolean LSA : If True, the vectors will be mapped to a lower-dimensional concept space.
            Integer n_comp : Number of components for the LSA, i.e. the dimension of the concept space.

        Returns:
            A list of length n_docs containing the documents most relevant to the search query.
            The list is sorted in descending order of relevance.
        """
        ## By default the vector entries are the raw frequencies of occurrence of the terms.
        vectorizer = CountVectorizer(min_df=0, stop_words='english')
        ## Vectors with entries as tf-idf scores.
        if tf_idf:
            vectorizer = TfidfVectorizer(min_df=0, stop_words='english')
        X = vectorizer.fit_transform(self.text)
        X = X.toarray()
        ## Vectorize the query with the same vocabulary.
        query = [q]
        query = vectorizer.transform(query)
        query = query.toarray()
        ## Reduce the vectors to an n_comp-dimensional concept space.
        if LSA and n_comp is not None:
            lsa = TruncatedSVD(n_components=n_comp)
            X = lsa.fit_transform(X)
            X = Normalizer(copy=False).fit_transform(X)
            query = lsa.transform(query)
        ## Rank documents by decreasing cosine similarity with the query.
        ranking = cosine_similarity(X, query)
        doc_id = np.argsort(ranking, axis=0)
        doc_id = doc_id[::-1]
        ranked_docs = [self.corpus[doc_id[i][0]] for i in range(n_docs)]
        return ranked_docs
    def help(self):
        """
        Description of the class and the methods.
        """
        return help(VSM)
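
# A minimal usage sketch, assuming a hypothetical directory "corpus/" of
# plain-text documents; the query string and parameter values are arbitrary
# illustrations, not part of the module itself.
if __name__ == "__main__":
    engine = VSM("corpus/")
    results = engine.search("singular value decomposition", n_docs=5,
                            tf_idf=True, LSA=True, n_comp=10)
    for name in results:
        print(name)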