Source code for InfoR.LanguageModels
# -*- coding: utf-8 -*-
# A search engine based on a probabilistic language model approach to information retrieval.
# Author - Janu Verma
# email - jv367@cornell.edu
# http://januverma.wordpress.com/
# @januverma
import os
import operator
from collections import defaultdict
from math import log
class LanguageModel:
    """
    Implements language models for information retrieval.
    Each document in the corpus defines a language model, and
    we compute the probability that the query was generated by
    that model.
    """
    def __init__(self, directory):
        """
        Arguments:
            directory : Directory of documents to be searched.
        """
        self.corpus = os.listdir(directory)
        self.text = {}
        for f in self.corpus:
            f = os.path.join(directory, f)
            with open(f) as doc:
                self.text[f] = doc.read()
    def words(self, document):
        """
        All the words in a document.
        Arguments:
            document : A textual document.
        Returns:
            A list containing all the words in the document.
        """
        words = document.split()
        words = [x.lower() for x in words]
        # Keep only tokens of length >= 2 that are not pure digits.
        words = [x for x in words if len(x) >= 2 and not x.isdigit()]
        return words
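    # An illustrative example of the tokenizer above (hypothetical input):
    #   words("The 2 cats, 42 cats!") -> ["the", "cats,", "cats!"]
    # Note that punctuation is kept; only short tokens and pure digits are dropped.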
    def word_freq(self, wordlist):
        """
        Build a dictionary of words with the frequencies of their occurrence in the document.
        Arguments:
            wordlist : A list of all the words in a document.
        Returns:
            A dictionary mapping each word in the document to its frequency.
        """
        wordFreq = defaultdict(int)
        for w in wordlist:
            wordFreq[w] += 1
        return wordFreq
    def vocabulary(self):
        """
        All the words in the corpus.
        Returns:
            A list of all the words in the corpus.
        """
        allWords = []
        for d in self.text.values():
            allWords.extend(self.words(d))
        return allWords
    def wordDict(self):
        """
        Compute frequencies of occurrence of the words in the corpus.
        Returns:
            A dictionary mapping each word in the corpus to the frequency
            of its occurrence in the whole corpus.
        """
        allWords = self.vocabulary()
        return self.word_freq(allWords)
    def document_logScore(self, document, query):
        """
        Compute the log probability of the query being generated by the given document.
        Arguments:
            document : A textual document.
            query : The search query.
        Returns:
            A float: the log score of the document for the query.
        """
        docWords = self.words(document)
        docWordFrequency = self.word_freq(docWords)
        corpusVocabulary = self.wordDict()
        normalizingFactor = len(self.vocabulary())
        logProb = 0.0
        queryWords = self.words(query)
        for q in queryWords:
            # .get() replaces the bare try/except and, unlike indexing,
            # does not insert missing keys into the defaultdicts.
            qFreq = docWordFrequency.get(q, 0)
            qCount = corpusVocabulary.get(q, 0)
            # Add-one smoothing keeps the probability nonzero for unseen terms.
            alpha = float(qFreq + 1) / float(qCount + normalizingFactor)
            logProb += log(alpha)
        return logProb
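    # A worked example of the smoothed per-term score above (illustrative
    # numbers, not from any real corpus): if a query word occurs qFreq = 2
    # times in the document, qCount = 5 times in the corpus, and the corpus
    # contains normalizingFactor = 1000 words in total, then
    #   alpha = (2 + 1) / (5 + 1000) = 3 / 1005 ~ 0.002985
    #   log(alpha) ~ -5.814
    # and each query word contributes one such term to the document's log score.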
    def logScoreDict(self, query):
        """
        Compute the log probability of the query for all the documents.
        Arguments:
            query : The search query.
        Returns:
            A dictionary of all the documents in the corpus with their corresponding log scores.
        """
        rankingDict = defaultdict(float)
        for d, docText in self.text.items():
            rankingDict[d] = self.document_logScore(docText, query)
        return rankingDict
    def search(self, query, n_docs):
        """
        Returns the documents most relevant to the query.
        Ranking is by decreasing log probability of the query being generated by the document.
        Arguments:
            query : The search query.
            n_docs : Number of matching documents to retrieve.
        Returns:
            A list of up to n_docs documents most relevant to the search query,
            sorted in descending order of relevance.
        """
        rankings = self.logScoreDict(query)
        # items() replaces the Python 2-only iteritems(); slicing below also
        # avoids an IndexError when n_docs exceeds the corpus size.
        rankings = sorted(rankings.items(), key=operator.itemgetter(1), reverse=True)
        relevantDocs = [doc for doc, score in rankings[:n_docs]]
        return relevantDocs
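# A minimal usage sketch, assuming a directory of plain-text files; the path
# "./docs" and the query string are placeholders, not part of the original module.
if __name__ == "__main__":
    lm = LanguageModel("./docs")
    # Retrieve the five documents most likely to have generated the query.
    for doc in lm.search("probabilistic language models", n_docs=5):
        print(doc)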