Source code for InfoR.LanguageModels
# -*- coding: utf-8 -*-
# A search engine based on a probabilistic language model approach to information retrieval.
# Author - Janu Verma
# email - jv367@cornell.edu
# http://januverma.wordpress.com/
# @januverma
import os
import operator
from collections import defaultdict
from math import log
class LanguageModel:
    """
    Implements language models for information retrieval.
    Each document in the corpus defines a language model, and
    we compute the probability that the query was generated by
    that model.
    """
    def __init__(self, directory):
        """
        Arguments:
            directory : Directory of documents to be searched.
        """
        self.corpus = os.listdir(directory)
        self.text = {}
        for f in self.corpus:
            f = os.path.join(directory, f)
            with open(f) as doc:
                self.text[f] = doc.read()
    def words(self, document):
        """
        All the words in a document.
        Arguments:
            document : A textual document.
        Returns:
            A list containing all the words in the document.
        """
        words = document.split()
        words = [x.lower() for x in words]
        # Keep only tokens of length >= 2 that are not pure digits.
        words = [x for x in words if len(x) >= 2 and not x.isdigit()]
        return words
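    # An illustrative example of the tokenizer above (hypothetical input):
    #   words("The 2 cats, 42 cats!") -> ["the", "cats,", "cats!"]
    # Note that punctuation is kept; only short tokens and pure digits are dropped.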
    def word_freq(self, wordlist):
        """
        Build a dictionary of words with the frequencies of their occurrence in the document.
        Arguments:
            wordlist : A list of all the words in a document.
        Returns:
            A dictionary mapping each word in the document to its frequency.
        """
        wordFreq = defaultdict(int)
        for w in wordlist:
            wordFreq[w] += 1
        return wordFreq
    def vocabulary(self):
        """
        All the words in the corpus.
        Returns:
            A list of all the words in the corpus.
        """
        allWords = []
        for d in self.text.values():
            allWords.extend(self.words(d))
        return allWords
    def wordDict(self):
        """
        Compute frequencies of occurrence of the words in the corpus.
        Returns:
            A dictionary mapping each word in the corpus to the frequency
            of its occurrence in the whole corpus.
        """
        allWords = self.vocabulary()
        return self.word_freq(allWords)
    def document_logScore(self, document, query):
        """
        Compute the log probability of the query being generated by the given document.
        Arguments:
            document : A textual document.
            query : The search query.
        Returns:
            A float: the log score of the document for the query.
        """
        docWords = self.words(document)
        docWordFrequency = self.word_freq(docWords)
        corpusVocabulary = self.wordDict()
        normalizingFactor = len(self.vocabulary())
        logProb = 0.0
        queryWords = self.words(query)
        for q in queryWords:
            # .get() replaces the bare try/except and, unlike indexing,
            # does not insert missing keys into the defaultdicts.
            qFreq = docWordFrequency.get(q, 0)
            qCount = corpusVocabulary.get(q, 0)
            # Add-one smoothing keeps the probability nonzero for unseen terms.
            alpha = float(qFreq + 1) / float(qCount + normalizingFactor)
            logProb += log(alpha)
        return logProb
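    # A worked example of the smoothed per-term score above (illustrative
    # numbers, not from any real corpus): if a query word occurs qFreq = 2
    # times in the document, qCount = 5 times in the corpus, and the corpus
    # contains normalizingFactor = 1000 words in total, then
    #   alpha = (2 + 1) / (5 + 1000) = 3 / 1005 ~ 0.002985
    #   log(alpha) ~ -5.814
    # and each query word contributes one such term to the document's log score.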
    def logScoreDict(self, query):
        """
        Compute the log probability of the query for all the documents.
        Arguments:
            query : The search query.
        Returns:
            A dictionary of all the documents in the corpus with their corresponding log scores.
        """
        rankingDict = defaultdict(float)
        for d, docText in self.text.items():
            rankingDict[d] = self.document_logScore(docText, query)
        return rankingDict
    def search(self, query, n_docs):
        """
        Returns the documents most relevant to the query.
        Ranking is by decreasing log probability of the query being generated by the document.
        Arguments:
            query : The search query.
            n_docs : Number of matching documents to retrieve.
        Returns:
            A list of up to n_docs documents most relevant to the search query,
            sorted in descending order of relevance.
        """
        rankings = self.logScoreDict(query)
        # items() replaces the Python 2-only iteritems(); slicing below also
        # avoids an IndexError when n_docs exceeds the corpus size.
        rankings = sorted(rankings.items(), key=operator.itemgetter(1), reverse=True)
        relevantDocs = [doc for doc, score in rankings[:n_docs]]
        return relevantDocs
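# A minimal usage sketch, assuming a directory of plain-text files; the path
# "./docs" and the query string are placeholders, not part of the original module.
if __name__ == "__main__":
    lm = LanguageModel("./docs")
    # Retrieve the five documents most likely to have generated the query.
    for doc in lm.search("probabilistic language models", n_docs=5):
        print(doc)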