Source code for tethne.readers.mallet

"""
Reader for output from topic modeling with MALLET.
"""

import csv
import numpy as np
from tethne.data import LDAModel, Paper
from tethne.utilities import Dictionary


[docs]def load(top_doc, word_top, topic_keys, Z, metadata=None, metadata_key='doi'): """ Parse results from LDA modeling with MALLET. MALLET's LDA topic modeling algorithm produces a collection of output files. :func:`.read` takes the topic-document and (sparse) word-topic matrices, as tab-separated value files, along with a metadata file that maps each MALLET document id to a :class:`.Paper`\, using the `metadata_key`. Parameters ---------- top_doc : string Path to topic-document datafile generated with --output-doc-topics. word_top : string Path to word-topic datafile generated with --word-topic-counts-file. topic_keys : string Path to topic-keys datafile generated with --output-topic-keys. Z : int Number of topics. metadata : string (optional) Path to tab-separated metadata file with IDs and :class:`.Paper` keys. Returns ------- ldamodel : :class:`.LDAModel` """ td = _handle_top_doc(top_doc, Z) wt,vocabulary = _handle_word_top(word_top, Z) tk = _handle_topic_keys(topic_keys) if metadata is not None: md = _handle_metadata(metadata) else: md = None ldamodel = LDAModel(td, wt, tk, md, vocabulary) return ldamodel
[docs]def read(top_doc, word_top, topic_keys, Z, metadata=None, metadata_key='doi'): """ Generates :class:`.Paper` objects from Mallet output. Each :class:`.Paper` is assigned a topic vector. Parameters ---------- top_doc : string Path to topic-document datafile generated with --output-doc-topics. word_top : string Path to word-topic datafile generated with --word-topic-counts-file. topic_keys : string Path to topic-keys datafile generated with --output-topic-keys. Z : int Number of topics. metadata : string (optional) Path to tab-separated metadata file with IDs and :class:`.Paper` keys. Returns ------- papers : list List of :class:`.Paper` """ ldamodel = load(top_doc, word_top, topic_keys, Z, metadata, metadata_key) D = ldamodel.doc_topic.shape[0] papers = [] for d in xrange(D): p = Paper() p[metadata_key] = ldamodel.metadata[d] # e.g. doi, wosid p['topics'] = (ldamodel.doc_topic[d,:], ldamodel.top_keys) papers.append(p) return papers
def _handle_top_doc(path, Z): """ Returns ------- td : Numpy array Rows are documents, columns are topics. Rows sum to ~1. """ documents = {} with open(path, "rb") as f: reader = csv.reader(f, delimiter='\t') lines = [ line for line in reader ][1:] # Discard header row. for line in lines: t = line[2:] tops = [] for i in xrange(0,len(t)-1,2): tops.append( (int(t[i]), float(t[i+1])) ) #topics.add(int(t[i])) documents[int(line[0])] = (line[1], tops) td = np.zeros( (len(documents), Z) ) for d, value in documents.iteritems(): for t,p in value[1]: td[d,t] = p return td def _handle_word_top(path, Z): """ Returns ------- wt : Numpy array Rows are topics, columns are words. Rows sum to ~1. vocabulary : :class:`.Dictionary` Maps words to word-indices in wt. """ words = {} topics = set() with open(path, "rb") as f: reader = csv.reader(f, delimiter=' ') for line in reader: tc = { int(tuple(l.split(':'))[0]):float(tuple(l.split(':'))[1]) \ for l in line[2:] } words[int(line[0])] = ( line[1], tc ) wt = np.zeros((Z, len(words))) for w, values in words.iteritems(): for t,p in values[1].iteritems(): wt[t,w] = p # Normalize for t in xrange(Z): wt[t,:] /= np.sum(wt[t,:]) # Build vocabulary vocabulary = Dictionary() for k,v in words.iteritems(): vocabulary[k] = v[0] return wt, vocabulary def _handle_topic_keys(path): """ Returns ------- tk : dict Keys are topic indices, values are (P, terms) tuples, where terms is a list of strings and P is float. """ tk = {} with open(path, "rb") as f: reader = csv.reader(f, delimiter='\t') for l in reader: tk[int(l[0])] = (float(l[1]), l[2].split()) return tk def _handle_metadata(path): """ Returns ------- md : dict Keys are document indices, values are identifiers from a :class:`.Paper` property. """ md = {} with open(path, "rU") as f: reader = csv.reader(f, delimiter='\t') lines = [ l for l in reader ][1:] for l in lines: md[int(l[0])] = l[1] return md