# Source code for tethne.networks.terms
"""
Methods for building networks from terms in bibliographic records. This
includes keywords, abstract terms, etc.
.. autosummary::
keyword_cooccurrence
topic_coupling
"""
import numpy as np
import networkx as nx
def keyword_cooccurrence(papers, threshold, connected=False, **kwargs):
    """
    Generates a keyword cooccurrence network.

    Nodes are keywords. An edge joins two distinct keywords that occur
    together in the keyword lists of the papers at least ``threshold`` times;
    its ``weight`` attribute holds that cooccurrence count.

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` objects. Each must support ``keys()`` and
        item access. Papers without a ``'keywords'`` entry, or whose entry is
        empty or not iterable, are skipped.
    threshold : int
        Minimum number of occurrences for a keyword pair to appear in graph.
        A pair that never cooccurs is never linked, even if ``threshold`` is
        zero or negative.
    connected : bool
        If True, returns only the largest connected component.
    **kwargs
        Accepted for interface compatibility; not used.

    Returns
    -------
    k_cooccurrence : networkx.Graph
        A keyword cooccurrence network.
    """
    # Gather the usable keyword list of every paper. ``entry.keys()`` is used
    # (rather than ``in entry``) because only ``keys()`` and item access are
    # known to be supported by the paper objects.
    keyword_lists = []
    for entry in papers:
        if 'keywords' not in entry.keys():
            continue
        terms = entry['keywords']
        if not terms:
            continue    # Missing (e.g. None) or empty keyword field.
        try:
            keyword_lists.append(list(terms))
        except TypeError:
            continue    # Keyword field is not iterable; ignore this paper.

    # Mapping of keywords to integer indices (and back), in first-seen order.
    index_of = {}
    labels = []
    for terms in keyword_lists:
        for term in terms:
            if term not in index_of:
                index_of[term] = len(labels)
                labels.append(term)

    # Symmetric matrix of pairwise cooccurrence counts.
    size = len(labels)
    counts = np.zeros((size, size), dtype=int)
    for terms in keyword_lists:
        indices = [index_of[term] for term in terms]
        for i in indices:
            for j in indices:
                if i != j:
                    counts[i, j] += 1

    # Only the strict upper triangle is scanned so that each unordered pair
    # is considered exactly once.
    graph = nx.Graph()
    minimum = max(threshold, 1)
    rows, cols = np.nonzero(np.triu(counts, k=1) >= minimum)
    for i, j in zip(rows.tolist(), cols.tolist()):
        graph.add_edge(labels[i], labels[j], weight=int(counts[i, j]))

    if not connected:
        return graph    # Return the whole graph.

    # Return only the largest connected component (an empty graph has none).
    if graph.number_of_nodes() == 0:
        return graph
    largest = max(nx.connected_components(graph), key=len)
    return graph.subgraph(largest).copy()
def topic_coupling(model, threshold=0.005, **kwargs):
    """
    Creates a network of words connected by implication in a common topic(s).

    Two words are linked when, in at least one topic, both have a
    ``model.top_word`` value of at least ``threshold``.

    Parameters
    ----------
    model : :class:`.LDAModel`
        Must provide ``top_word``, indexed as ``[topic, word]`` and exposing
        ``shape``, and ``vocabulary``, indexed by word index to give the node
        label.
    threshold : float
        Minimum P(W|T) for coupling.
    **kwargs
        Accepted for interface compatibility; not used.

    Returns
    -------
    tc : networkx.Graph
        A topic-coupling graph, where nodes are terms. Each edge carries
        ``topics``, the list of topic indices in which both terms pass the
        threshold, and ``weight``, the sum over those topics of the mean of
        the two terms' values, divided by the total number of topics.
    """
    num_topics = model.top_word.shape[0]
    num_words = model.top_word.shape[1]

    # (w_i, w_j) -> [(topic, mean of the two words' values), ...], w_i < w_j.
    shared = {}
    for z in range(num_topics):
        # Word indices that pass the threshold in this topic, ascending.
        strong = [w for w in range(num_words)
                  if model.top_word[z, w] >= threshold]
        for pos, w_i in enumerate(strong):
            p_i = model.top_word[z, w_i]
            for w_j in strong[pos + 1:]:
                p_j = model.top_word[z, w_j]
                shared.setdefault((w_i, w_j), []).append((z, (p_i + p_j) / 2))

    tc = nx.Graph()
    for (w_i, w_j), topics in shared.items():
        # Dividing by the total topic count (not just the shared topics)
        # down-weights pairs that are coupled in only a few topics.
        weight = sum([score for _, score in topics]) / num_topics
        tc.add_edge(model.vocabulary[w_i], model.vocabulary[w_j],
                    weight=weight, topics=[z for z, _ in topics])
    return tc