"""
Methods for generating networks in which papers are vertices.
Methods
```````
.. autosummary::
author_coupling
bibliographic_coupling
cocitation
direct_citation
topic_coupling
"""
import networkx as nx
import tethne.utilities as util
import helpers
import operator
import tethne.data as ds
def direct_citation(papers, node_id='ayjid', node_attribs=['date'], **kwargs):
    """
    Create a traditional directed citation network.

    Direct-citation graphs are `directed acyclic graphs`__ in which vertices
    are papers, and each (directed) edge represents a citation of the target
    paper by the source paper. The :func:`.networks.papers.direct_citation`
    method generates both a global citation graph, which includes all cited
    and citing papers, and an internal citation graph that describes only
    citations among papers in the original dataset.

    .. _dag: http://en.wikipedia.org/wiki/Directed_acyclic_graph

    __ dag_

    To generate direct-citation graphs, use the
    :func:`.networks.papers.direct_citation` method. Note the size difference
    between the global and internal citation graphs.

    .. code-block:: python

       >>> gDC, iDC = nt.papers.direct_citation(papers)
       >>> len(gDC)
       5998
       >>> len(iDC)
       163

    ==============  =========================================================
    Element         Description
    ==============  =========================================================
    Node            Papers, represented by node_id.
    Node Attribute  The fields listed in node_attribs.
    Edge            From a paper to a cited reference.
    Edge Attribute  date: publication date of the citing paper.
    ==============  =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.
    node_id : string
        A key from :class:`.Paper` to identify the nodes. Default is 'ayjid'.
    node_attribs : list
        Keys from :class:`.Paper` whose values are attached to each node as
        node attributes. Default is ``['date']``.

    Returns
    -------
    citation_network : networkx.DiGraph
        Global citation network (all citations).
    citation_network_internal : networkx.DiGraph
        Internal citation network where only the papers in the list are nodes
        in the network.

    Raises
    ------
    KeyError : If node_id is not a key of :class:`.Paper`.
    networkx.NetworkXError : If either resulting graph is not a DAG.
    """

    citation_network = nx.DiGraph(type='citations')
    citation_network_internal = nx.DiGraph(type='citations')

    # Check node_id validity against the keys of an empty Paper.
    meta_keys = ds.Paper().keys()
    if node_id not in meta_keys:
        raise KeyError(
            'node_id: {0} is not in the set of meta_keys'.format(node_id))

    for entry in papers:
        # Check the head (the citing paper).
        head_id = entry[node_id]
        head_has_id = head_id is not None

        if head_has_id:
            # Citing papers are nodes in both the global and internal graphs.
            node_attrib_dict = util.subdict(entry, node_attribs)
            citation_network.add_node(head_id, node_attrib_dict)
            citation_network_internal.add_node(head_id, node_attrib_dict)

        if entry['citations'] is None:
            continue

        for citation in entry['citations']:
            # Check the tail (the cited reference).
            tail_id = citation[node_id]
            tail_has_id = tail_id is not None

            if tail_has_id:
                # Cited references are nodes in the global graph only.
                node_attrib_dict = util.subdict(citation, node_attribs)
                citation_network.add_node(tail_id, node_attrib_dict)

            if not (head_has_id and tail_has_id):
                continue

            # Both ends are identifiable, so draw the edge.
            citation_network.add_edge(head_id, tail_id, date=entry['date'])

            # The edge is internal only if the cited reference is itself one
            # of the papers in the dataset.
            if util.contains(papers,
                             lambda wos_obj: wos_obj[node_id] == tail_id):
                citation_network_internal.add_edge(head_id, tail_id,
                                                   date=entry['date'])

    # Both graphs must be directed acyclic graphs.
    if not nx.is_directed_acyclic_graph(citation_network):
        raise nx.NetworkXError("Citation graph is not a DAG.")
    if not nx.is_directed_acyclic_graph(citation_network_internal):
        raise nx.NetworkXError("Internal citation graph is not a DAG.")

    return citation_network, citation_network_internal
def bibliographic_coupling(papers, citation_id='ayjid', threshold=1,
                           node_id='ayjid', node_attribs=['date'],
                           weighted=False, **kwargs):
    r"""
    Generate a bibliographic coupling network.

    Two papers are **bibliographically coupled** when they both cite the same,
    third, paper. You can generate a bibliographic coupling network using the
    :func:`.networks.papers.bibliographic_coupling` method.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers)
       >>> BC
       <networkx.classes.graph.Graph object at 0x102eec710>

    Especially when working with large datasets, or disciplinarily narrow
    literatures, it is usually helpful to set a minimum number of shared
    citations required for two papers to be coupled. You can do this by
    setting the **`threshold`** parameter.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers, threshold=1)
       >>> len(BC.edges())
       1216
       >>> BC = nt.papers.bibliographic_coupling(papers, threshold=2)
       >>> len(BC.edges())
       542

    ===============  ========================================================
    Element          Description
    ===============  ========================================================
    Node             Papers represented by node_id. Only papers that take
                     part in at least one edge are added.
    Node Attributes  node_attribs in :class:`.Paper`
    Edge             (a,b) in E(G) if a and b share x citations where x >=
                     threshold.
    Edge Attributes  similarity: the number of citations shared, or the
                     normalized value described under `weighted`.
    ===============  ========================================================

    Parameters
    ----------
    papers : list
        A list of wos_objects.
    citation_id : string
        A key from :class:`.Paper` to identify the citation overlaps. Default
        is 'ayjid'.
    threshold : int
        Minimum number of shared citations to consider two papers "coupled".
        When `weighted` is True the threshold is compared against the
        normalized similarity instead of the raw count.
    node_id : string
        Field in :class:`.Paper` used to identify the nodes. Default is
        'ayjid'.
    node_attribs : list
        List of fields in :class:`.Paper` to include as node attributes in
        graph.
    weighted : bool
        If True, edge attribute `similarity` is a float in {0-1} calculated
        as :math:`\cfrac{N_{ij}}{\sqrt{N_{i}N_{j}}}` where :math:`N_{i}` and
        :math:`N_{j}` are the number of references in :class:`.Paper` *i* and
        *j*, respectively, and :math:`N_{ij}` is the number of references
        shared by papers *i* and *j*.

    Returns
    -------
    bcoupling : networkx.Graph
        A bibliographic coupling network.

    Raises
    ------
    KeyError : Raised when node_id or citation_id is not a usable key of
        :class:`.Paper`.

    Notes
    -----
    Lists cannot be attributes? This causes errors for both gexf and graphml.
    Also, nodes cannot be None.
    """

    bcoupling = nx.Graph(type='biblio_coupling')

    # Validate identifiers against the keys of an empty Paper.
    meta_keys = ds.Paper().keys()
    if node_id not in meta_keys:
        raise KeyError(
            'node_id {0} is not a meta_dict key.'.format(node_id))

    # 'citations' is the only meta_key that is invalid for citation_id.
    if citation_id == 'citations' or citation_id not in meta_keys:
        raise KeyError(
            'citation_id {0} is not a meta_dict key or otherwise cannot be'
            ' used to detect citation overlap.'.format(citation_id))

    # Build each paper's list of cited identifiers once, instead of
    # rebuilding paper j's list for every (i, j) pair.
    cited_ids = []
    for paper in papers:
        references = paper['citations']
        if references is None:
            cited_ids.append([])
        else:
            cited_ids.append([ref[citation_id] for ref in references])

    n_papers = len(papers)
    for i in range(n_papers):
        i_list = cited_ids[i]
        node_i_attribs = util.subdict(papers[i], node_attribs)

        for j in range(i + 1, n_papers):
            j_list = cited_ids[j]
            overlap = util.overlap(i_list, j_list)

            if not weighted:
                similarity = len(overlap)
            elif len(overlap) > 0:
                # A non-empty overlap implies both lists are non-empty, so
                # the normalizing term cannot be zero.
                w = (float(len(i_list)) * float(len(j_list))) ** 0.5
                similarity = float(len(overlap)) / w
            else:
                similarity = 0

            # Add nodes and edge only if the overlap is sufficiently high.
            if similarity >= threshold:
                node_j_attribs = util.subdict(papers[j], node_attribs)
                bcoupling.add_node(papers[i][node_id], node_i_attribs)
                bcoupling.add_node(papers[j][node_id], node_j_attribs)
                bcoupling.add_edge(papers[i][node_id],
                                   papers[j][node_id],
                                   similarity=similarity)

    return bcoupling
def cocitation(papers, threshold=1, node_id='ayjid', topn=None, verbose=False,
               node_attribs=['date'], **kwargs):
    """
    Generate a cocitation network.

    A **cocitation network** is a network in which vertices are papers, and
    edges indicate that two papers were cited by the same third paper.
    `CiteSpace <http://cluster.cis.drexel.edu/~cchen/citespace/doc/jasist2006.pdf>`_
    is a popular desktop application for co-citation analysis, and you can
    read about the theory behind it
    `here <http://cluster.cis.drexel.edu/~cchen/citespace/>`_. Co-citation
    analysis is generally performed with a temporal component, so building a
    :class:`.GraphCollection` from a :class:`.DataCollection` sliced by
    ``date`` is recommended.

    You can generate a co-citation network using the
    :func:`.networks.papers.cocitation` method:

    .. code-block:: python

       >>> CC = nt.papers.cocitation(papers)
       >>> CC
       <networkx.classes.graph.Graph object at 0x102eec790>

    For large datasets, you may wish to set a minimum number of co-citations
    required for an edge between two papers. Keep in mind that all of the
    references in a single paper are co-cited once, so a threshold of at
    least 2 is prudent. Note the dramatic decrease in the number of edges
    when the threshold is changed from 2 to 3.

    .. code-block:: python

       >>> CC = nt.papers.cocitation(papers, threshold=2)
       >>> len(CC.edges())
       8889
       >>> CC = nt.papers.cocitation(papers, threshold=3)
       >>> len(CC.edges())
       1493

    ===============  ========================================================
    Element          Description
    ===============  ========================================================
    Node             Cited papers represented by :class:`.Paper` ayjid.
    Node Attributes  citations: citation count for the paper, taken from the
                     counts returned by the helpers module.
    Edge             (a, b) if a and b are cited by the same paper.
    Edge Attributes  weight: number of times two papers are co-cited
                     together.
    ===============  ========================================================

    Parameters
    ----------
    papers : list
        a list of :class:`.Paper` objects.
    threshold : int
        Minimum number of co-citations required to create an edge.
    node_id : string
        Accepted for signature parity with the other network builders, but
        not used here: nodes are always identified by 'ayjid'.
    topn : int or float, or None
        If provided, only the topn (int) or topn percent (float) most cited
        papers will be included in the cocitation network. If None (default),
        network will include all cited papers (NOTE: this can cause severe
        memory consumption for even moderately-sized datasets).
    verbose : bool
        If True, prints status messages.
    node_attribs : list
        Accepted but not used by this function.

    Returns
    -------
    cocitation : networkx.Graph
        A cocitation network.
    """

    cocitation_graph = nx.Graph(type='cocitation')

    # 61670334: networks.citations.cocitation should have a "top cited"
    #  parameter.
    include = None
    if topn is not None:
        _parents, include, citations_count = helpers.top_parents(papers,
                                                                 topn=topn)
        N = len(include)
    else:
        citations_count = helpers.citation_count(papers)
        N = len(citations_count)

    if verbose:
        print("Generating a cocitation network with " + str(N) + " nodes...")

    # Tuples of cited-paper ids are keys; values are the number of times
    # that pair of papers is co-cited.
    cocitations = {}
    for paper in papers:
        if paper['citations'] is None:     # Some papers don't have citations.
            continue

        cited = [reference['ayjid'] for reference in paper['citations']]
        if topn is not None:
            # Apply the "top cited" filter once per reference rather than
            # re-testing inside the pair loop.
            cited = [ref_id for ref_id in cited if ref_id in include]

        for idx, paper_i in enumerate(cited):
            for paper_j in cited[idx + 1:]:
                pp = (paper_i, paper_j)
                pp_inv = (paper_j, paper_i)
                if pp in cocitations:       # Co-cited before, same order.
                    cocitations[pp] += 1
                elif pp_inv in cocitations:  # Co-cited before, other order.
                    cocitations[pp_inv] += 1
                else:                       # First time they are co-cited.
                    cocitations[pp] = 1

    if verbose:
        print("Co-citation matrix generated, building Graph...")

    for (first, second), count in cocitations.items():
        if count >= threshold:
            cocitation_graph.add_edge(first, second, weight=count)

    if verbose:
        print("Done building co-citation graph, adding attributes...")

    # 62657522: Nodes in co-citation graph should have attribute containing
    #  number of citations.
    # Membership is tested on the graph itself, not on the list that
    # nodes() builds.
    n_cit = dict((k, v) for k, v in citations_count.items()
                 if k in cocitation_graph)
    # NOTE(review): (graph, name, values) is the networkx 1.x argument order;
    # confirm before upgrading networkx.
    nx.set_node_attributes(cocitation_graph, 'citations', n_cit)

    return cocitation_graph
def author_coupling(papers, threshold=1, node_attribs=['date'],
                    node_id='ayjid', **kwargs):
    """
    Vertices are papers and edges indicate shared authorship.

    ===============  ========================================================
    Element          Description
    ===============  ========================================================
    Node             Papers, represented by node_id. Every paper is added,
                     whether or not it ends up with any edges.
    Node Attributes  node_attribs in :class:`.Paper`
    Edge             (a,b) in E(G) if a and b share x authors and x >=
                     threshold
    Edge Attributes  overlap: the value of x (above).
    ===============  ========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper`
    threshold : int
        Minimum number of shared authors required to draw an edge between
        two papers.
    node_attribs : list
        List of fields in :class:`.Paper` to include as node attributes in
        graph.
    node_id : string
        Field in :class:`.Paper` used to identify nodes.

    Returns
    -------
    acoupling : networkx.Graph
        An author-coupling network.
    """

    acoupling = nx.Graph(type='author_coupling')

    # Build each paper's author-name list and add its node once, up front,
    # instead of redoing both for paper j on every outer iteration.
    # Names are built from the 'aulast' and 'auinit' fields.
    name_lists = []
    for paper in papers:
        name_lists.append(util.concat_list(paper['aulast'],
                                           paper['auinit'],
                                           ' '))
        acoupling.add_node(paper[node_id], util.subdict(paper, node_attribs))

    # Draw an edge for every pair of papers that shares enough authors.
    n_papers = len(papers)
    for i in range(n_papers):
        for j in range(i + 1, n_papers):
            overlap = util.overlap(name_lists[i], name_lists[j])
            if len(overlap) >= threshold:
                acoupling.add_edge(papers[i][node_id],
                                   papers[j][node_id],
                                   overlap=len(overlap))

    return acoupling
def topic_coupling(papers, threshold=0.7, node_id='ayjid', **kwargs):
    """
    Two papers are coupled if they both contain a shared topic above threshold.

    ===============  ========================================================
    Element          Description
    ===============  ========================================================
    Node             Papers, represented by node_id. Only papers that take
                     part in at least one edge are added.
    Edge             (a,b) in E(G) if a and b share >= 1 topics with
                     proportion >= threshold in both a and b.
    Edge Attributes  weight: combined mean proportion of each shared topic.
                     topics: list of shared topics.
    ===============  ========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper`. The first element of each paper's
        'topics' field is used as its topic vector.
    threshold : float
        Minimum representation of a topic in each paper.
    node_id : string
        Field in :class:`.Paper` used to identify nodes.

    Returns
    -------
    tc : networkx.Graph
        A topic-coupling network.
    """

    tc = nx.Graph()

    # Topic vector for each paper.
    # NOTE(review): presumably numpy arrays, since .shape is read -- confirm.
    topic_vectors = [paper['topics'][0] for paper in papers]

    n_papers = len(papers)
    for i in range(n_papers):
        t_i = topic_vectors[i]
        Z = t_i.shape[0]    # Number of topics.

        for j in range(i + 1, n_papers):
            t_j = topic_vectors[j]

            # Each shared topic paired with the mean of its representation
            # in papers i and j.
            shared = [(z, (t_i[z] + t_j[z]) / 2.0)
                      for z in range(Z)
                      if t_i[z] >= threshold and t_j[z] >= threshold]
            if not shared:
                continue

            # Combine the means into a single edge weight in {0-1} by
            # dividing their sum by the number of topics.
            weight = sum(mean for _, mean in shared) / float(Z)
            tc.add_edge(papers[i][node_id],
                        papers[j][node_id],
                        weight=weight,
                        topics=[z for z, _ in shared])

    return tc