Source code for msdas.phosphogrid

# -*- coding: utf-8 -*-
# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2011-2012 - EBI-EMBL
#
#  File author(s): Claudia Hernand, Marti Bernardo,
#       Thomas Cokelaer <cokelaer@ebi.ac.uk>
##
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website:
#
##############################################################################
"""
Created on Fri Feb  7 16:58:10 2014

@author: chernand
"""
import os
import pandas as pd
from tools import Requires


__all__ = ["PhosphoGRID"]


class PhosphoGRID(Requires):
    """Build PKN from phosphogrid

    You need first a list of protein names that you are interested in.
    This list can be obtained from the merger instance as follows::

        from msdas import *
        m = MassSpecReader(get_yeast_small_data, "yeast")
        gene_names = set(list(m.df.Protein))

    Then, you can use this class as follows to visualise and save into a SIF file the
    network built from phosphogrid database.

    .. plot::
        :include-source:
        :width: 70%

        >>> from msdas import *
        >>> r = MassSpecReader(get_yeast_small_data())
        >>> gene_names = set(list(r.df.Protein))
        >>> p = phosphogrid.PhosphoGRID()
        >>> p.run(gene_names=gene_names)
        >>> p.export2sif()
        >>> p.plot()

    2 databases are used. Their filenames are hard coded as:

        #. BIOGRID-PTM-RELATIONSHIPS-3.1.93.ptmrel.txt
        #. BIOGRID-PTM-15-3.1.93_SMALL.ptmtab.txt

    Both are read from :attr:`directory` as gzip archives (``.gz`` is
    appended to the names above).

    You can change the names by changing :attr:`dbR_filename` and
    :attr:`dfP_filename`.

    """

    def __init__(self, directory="../../share/data/"):
        """.. rubric:: constructor

        :param str directory: directory where to find the databases

        They can be found at Claudia's Hernand home directory.

        """
        super(PhosphoGRID, self).__init__()
        self.directory = directory

        self.dbR_filename = "BIOGRID-PTM-RELATIONSHIPS-3.1.93.ptmrel.txt"
        # The original file (BIOGRID-PTM-15-3.1.93.ptmtab.txt) was huge. The
        # sequence, author and pubmed columns, which are not used, were removed.
        self.dfP_filename = "BIOGRID-PTM-15-3.1.93_SMALL.ptmtab.txt"

    def _read_table(self, filename):
        """Read one gzipped, tab-delimited database file into a dataframe

        :param str filename: name of the database file without the ``.gz``
            extension; it is looked up in :attr:`directory`.
        :return: the parsed :class:`pandas.DataFrame` (first row is the header)
        """
        path = os.path.join(self.directory, filename + ".gz")
        # pandas opens, decompresses and closes the archive itself, so no
        # file handle is left open.
        return pd.read_csv(path, sep="\t", header=0, compression="gzip")

    def run(self, gene_names=None):
        """Build dataframe from the relations found in the PhosphoGRID databases

        :param gene_names: iterable of protein names to look for, either as
            regulator or as target. If None, the unique values of the
            ``Protein`` column of ``RawData.csv`` (read from
            :attr:`directory`) are used.

        The relations are saved in :attr:`dfSIF`, a dataframe with the columns
        ``OfficialSymbol_reg``, ``RelationshipCode`` and ``OfficialSymbol_tgt``
        without duplicated rows. ``RelationshipCode`` is 1 for a kinase,
        -1 for a phosphatase and an empty string otherwise. Rows are kept in
        the order in which they appear in the merged databases.

        """
        #===PhosphoGRID Database
        dfR = self._read_table(self.dbR_filename)
        dfP = self._read_table(self.dfP_filename)

        #===Merge PhosphoGRID data frames using PTMID as matching column.
        # Columns present in both files get the _reg (relationships file) and
        # _tgt (PTM file) suffixes.
        dfRP = dfR.merge(dfP, on="PTMID", suffixes=("_reg", "_tgt"))

        #===Assign interaction type according to enzymatic activity.
        # Built in one pass rather than with chained indexing, which may
        # assign to a temporary copy.
        codes = {"kinase": 1, "phosphatase": -1}
        dfRP["RelationshipCode"] = [codes.get(relation, "")
                                    for relation in dfRP["Relationship"]]
        dfRP["Site"] = dfRP["Residue"] + dfRP["Position"].map(str)

        #===Select unique gene names from the yeast data set if none provided
        if gene_names is None:
            filename = os.path.join(self.directory, "RawData.csv")
            gene_names = pd.read_csv(filename)["Protein"].drop_duplicates()

        # Missing values never compared equal to anything in the database, so
        # they are dropped to keep isin() from matching empty symbols.
        wanted = [gene for gene in gene_names if pd.notnull(gene)]

        #===Intersection between the requested genes and PhosphoGRID, querying
        # by OfficialSymbol both in regulators and substrates
        columns = ["OfficialSymbol_reg", "RelationshipCode", "OfficialSymbol_tgt",
                   "Relationship", "Identity", "Site"]
        selected = (dfRP["OfficialSymbol_reg"].isin(wanted) |
                    dfRP["OfficialSymbol_tgt"].isin(wanted))
        out = dfRP.loc[selected, columns].drop_duplicates()

        #===Relations in SIF layout (source, interaction, target)
        sif_columns = ["OfficialSymbol_reg", "RelationshipCode",
                       "OfficialSymbol_tgt"]
        self.dfSIF = out[sif_columns].drop_duplicates().reset_index(drop=True)

    def export2sif(self, filename="matchYeastPhosGRID_protein.sif"):
        """Export the found relations into a SIF file

        :param str filename: Defaults to matchYeastPhosGRID_protein.sif

        The file is tab-delimited, without header or index. Its name is stored
        in :attr:`sif_filename`. :meth:`run` must have been called first.

        """
        # could be a decorator but argument are hidden, so just call it for now.
        self._requires("dfSIF", "Call run() method first")
        self.dfSIF.to_csv(filename, index=False, header=False, sep="\t")
        self.sif_filename = filename

    def plot(self):
        """Plot the relations using CNOGraph

        The graph is built from the file saved by :meth:`export2sif`, which
        must have been called first.

        .. warning:: consider using a faster visualisation tools if the number
            of edges/nodes exceeds ~ 300

        .. todo:: build the SIF without saving it and import directly in the
            cnograph structure.
        """
        self._requires("sif_filename", "sif_filename not set yet. Please, Call export2sif() method first.")
        # Imported here so that cno is only needed when plotting.
        from cno import CNOGraph
        c = CNOGraph(self.sif_filename)
        c.plot()
MS-DAS 0.9.0 documentation

Source code for msdas.phosphogrid

Search