# Source code for cbmpy.CBNetDB

"""
CBMPy: CBNetDB module
=====================
PySCeS Constraint Based Modelling (http://cbmpy.sourceforge.net)
Copyright (C) 2009-2017 Brett G. Olivier, VU University Amsterdam, Amsterdam, The Netherlands

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>

Author: Brett G. Olivier
Contact email: bgoli@users.sourceforge.net
Last edit: $Author: bgoli $ ($Id: CBNetDB.py 575 2017-04-13 12:18:44Z bgoli $)

"""

# preparing for Python 3 port
from __future__ import division, print_function
from __future__ import absolute_import
#from __future__ import unicode_literals

import os, time, re, webbrowser, csv
import urllib2

# Probe for an SQLite binding. The standard-library sqlite3 module is tried
# first; the legacy pysqlite2 package is only attempted when that import fails.
# Exactly one of the two flags ends up True, or neither if nothing is installed.
HAVE_SQLITE2 = False
HAVE_SQLITE3 = False

try:
    import sqlite3
except ImportError:
    try:
        from pysqlite2 import dbapi2 as sqlite2
    except ImportError:
        pass
    else:
        HAVE_SQLITE2 = True
else:
    HAVE_SQLITE3 = True

# Package configuration: expose the debug flag and version string stored in
# the shared __CBCONFIG__ mapping as module-level names.
from .CBConfig import __CBCONFIG__
__DEBUG__ = __CBCONFIG__['DEBUG']
__version__ = __CBCONFIG__['VERSION']


class NetDBbase(object):
    """
    Common base class providing URL quoting helpers.

    """
    # the module-level urllib2 import, reached through ``self`` by the helpers
    urllib2 = urllib2
    # encoding applied to text before it is percent-quoted
    text_encoding = 'utf8'

    def URLEncode(self, txt):
        """
        Return *txt* percent-encoded for use in a URL.

        - *txt* the string to encode, it is first encoded with `text_encoding`

        """
        raw = txt.encode(self.text_encoding)
        return self.urllib2.quote(raw)

    def URLDecode(self, txt):
        """
        Return *txt* with its percent-escapes decoded.

        - *txt* a URL encoded string

        """
        unquote = self.urllib2.unquote
        return unquote(txt)

class DBTools(NetDBbase):
    """
    Tools to work with SQLite DB's (optimized, no SQL required).

    Table and column names are interpolated into the SQL text because SQLite
    cannot bind identifiers, they must therefore come from trusted code. Row
    values and row ids are always bound as query parameters.

    Most methods signal "no database connected" (`db_cursor` is None) by
    returning False/None instead of raising.

    """

    sqlite = None          # the DB-API module in use (sqlite3 or pysqlite2)
    sqlite_version = None  # 3 or 2, matching the module above
    db_conn = None         # open connection, None when disconnected
    db_cursor = None       # cursor on db_conn, None when disconnected
    db_tables = None       # names of tables seen by createDBTable

    def __init__(self):
        if HAVE_SQLITE3:
            self.sqlite = sqlite3
            self.sqlite_version = 3
        elif HAVE_SQLITE2:
            self.sqlite = sqlite2
            self.sqlite_version = 2
        else:
            # call syntax is valid on both Python 2 and 3
            raise RuntimeError("\nSQLite not installed")
        self.db_tables = []

    def connectSQLiteDB(self, db_name, work_dir=None):
        """
        Connect to a sqlite database.

        - *db_name* the name of the sqlite database
        - *work_dir* the optional database path

        """
        if work_dir is not None:
            db_name = os.path.join(work_dir, db_name)
        self.db_conn = self.sqlite.connect(db_name)
        self.db_cursor = self.db_conn.cursor()

    def createDBTable(self, table, sqlcols):
        """
        Create a database table if it does not exist:

        - *table* the table name
        - *sqlcols* a list containing the SQL definitions of the table columns: <id> <type> for example `['gene TEXT PRIMARY KEY', 'aa_seq TEXT', 'nuc_seq TEXT', 'aa_len INT', 'nuc_len INT']`

        Effectively writes CREATE TABLE "table" (gene TEXT PRIMARY KEY, aa_seq TEXT, nuc_seq TEXT, aa_len INT, nuc_len INT)

        """
        SQL = 'CREATE TABLE %s (' % table
        SQL += ','.join(' %s' % (c,) for c in sqlcols)
        SQL += ' )'
        print(SQL)
        try:
            # existence probe: selecting from a missing table raises OperationalError
            self.db_cursor.execute('SELECT * FROM %s' % table)
            print('Table {} exists'.format(table))
        except self.sqlite.OperationalError:
            print('Table {} does not exist, creating it'.format(table))
            self.db_cursor.execute(SQL)
        # record the table once, repeated calls must not add duplicates
        if table not in self.db_tables:
            self.db_tables.append(table)

    def insertData(self, table, data, commit=True):
        """
        Insert data into a table: "INSERT INTO table (col, ...) VALUES (?, ...)"

        - *table* the DB table name
        - *data* a dictionary of {id:value} pairs (a list of (id, value) pairs is still accepted but deprecated)
        - *commit* whether to commit the data insertions

        Returns True on success and False when no database is connected.

        """
        if hasattr(data, 'keys'):
            pairs = [(key, data[key]) for key in data]
        else:
            # legacy input: keep the warning and pause so old callers notice it
            print('\n\nWARNING: data now uses a dictionary as input please update your code - see docstring for details\n')
            time.sleep(3)
            pairs = [(d[0], d[1]) for d in data]

        colstr = '(' + ', '.join('%s' % (p[0],) for p in pairs) + ')'
        valstr = 'VALUES (' + ', '.join('?' for _ in pairs) + ')'
        vals = [p[1] for p in pairs]
        sql = "INSERT INTO %s %s %s" % (table, colstr, valstr)
        try:
            self.db_cursor.execute(sql, vals)
            if commit:
                self.db_cursor.connection.commit()
        except AttributeError:
            # db_cursor is None: not connected
            return False
        return True

    def updateData(self, table, col, rid, data, commit=True):
        """
        Update already defined data

        - *table* the table name
        - *col* the column name
        - *rid* the row id to update
        - *data* a dictionary of {id:value} pairs
        - *commit* whether to commit the data updates

        Writes "UPDATE table SET id=?, ... WHERE col=?" with the values and
        *rid* bound as parameters, so values containing quotes are stored
        unchanged. Returns True on success and False when no database is
        connected.

        """
        columns = list(data)
        assignments = ', '.join('{}=?'.format(c) for c in columns)
        sql = 'UPDATE {} SET {} WHERE {}=?'.format(table, assignments, col)
        params = [data[c] for c in columns]
        params.append(rid)
        try:
            self.db_cursor.execute(sql, params)
            if commit:
                self.db_cursor.connection.commit()
        except AttributeError:
            return False
        return True

    def checkEntryInColumn(self, table, col, rid):
        """
        Check if an entry exists in a table

        - *table* the table name
        - *col* the column name
        - *rid* the row to search for

        """
        sql = 'SELECT count(*) FROM {} WHERE {}=?'.format(table, col)
        self.db_cursor.execute(sql, (rid,))
        return self.db_cursor.fetchone()[0] != 0

    def executeSQL(self, sql):
        """
        Execute a SQL command (no commit is issued):

        - *sql* a string containing a SQL command

        Returns True on success and False when no database is connected.

        """
        try:
            self.db_cursor.execute(sql)
        except AttributeError as ex:
            print('Error executing command')
            print(ex)
            return False
        return True

    def getColumns(self, table, cols):
        """
        Fetch the contents of one or more columns of data in a table

        - *table* the database table
        - *cols* a list of one or more column id's

        Returns one list of stringified values per requested column, or None
        when no database is connected.

        """
        sql = 'SELECT {} FROM {}'.format(', '.join(str(c) for c in cols), table)
        try:
            rows = self.db_cursor.execute(sql).fetchall()
        except AttributeError:
            return None
        dout = [[] for _ in cols]
        for row in rows:
            for idx, cell in enumerate(row):
                dout[idx].append(str(cell))
        return dout

    def getRow(self, table, col, rid):
        """
        Get the table row(s) which correspond to rid in column. Returns the row(s) as a list,
        if the column is the primary key this is always a single entry.

        - *table* the database table
        - *col* the column id
        - *rid* the row index id

        Every cell is converted with str(). Returns None when no database is
        connected.

        """
        sql = 'SELECT * FROM {} WHERE {}=?'.format(table, col)
        try:
            rows = self.db_cursor.execute(sql, (rid,)).fetchall()
        except AttributeError:
            return None
        if len(rows) > 1:
            print('INFO: getRow is returning multiple rows for query id: {}'.format(rid))
        return [[str(cell) for cell in row] for row in rows]

    def getCell(self, table, col, rid, cell):
        """
        Get the table cell which correspond to rid in column. Returns the value (as a string) or None

        - *table* the database table
        - *col* the column id
        - *rid* the row index id
        - *cell* the column of the cell you want to extract

        """
        sql = 'SELECT {} FROM {} WHERE {}=?'.format(cell, table, col)
        try:
            row = self.db_cursor.execute(sql, (rid,)).fetchone()
        except AttributeError:
            return None
        if row is None:
            return None
        # reuse the fetched row instead of running the query a second time
        return str(row[0])

    def getTable(self, table, colOut=False):
        """
        Returns an entire database table

        - *table* the table name
        - *colOut* optionally return a tuple of (data,ColNames)

        Returns None when no database is connected.

        """
        try:
            rows = self.db_cursor.execute('SELECT * FROM %s' % table).fetchall()
            if not colOut:
                return rows
            info = self.db_cursor.execute("PRAGMA table_info( %s )" % table).fetchall()
        except AttributeError:
            return None
        # the second field of each table_info row is read as the column name
        return rows, [str(a[1]) for a in info]

    def dumpTableToTxt(self, table, filename):
        """
        Save a table as tab separated txt file

        - *table* the table to export
        - *filename* the filename of the table dump

        """
        data, head = self.getTable(table, colOut=True)
        data.insert(0, head)
        from .CBTools import exportLabelledLinkedList
        exportLabelledLinkedList(data, fname=filename, names=None, sep='\t')

    def dumpTableToCSV(self, table, filename):
        """
        Save a table as a CSV file (csv "excel" dialect) with a header row

        - *table* the table to export
        - *filename* the filename of the table dump

        """
        data, head = self.getTable(table, colOut=True)
        data.insert(0, head)
        # the context manager closes the file even if a row fails to serialise
        with open(filename, 'w') as F:
            csv.writer(F, dialect='excel').writerows(data)

    def fetchAll(self, sql):
        """
        Raw SQL query e.g. 'SELECT id FROM gene WHERE gene=\"G\"'

        Returns all result rows, or None when no database is connected.

        """
        r = None
        try:
            r = self.db_cursor.execute(sql).fetchall()
        except AttributeError as ex:
            print(ex)
        return r

    def closeDB(self):
        """
        Close the DB connection and reset the DBTools instance (can be reconnected)

        Safe to call when no connection is open.

        """
        if self.db_conn is not None:
            self.db_conn.close()
        self.db_conn = None
        self.db_cursor = None
        self.db_tables = []
class KeGGTools(object):
    """
    Class that holds useful methods for querying KeGG via a SUDS provided soap client

    """

    Kclient = None  # suds client, created in __init__

    def __init__(self, url):
        import suds
        self.Kclient = suds.client.Client(url)

    def _fetchSequence(self, k_gene, mode, marker, label):
        """
        Run one `bget` query and return the sequence text that follows *marker*.

        - *k_gene* the gene name to query
        - *mode* the bget sequence switch, 'n' or 'a'
        - *marker* the header token that precedes the sequence, '(N)' or '(A)'
        - *label* 'Gene' or 'Protein', only used in the printed messages

        Returns the sequence with newlines removed, or the string 'None' when
        the service cannot be queried, answers with None, or the reply does not
        contain *marker*.

        """
        try:
            reply = self.Kclient.service.bget("-f -n %s %s" % (mode, k_gene))
        except AttributeError as ex:
            print('\n{} sequence get exception ({})!\n'.format(label, k_gene))
            print(ex)
            return 'None'
        if reply is None:
            print('\n*****\nWARNING: potential naming error in gene: {}!!\n*****\n'.format(k_gene))
            return 'None'
        try:
            return reply.split(marker)[1].replace('\n', '')
        except (AttributeError, IndexError) as ex:
            # reply is not text, or the marker is missing from the reply
            print('\n{} sequence get exception ({})!\n'.format(label, k_gene))
            print(ex)
            return 'None'

    def fetchSeqfromKeGG(self, k_gene):
        """
        Given a gene name try and retrieve the gene and amino acid sequence

        - *k_gene* the gene name passed to the service's bget call

        Returns a (nucleotide_sequence, amino_acid_sequence) tuple; a sequence
        that could not be retrieved is returned as the string 'None'.

        """
        g2 = self._fetchSequence(k_gene, 'n', '(N)', 'Gene')
        p2 = self._fetchSequence(k_gene, 'a', '(A)', 'Protein')
        return g2, p2
class KeGGSequenceTools(object):
    """
    Using the KeGG connector this class provides tools to construct an organism specific sequence database

    """

    DB = None    # DBTools instance connected in __init__
    KEGG = None  # KeGGTools instance used to fetch sequences

    def __init__(self, url, db_name, work_dir):
        self.DB = DBTools()
        self.DB.connectSQLiteDB(db_name, work_dir)
        self.KEGG = KeGGTools(url)

    def _fetchGeneRecord(self, ecg, default_length):
        """
        Fetch the sequences for *ecg* and return (aa_seq, nuc_seq, aa_len, nuc_len).

        When either sequence comes back as the 'None' sentinel both sequences
        are stored as 'None' and both lengths are set to *default_length*.

        """
        gene2, prot2 = self.KEGG.fetchSeqfromKeGG(ecg)
        if gene2 != 'None' and prot2 != 'None':
            return str(prot2), str(gene2), int(len(prot2)), int(len(gene2))
        print('\nGene {} cannot be found and is probably an incorrect annotation assigning length: {}\n'.format(ecg, default_length))
        return 'None', 'None', default_length, default_length

    def buildGeneDatabase(self, genes, tablename, UPDATE_IF_EXISTS=False, default_length=0):
        """
        Add the sequences of *genes* to a table with columns
        (gene, aa_seq, nuc_seq, aa_len, nuc_len).

        - *genes* a list of gene names
        - *tablename* the table to fill, it must already exist
        - *UPDATE_IF_EXISTS* [default=False] re-fetch and overwrite genes that are already in the table
        - *default_length* [default=0] the length stored for genes whose sequences cannot be retrieved

        Changes are committed after every 20 genes and once more when the loop
        finishes, so the last partial batch is not left uncommitted.

        """
        cursor = self.DB.db_cursor
        # the table name cannot be bound, gene ids and sequences are bound as parameters
        select_sql = 'SELECT * FROM %s WHERE gene=?' % tablename
        insert_sql = "INSERT INTO %s (gene, aa_seq, nuc_seq, aa_len, nuc_len) VALUES (?, ?, ?, ?, ?)" % tablename
        update_sql = 'UPDATE %s SET aa_seq=?, nuc_seq=?, aa_len=?, nuc_len=? WHERE gene=?' % tablename

        total = len(genes)
        for cntr, ecg in enumerate(genes, 1):
            print('Processing gene {} of {}'.format(cntr, total))
            gene_id = str(ecg)
            entry_exists = len(cursor.execute(select_sql, (gene_id,)).fetchall()) > 0
            tstart = time.time()
            if not entry_exists:
                print('\tadding gene {}'.format(ecg), end=" ")
                aa_seq, nuc_seq, aa_len, nuc_len = self._fetchGeneRecord(ecg, default_length)
                cursor.execute(insert_sql, (gene_id, aa_seq, nuc_seq, aa_len, nuc_len))
            elif UPDATE_IF_EXISTS:
                print('\tupdating gene {}'.format(ecg), end=" ")
                # a failed fetch gets default_length, not len('None')
                aa_seq, nuc_seq, aa_len, nuc_len = self._fetchGeneRecord(ecg, default_length)
                cursor.execute(update_sql, (aa_seq, nuc_seq, aa_len, nuc_len, gene_id))
            else:
                print('\tskipping gene {}'.format(ecg), end=" ")
            tend = time.time()
            print(' ... done ({}).'.format(tend - tstart))
            if cntr % 20 == 0:
                cursor.connection.commit()
        # flush whatever is left of the final batch
        cursor.connection.commit()

    def getPeptideLengthsFromDB(self, genes, keg_prefix, tablename='gene_data'):
        """
        Return a dictionary of {gene : peptide length} read from the database.

        - *genes* a list of gene names as stored in the `gene` column
        - *keg_prefix* a prefix that is removed from each gene name to form the dictionary key
        - *tablename* [default='gene_data'] the table holding the `gene` and `aa_len` columns

        Raises an IndexError if a gene is not present in the table.

        """
        sql = 'SELECT aa_len FROM %s WHERE gene=?' % tablename
        gene_peplen = {}
        for G in genes:
            print(G)
            Glen = self.DB.db_cursor.execute(sql, (G,)).fetchall()[0][0]
            print(Glen)
            gene_peplen[G.replace(keg_prefix, '')] = Glen
        return gene_peplen
class RESTClient(NetDBbase):
    """
    Class that provides the basis for application specific connectors to REST web services

    """

    site_root = None   # host passed to Connect()
    conn = None        # HTTPConnection object, None when closed
    history = ''       # accumulated log text, see Log()
    CONNECTED = False  # True once Connect() has created a connection object
    USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0'

    def __init__(self):
        self.urllib2 = urllib2

    def Log(self, txt):
        """
        Add txt to logfile history

        - *txt* a string

        """
        self.history += '%s - %s\n' % (time.strftime('%H:%M:%S'), str(txt))

    def GetLog(self):
        """
        Return the logged history

        """
        return self.history

    def Connect(self, root):
        """
        Establish HTTP connection to

        - *root* the site root "www.google.com"

        Raises a RuntimeError (carrying the underlying error text) on failure.

        """
        try:
            self.site_root = root
            self.conn = self.urllib2.httplib.HTTPConnection(self.site_root)
            self.CONNECTED = True
            self.Log(self.site_root)
        except Exception as ex:
            print('\nConnection to {} failed!'.format(self.site_root))
            print(ex)
            self.CONNECTED = False
            self.Log('ERROR: %s' % self.site_root)
            # keep the cause in the message so callers see why it failed
            raise RuntimeError('Connection to {} failed: {}'.format(self.site_root, ex))

    def Get(self, query):
        """
        Perform an http GET using:

        - *query* the request path and query string

        For example "/semanticSBML/annotate/search.xml?q=ATP"

        Returns the response body, or None when not connected. Raises a
        RuntimeError (carrying the underlying error text) if the request fails.

        """
        data1 = None
        if self.CONNECTED:
            try:
                print(query)
                HTMLhead = {'User-Agent': self.USER_AGENT}
                self.conn.request("GET", query, headers=HTMLhead)
                r1 = self.conn.getresponse()
                print(r1.status, r1.reason)
                data1 = r1.read()
                self.Log('GET %s' % (query))
            except Exception as ex:
                print('\nFailure to GET: {}{}\n'.format(self.site_root, query))
                print(ex)
                self.Log('ERROR: %s%s' % (self.site_root, query))
                raise RuntimeError('Failure to GET {}{}: {}'.format(self.site_root, query, ex))
        return data1

    def Close(self):
        """
        Close the currently active connection

        Safe to call when nothing is connected; the client state is reset
        either way.

        """
        if self.CONNECTED:
            self.conn.close()
            self.Log('%s - connection closed' % self.site_root)
        self.conn = None
        self.CONNECTED = False
        self.site_root = None
class MIRIAMTools(object):
    """
    Tools dealing with MIRIAM annotations

    """

    def MiriamURN2IdentifiersURL(self, urn):
        """
        Convert a "urn:miriam:<collection>:<id>" string into an
        "http://identifiers.org/<collection>/<id>" URL. The URL is printed and
        returned.

        - *urn* the MIRIAM URN string

        """
        # drop the URN prefix, then split collection from id at the first ':'
        stripped = urn.replace('urn:miriam:', '')
        parts = stripped.split(':', 1)
        collection = parts[0].strip()
        identifier = parts[1].strip()
        url = 'http://identifiers.org/%s/%s' % (collection, identifier)
        print(url)
        return url
class SemanticSBML(RESTClient, MIRIAMTools):
    """
    REST client for connecting to SemanticSBML services

    """

    data = None  # result of the last quickLookup(), a list of strings
    item_re = re.compile('<item>.+?</item>')

    def __init__(self):
        RESTClient.__init__(self)

    def quickLookup(self, txt):
        """
        Do a quick lookup for txt using SemanticSBML (connecting if required) and return results.
        Returns the list of <item> entries in the order the service returned them.

        - *txt* the string to lookup

        """
        if not self.CONNECTED:
            self.Connect("www.semanticsbml.org")
        # spaces become '+', each word is percent-encoded so that characters
        # such as '&' or '#' cannot break the query string
        words = txt.strip().split(' ')
        query = '+'.join(self.URLEncode(word) for word in words)
        self.data = self.Get("/semanticSBML/annotate/search.xml?q={}".format(query))
        self.data = self.parseXMLtoText(self.data)
        return self.data

    def viewDataInWebrowser(self, maxres=10):
        """
        Attempt to view #maxres results returned by SemanticSBML in the default browser

        - *maxres* default maximum number of results to display.

        """
        # no lookup done yet means there is nothing to show
        for cntr, u_ in enumerate(self.data or [], 1):
            try:
                webbrowser.open_new_tab(u_)
            except Exception:
                # not a bare except: Ctrl-C and interpreter exit still propagate
                print('ERROR in url {}'.format(u_))
            if cntr >= maxres:
                print('\nMaximum results reached, set \"maxres\" to increase')
                break

    def parseXMLtoText(self, xml):
        """
        Parse the xml output by quickLookup() into a list of the text found
        inside each <item> element

        - *xml* XML returns from SemanticSBML

        """
        return [i.replace('<item>', '').replace('</item>', '').strip() for i in re.findall(self.item_re, xml)]