![](leaf.png)
Project Summary
Description: This is a real-world application of Leaf: the protocol of a CNV study in a rare disease. Details cannot be disclosed because the study was still unpublished when its protocol was released as a Leaf example. For the same reason, links to output files are intentionally inactive.
Number of nodes: 40
Number of [F]-nodes: 21
Total number of output files: 122
Total size of output files: 3.48G
Total CPU time required: 01:03:31.05
Protocol map
![](PGLCNV.gif)
Node details
exportComRegGen
Exports CompareRegionAndGenes output to tsv format
Output file: CompareRegGenes.txt
Last build was: Sun Jan 8 03:47:10 2012.
Required CPU time: 00:00:0.08.
def exportComRegGen(data):
"""Exports CompareRegionAndGenes output to tsv format"""
f = open('CompareRegGenes.txt','w')
f.write('Gene\tSample\ts/t\tReg. start\tReg. end\tReg. size\tTranscript. start\tTranscript. end\tTranscript. size\tCode start\tCode end\tCode size\tInters. to Transcr. %\tInters. to Code %\n')
for line in data:
if line[0]!='NOT_FOUND':
for item in line:
if type(item)==float:
f.write(str(item)+'\t')
f.write('\n')
return 'CompareRegGenes.txt'
exportMergedRegs
Exports to tsv file
Output file: mergedRegsTB.tsv
Last build was: Sun Jan 8 03:47:16 2012.
Required CPU time: 00:00:0.13.
def exportMergedRegs(mergedRegs):
"""Exports to tsv file"""
f = open('mergedRegsTB.tsv', 'w')
f.write('B/T\tChr\tReg. Start\tReg. End\tNum Samples\tCN\t"Genes"\n')
for key in sorted(mergedRegs.keys()):
for i in range(0, len(mergedRegs[key][0])):
try:
f.write(('B' if key[0] == 's' else 'T') +'\t'+
key[1]+'\t'+
str(mergedRegs[key][0][i]).strip('[]').replace(', ','\t')+'\t'+
str(mergedRegs[key][1][i])+'\t'+
str(mergedRegs[key][3][i]).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
str(mergedRegs[key][2][i]).replace('set','').
replace(')','').
replace('(','').
replace('[','').
replace(']','').
replace("'", '')+'\n'
)
except: import pdb; pdb.set_trace()
return 'mergedRegsTB.tsv'
exportIntersMat_genes
Exports the output of intersMat_genes to csv format
Output file: verbose_inters_mat.txt
Last build was: Sun Jan 8 03:47:20 2012.
Required CPU time: 00:00:0.08.
def exportIntersMat_genes(genes, samples, inters_names):
"""Exports the output of intersMat_genes to csv format"""
f = open('verbose_inters_mat.txt', 'w')
f.write(' ,'+','.join(map(lambda x:x[0]+x[1], samples)) + '\n')
for idx1 in range(0,len(inters_names)):
f.write(samples[idx1][0]+samples[idx1][1]+',')
for idx2 in range(0,len(inters_names)):
f.write(str(list(inters_names[idx1][idx2])).strip('[]').replace('\'','').replace(',','') + ',')
f.write('\n')
return 'verbose_inters_mat.txt'
joinPennCNVout
Puts together the different outputs (one per sample) of PennCNV
Output file: pglfeb.csv_allNO14.out
Last build was: Sun Jan 8 03:45:59 2012.
Required CPU time: 00:00:0.11.
def joinPennCNVout(inputfiles):
"""Puts together the different ouptuts (1 per sample) of PennCNV"""
#log_ofile = open(fname + '_all.log', 'a')
if os.path.exists(fname + '_allNO14.out'):
os.remove(fname+'_allNO14.out')
out_ofile = open(fname + '_allNO14.out', 'a')
for ifile in inputfiles:
#log_ifname = ifile + '.log'
#log_ofile.write(open(log_ifname, 'r').read())
#if not '14B' in ifile and not '5B' in ifile and not '34' in ifile and not '33' in ifile:
#if not '14' in ifile and not '21' in ifile and not '14B' in ifile and not '5B' in ifile:
out_ofile.write(open(ifile, 'r').read())
out_ofile.close()
return fname + '_allNO14.out'
exportPerRegion
Exports to csv file
Output file: perRegion.csv
Last build was: Sun Jan 8 03:51:58 2012.
Required CPU time: 00:00:0.26.
def exportPerRegion(data):
"""Exports to tsv file"""
f = open('perRegion.csv','w')
sdict = idToNameDict()
f.write(data[0].replace('\t',',')+'\n')
for line in data[1:]:
l = line.replace(',','.').split()
l[6] = sdict[l[6]]
l = str(l).replace('\'', '').strip('[]')
f.write(l+'\n')
return 'perRegion.csv'
addGeneInfo
Annotate regions with intersecting genes
Output file: pglfeb.csv_allNO14.out.genes
Last build was: Sun Jan 8 03:46:51 2012.
Required CPU time: 00:00:51.95.
def addGeneInfo(fname):
"""Annotate regions with intersecting genes"""
#print 'SKIPPING'
#return fname + '.genes'
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt > "' + fname + '.genes1"')
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '.genes1" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt -expandleft 5m > "' + fname + '.genes2"')
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '.genes2" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt -expandright 5m > "' + fname + '.genes"')
os.system('rm "' + fname + '".genes1')
os.system('rm "' + fname + '".genes2')
#os.system('perl ' + penncnv_path + '/detect_cnv.pl -cctest -pfb ' + penncnv_path + '/lib/ho1v1.hg18.pfb -phenofile phenotype -cnv "' + fname + '" > "' + fname + '".CC')
return fname + '.genes'
sampleSheet
The name of the file containing sample IDs and names
Output file: sample_sheet.tsv
Last build was: Sun Jan 8 03:37:36 2012.
Required CPU time: 00:00:0.03.
def sampleSheet():
"""The name of the filename containing sample IDs and names"""
return('sample_sheet.tsv')
readFile
From this node on, the input no longer comes from files
Last build was: Sun Jan 8 03:46:51 2012.
Required CPU time: 00:00:0.06.
def readFile(fname):
"""From now on the input does not come from files"""
return open(fname).readlines()
distMatGfx
Produces an MDS visualization of the output of samplesDistMats
Output files: t_tani_distrib.pdf , t_tani_mds.pdf .
Last build was: Sun Jan 8 03:51:56 2012.
Required CPU time: 00:00:0.25.
def distMatGfx(samples, mats):
"""Produces an MDS visualization of the output of samplesDistMats"""
tani = mats[2]
test_couples=[['13t','14t'], ['33t1','33t2'], ['34t','34tL'], ['5t', '5Bt']]
#s = getSampleNames()
s = samples
tani_t = list()
tani_t_mat = list()
tids = list()
print 'Adding background distances :'
for idx1, s1 in enumerate(s):
matrow = list()
if 't' in s1:
tids.append(s1)
for idx2, s2 in enumerate(s):
if 't' in s2:
matrow.append(tani[idx1,idx2])
if s1 != s2:
tani_t.append(tani[idx1,idx2])
#print s[idx1] + '-' + s[idx2]+',',
tani_t_mat.append(matrow)
#print
#hist(tani_t,30)
test_dists=list()
sdict = idToNameDict()
for idx, item in enumerate(test_couples):
if item[0] not in s or item[1] not in s:
continue
s1 = s.index(item[0])
s2 = s.index(item[1])
d = tani[s1,s2]
test_dists.append(d)
#annotate('('+sdict[item[0]]+','+sdict[item[1]]+')', [d,10], [d,10*idx+20])
print d
mds=r.cmdscale(array(tani_t_mat))
#savefig('t_tani_distrib.pdf')
#figure()
#plot(mds[:,0], mds[:,1], '.')
for idx, item in enumerate(tids):
pass
#annotate(sdict[item], [mds[idx,0], mds[idx,1]])
#savefig('t_tani_mds.pdf')
return ['t_tani_distrib.pdf', 't_tani_mds.pdf']
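The MDS step above calls R's cmdscale through rpy, i.e. classical (metric) multidimensional scaling of the tumour-only Tanimoto distance matrix. As a point of reference only, here is a minimal NumPy sketch of the same computation (double centering of the squared distances followed by an eigendecomposition); it is an illustration, not part of the protocol, and the helper name is made up.

import numpy as np

def classical_mds_sketch(dist, k=2):
    """Coordinates in k dimensions whose Euclidean distances approximate `dist`."""
    d = np.asarray(dist, dtype=float)
    n = d.shape[0]
    j = np.eye(n) - np.ones((n, n)) / n        # centering matrix
    b = -0.5 * j.dot(d ** 2).dot(j)            # double-centered squared distances
    evals, evecs = np.linalg.eigh(b)           # symmetric eigendecomposition (ascending)
    top = np.argsort(evals)[::-1][:k]          # keep the k largest eigenvalues
    return evecs[:, top] * np.sqrt(np.maximum(evals[top], 0.0))

# Toy 3x3 Tanimoto-like distance matrix -> 3 points in the plane.
toy = np.array([[0.0, 0.2, 0.9],
                [0.2, 0.0, 0.8],
                [0.9, 0.8, 0.0]])
xy = classical_mds_sketch(toy, k=2)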
intersectTBRegs
Intersects all CNV T regions against all CNV B regions
Last build was: Sun Jan 8 19:44:57 2012.
Required CPU time: 00:00:7.36.
def intersectTBRegs(getSampleNames, manualClean):
"""Intersects all CNV T regions against all CNV B regions"""
f,h = PennCNVParser()
manualClean = [f.split(l) for l in manualClean]
num = dict()
num['s'] = sum(['s' in s for s in getSampleNames])
num['t'] = sum(['t' in s for s in getSampleNames])
regsets = dict()
chrs = set([l[1] for l in manualClean])
for samp in getSampleNames:
for chrom in chrs:
sub = [l for l in manualClean if l[7]==samp and l[1] == chrom]
st = 'B' if 's' in samp else 'T'
regset = [[int(l[2]),int(l[3]), l[10], l[6], st] for l in sub]
if not chrom in regsets.keys():
regsets[chrom] = list()
if not sub == []:
regsets[chrom].append(regset)
a = dict()
for chrom in chrs:
# if chrom == '13':
# import pdb; pdb.set_trace()
a[chrom]=intersRegs([sorted(s) for s in regsets[chrom]])
return a
exportToCircos
Exports to a file suitable for use with Circos
Last build was: Sun Jan 8 03:51:59 2012.
Required CPU time: 00:00:0.02.
def exportToCircos(samples, x):
"""Exports to a file suitable for use with Circos"""
return "tobefixed!"
x=[l.split('\t') for l in x][1:]
x=[[l[0],l[1],l[2],l[5],l[6]] for l in x]
fnames = list()
for sample in samples:
fname = 'circosInput_' + sample + '.txt'
fnames.append(fname)
f = open(fname, 'w')
subx = [l[0:4] for l in x if l[4]==sample]
f.writelines("hs%s\n" % str(item).strip('[]').replace('\'','').replace(',','') for item in subx)
return(fnames)
intersectRegs
Intersects all CNV regions of T and B samples separately
Last build was: Sun Jan 8 03:47:15 2012.
Required CPU time: 00:00:5.19.
def intersectRegs(getSampleNames, manualClean):
"""Intersects all CNV regions of T and B samples separately"""
f,h = PennCNVParser()
manualClean = [f.split(l) for l in manualClean]
num = dict()
num['s'] = sum(['s' in s for s in getSampleNames])
num['t'] = sum(['t' in s for s in getSampleNames])
regsets = dict()
chrs = set([l[1] for l in manualClean])
for samp in getSampleNames:
for chrom in chrs:
sub = [l for l in manualClean if l[7]==samp and l[1] == chrom]
regset = [[int(l[2]),int(l[3]), l[10], l[6]] for l in sub]
if 's' in samp:
if not ('s',chrom) in regsets.keys():
regsets['s', chrom] = list()
regsets['s',chrom].append(regset)
else:
if not ('t',chrom) in regsets.keys():
regsets['t', chrom] = list()
regsets['t',chrom].append(regset)
a = dict()
for chrom in chrs:
a['s',chrom]=intersRegs([sorted(s) for s in regsets['s',chrom]])
a['t',chrom]=intersRegs([sorted(s) for s in regsets['t',chrom]])
return a
reformat2
Dummy filter to skip manual clean phase
Last build was: Sun Jan 8 03:51:58 2012.
Required CPU time: 00:00:1.83.
def reformat2(ifile):
'Dummy filter to skip manual clean phase'
f, header = PennCNVParser()
ofile = list()
ofile.append('\t'.join(header))
p = getPairings()
for line in ifile:
newline = (f.split(line)[1:-1])
try:
t=newline.pop(10)
except IndexError:
print line
if newline[6] in [x[0] for x in p]:
newline.insert(7,'s')
else:
newline.insert(7,'t')
genes = newline[10].split(',')
for gene in genes:
newline[10]=gene
ofile.append('\t'.join(newline))
return ofile
genoTypeCheck
Builds a matrix of the % of genotype differences between each pair of samples
Last build was: Sun Jan 8 19:42:16 2012.
Required CPU time: 00:02:57.52.
def genoTypeCheck(fnames):
"""Builds a matrix of % of genotype differences between each couple of samples"""
data=[]
for ifile in fnames:
curgtype = list()
cursamp = open(ifile).readlines()
for line in cursamp:
line = line.split('\t')
curgtype.append(line[3])
data.append(curgtype)
data=array(data)
differences=list()
f = open('gtypecheck_small.txt', 'w')
sdict = idToNameDict()
for sample1 in data:
templist = list()
for sample2 in data:
templist.append(float(sum(array(sample1)!=array(sample2))) / len(sample1) * 100)
try:
print sdict[sample1[0].replace('.GType','')], 'vs', sdict[sample2[0].replace('.GType','')],':','\t', templist[-1], '\t'
#print templist[-1]
if templist[-1] < 20 and sdict[sample1[0].replace('.GType','')] != sdict[sample2[0].replace('.GType','')]:
f.write(sdict[sample1[0].replace('.GType','')] + ' vs ' + sdict[sample2[0].replace('.GType','')] + '\t' + str(templist[-1]) + '\n')
differences.append(templist)
f = open('gtypecheck.txt', 'w')
for idx, item in enumerate(differences):
f.write(sdict[data[idx][0].replace('.GType','')]+' ')
f.write('\n')
for idx, item in enumerate(differences):
f.write(' ' + sdict[data[idx][0].replace('.GType','')])
f.writelines(str(array(item)).strip('[]').replace('\n','')+'\n')
except:
print 'Skipping couple: ' + sample1[0] +', '+sample2[0]
return differences
getGeneNames
Extract the names of all annotated genes
Last build was: Sun Jan 8 03:47:16 2012.
Required CPU time: 00:00:0.05.
def getGeneNames(f):
"""Extract the names of all annotated genes"""
data=list()
for line in f:
nline=line[0:-1]
t = nline.split()
data.extend(t[7].split(','))
data=array(data)
return unique(data)
countCNVlenghts
Computes percentage of genome affected by CNV
Last build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.36.
def countCNVlenghts(getSampleNames, reformat):
"""Computes percentage of genome affected by CNV"""
di = idToNameDict()
y=[l.split('\t') for l in reformat]
glength=float(2867188341)
res = []
for s in getSampleNames:
try:
sub=[l for l in y if l[6]==s]
lens = [int(l[2])-int(l[1]) for l in sub]
res.append([di[s] , str(sum(lens)) , str(100*(sum(lens) / glength))])
except:
import pdb; pdb.set_trace()
for st in ['s', 't']:
sub = [l for l in y if l[7]==st]
lens = [int(l[2])-int(l[1]) for l in sub]
if st=='s':
label = 'Blood'
else:
label = 'Tumor'
print label + ' average: ' + str(sum(lens) /
(len(getSampleNames)/float(2))) +'\t' + str(100 * (sum(lens)/ (len(getSampleNames)/float(2)) / glength) )
return res
addStats
Adds descriptive statistics
Last build was: Sun Jan 8 03:47:17 2012.
Required CPU time: 00:00:0.99.
def addStats(fname):
"""Adds descriptive statistics"""
data=list()
f = fname
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
genes = unique(data[:,10])
p = getPairings()
s=list()
for samp in p[0]:
for raw in data:
if raw[6] == samp:
s.append(raw)
s=array(s)
t=list()
for samp in p[1]:
for raw in data:
if raw[6] == samp:
t.append(raw)
t=array(t)
gene_count = dict()
for gene in genes:
if gene == 'NOT_FOUND':
gene_count[gene]=(0,0,0,0)
else:
s_where = s[nonzero(s[:,10]==gene)]
t_where = t[nonzero(t[:,10]==gene)]
s_count = sum(s[:,10]==gene)
t_count = sum(t[:,10]==gene)
s_diffcount = len(unique(s_where[:,6]))
t_diffcount = len(unique(t_where[:,6]))
gene_count[gene]=(s_count, t_count, s_diffcount, t_diffcount)
of=list()
for idx, line in enumerate(data):
if idx==0:
of.append('\t'.join(line) + '\ts hits \tt hits\thits diff.\ts un. hits\tt un. hits\t un. hits diff.')
else:
currinfo = gene_count[line[10]]
of.append('\t'.join(line)+\
'\t'+str(currinfo[0])+\
'\t'+str(currinfo[1])+\
'\t'+str(abs(currinfo[0]-currinfo[1]))+\
'\t'+str(currinfo[2])+\
'\t'+str(currinfo[3])+\
'\t'+str(abs(currinfo[2]-currinfo[3])))
#'\t'+str(pvalue(currinfo[2], 8-currinfo[2], currinfo[3], 8-currinfo[3]).two_tail)+'\n')
return of
exportCNVlenghts
[documentation not available]
Output file: cnv_lenghts.tsv
Last build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.02.
def exportCNVlenghts(countCNVl):
f = open('cnv_lenghts.tsv', 'w')
for line in countCNVl:
f.writelines(str(line).strip('[]').replace(',','\t').replace("'",'')+'\n')
return 'cnv_lenghts.tsv'
exportDistMats
Exports the output of samplesDistMats to csv format
Output files: inters_mat.txt , notIntersMat.txt , tanimotoMat.txt .
Last build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.03.
def exportDistMats(samples, mats):
"""Exports the output of samplesDistMats to csv format"""
intMat=mats[0]
notIntMat=mats[1]
tanimotoMat=mats[2]
f_intMat = open('intersMat.txt','w')
f_notIntMat = open('notIntersMat.txt','w')
f_tanimotoMat = open('tanimotoMat.txt','w')
f_intMat.write(' ,'+','.join(samples) + '\n')
f_notIntMat.write(' ,'+','.join(samples) + '\n')
f_tanimotoMat.write(' ,'+','.join(samples) + '\n')
for idx, item in enumerate(intMat):
f_intMat.write(samples[idx]+',')
f_intMat.write(str(list(intMat[idx])).strip('[]').replace('.0','').replace(' ','')+'\n')
f_notIntMat.write(samples[idx]+',')
f_notIntMat.write(str(list(notIntMat[idx])).strip('[]').replace('.0','').replace(' ','')+'\n')
f_tanimotoMat.write(samples[idx]+',')
f_tanimotoMat.write(str(list(tanimotoMat[idx])).strip('[]').replace(' ','')+'\n')
return ['inters_mat.txt', 'notIntersMat.txt', 'tanimotoMat.txt']
intersMat_genes
Produces a list of common genes between each pair of samples
Last build was: Sun Jan 8 03:47:19 2012.
Required CPU time: 00:00:0.38.
def intersMat_genes(dataset, fname, genes, samples):
"""Produces a list of common genes between each couple of samples"""
inters_names = list()
for idx1, geneline1 in enumerate(dataset):
inters_names_t = list()
for idx2, geneline2 in enumerate(dataset):
inters_names_t.append(genes[nonzero(logical_and(geneline1, geneline2))])
inters_names.append(inters_names_t)
return inters_names
reformat
Change format from region-centric to gene-centric
Last build was: Sun Jan 8 03:46:56 2012.
Required CPU time: 00:00:0.69.
def reformat(ifile):
"""Change format from region-centric to gene-centric"""
f, header = PennCNVParser()
ofile = list()
ofile.append('\t'.join(header))
p = getPairings()
for line in ifile:
newline = (f.split(line)[1:-1])
try:
t=newline.pop(10)
except IndexError:
print line
if newline[6] in [x[0] for x in p]:
newline.insert(7,'s')
else:
newline.insert(7,'t')
genes = newline[10].split(',')
for gene in genes:
newline[10]=gene
ofile.append('\t'.join(newline))
return ofile
CNVboolVects
Creates a matrix of presence/absence of CNV in each sample for each gene (out of all genes affected by CNV in at least one sample)
Last build was: Sun Jan 8 03:47:18 2012.
Required CPU time: 00:00:0.35.
def CNVboolVects(raw, allsamples):
"""Creates a matrix of presence/absence of CNV in each sample for each gene (out of all the gene that are affected by CNV in at least 1 sample"""
data=list()
f=raw
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
genes = unique(data[1:-1,10])
hashd = dict( zip( genes, range( 0,len(genes) ) ) )
#samples = getSampleNames()
samples = allsamples
s=data
vectors = np.zeros((len(samples),len(genes)))
for s_idx, sample in enumerate(samples):
sample_genes = s[s[:,6]==sample][:,10]
for gene in sample_genes:
vectors[s_idx][hashd[gene]]=1
return vectors
LogR_BAF_FileName
The name of the file containing Log R Ratio and B Allele Frequency data
Output file: pglfeb.csv
Last build was: Sun Jan 8 03:37:36 2012.
Required CPU time: 00:00:0.03.
def LogR_BAF_FileName():
"""The name of the file containing Log R Ratio and B Allele Frequency data"""
return(fname)
clustergram
Calls R to make a clustergram
Output files: clustergram_001_108.png , clustergram_005_248.png , clustergram_05_1606.png , heatmap_001_108.png , heatmap_005_248.png , heatmap_05_1606.png .
Last build was: Sun Jan 8 19:44:49 2012.
Required CPU time: 00:00:57.72.
def clustergram(diffmat, fisher, genes, samples):
"""Calls R to make a clustergram"""
#samplesNoLetter = [item.replace('t','').replace('s','') for item in samples]
#samples = []
#[samples.append(i) for i in samplesNoLetter if not i in samples]
p=getPairings()
sdict = idToNameDict()
s2 = list()
for i in range(0,len(p)):
s2.append(sdict[p[i][0]]+' vs '+sdict[p[i][1]])
samples = s2
d=diffmat[0]
gdict = geneChrDict()
#import pdb; pdb.set_trace()
thresholds = [0.01, 0.05, 0.5]
cfnames = list()
hfnames = list()
for t in thresholds:
ok_genes=list()
ok_gnames=list()
for idx, gene_row in enumerate(d):
if fisher[1][idx] < t:
if str.lower(genes[idx]) != 'loc100130872-spon2' and str.lower(genes[idx]) != 'loc647859':
ok_genes.append(gene_row)
ok_gnames.append(genes[idx])
print 'Selected genes: ' + str(len(ok_gnames))
for idx, gene in enumerate(ok_gnames):
if len(gdict[gene])==1:
chr = '0'+ gdict[gene]
else:
chr = gdict[gene]
ok_gnames[idx]= chr + ' ' + gene
ok_genes=array(ok_genes)
r.library('gplots')
colors = r.redgreen(len(unique(ok_genes)))
colors = colors[::-1]
#colors[2]='#880000'
colors[3]='#880000'
#return ok_genes, samples, ok_gnames, colors
cfnames.append('clustergram_' + str(t).replace('.','') + '_' + str(len(ok_gnames)) + '.png')
hfnames.append('heatmap_' + str(t).replace('.','') + '_' + str(len(ok_gnames)) + '.png')
callRforClustergram(cfnames[-1], ok_genes, samples, ok_gnames, colors)
makeHeatmap(hfnames[-1], ok_genes, samples, ok_gnames, colors)
return hstack([cfnames, hfnames])
PennCNV
Runs PennCNV to detect CNV regions
Output files: pglfeb.csv_11s.tab.out , pglfeb.csv_11t.tab.out , pglfeb.csv_12s.tab.out , pglfeb.csv_12t.tab.out , pglfeb.csv_13s.tab.out , pglfeb.csv_13t.tab.out , pglfeb.csv_14Bt.tab.out , pglfeb.csv_14s.tab.out , pglfeb.csv_19s.tab.out , pglfeb.csv_19t.tab.out , pglfeb.csv_1s.tab.out , pglfeb.csv_1t.tab.out , pglfeb.csv_20s.tab.out , pglfeb.csv_20t.tab.out , pglfeb.csv_21s.tab.out , pglfeb.csv_21t.tab.out , pglfeb.csv_2s.tab.out , pglfeb.csv_2t.tab.out , pglfeb.csv_32s.tab.out , pglfeb.csv_32t.tab.out , pglfeb.csv_33s.tab.out , pglfeb.csv_33t1.tab.out , pglfeb.csv_34s.tab.out , pglfeb.csv_34t.tab.out , pglfeb.csv_36s.tab.out , pglfeb.csv_36t.tab.out , pglfeb.csv_37s.tab.out , pglfeb.csv_37t.tab.out , pglfeb.csv_3s.tab.out , pglfeb.csv_3t.tab.out , pglfeb.csv_43s.tab.out , pglfeb.csv_43t.tab.out , pglfeb.csv_44s.tab.out , pglfeb.csv_44t.tab.out , pglfeb.csv_45s.tab.out , pglfeb.csv_45t.tab.out , pglfeb.csv_4s.tab.out , pglfeb.csv_4t.tab.out , pglfeb.csv_5Bt.tab.out , pglfeb.csv_5s.tab.out , pglfeb.csv_5t.tab.out , pglfeb.csv_6s.tab.out , pglfeb.csv_6t.tab.out , pglfeb.csv_7s.tab.out , pglfeb.csv_7t.tab.out , pglfeb.csv_8s.tab.out , pglfeb.csv_8t.tab.out .
Last build was: Sun Jan 8 19:10:54 2012.
Required CPU time: 00:52:42.29.
def PennCNV(inputfiles):
"""Runs PennCNV to detect CNV regions"""
ofiles = list()
for ifile in inputfiles:
cmdstr = 'perl ' + penncnv_path + \
'/detect_cnv.pl -test -hmm ' + \
penncnv_path + '/lib/hhall.hmm -pfb ' + \
penncnv_path + '/lib/ho1v1.hg18.pfb -gcmodelfile ' + \
penncnv_path + '/lib/ho1v1.hg18.gcmodel ' + \
ifile + \
' -log "' + ifile + '.log" ' + \
'-out "' + fname + '_' + ifile + '.out"'
print 'Executing: ' + cmdstr
#print 'WARNING: skipping PennCNV call! Uncomment to really call.'
os.system(cmdstr)
#ofiles.append(lp.newFile(fname + '_' + ifile + '.out'))
ofiles.append(fname + '_' + ifile + '.out')
return ofiles
getSampleNames
Extracts sample names
Last build was: Sun Jan 8 03:47:10 2012.
Required CPU time: 00:00:0.03.
def getSampleNames(fname):
"""Extracts sample names"""
b=open(fname).readlines()
t=list()
p = getPairedSampleIDs()
for l in b[1:]:
s = l.split()[0]
#if s in p:
t.append(s)
return t
CompareRegionAndGenes
Computes intersections between regions and genes
Last build was: Sun Jan 8 03:47:09 2012.
Required CPU time: 00:00:13.22.
def CompareRegionAndGenes(data):
"""Computes intersections between regions and genes"""
ref = open(penncnv_path+'/ucsc_hg18/refGene.txt').readlines()
for idx, line in enumerate(ref):
ref[idx] = line.split()
refgenes = [ref[i][12] for i in range(0, len(ref))]
res = list()
for idx, line in enumerate(data[1:]):
if mod(idx, 100)==0:
print 'Intersecting: '+str(idx)+' of '+str(len(data)-1)
line = line.split('\t')
gene = line[10]
try:
where = refgenes.index(gene)
int1 = regIntersect([int(line[1]), int(line[2])], [int(ref[where][4]), int(ref[where][5])])
int2 = regIntersect([int(line[1]), int(line[2])], [int(ref[where][6]), int(ref[where][7])])
#res.append(['***********************************************'])
res.append([
gene,
line[6], #sample
line[7], #s/t
int(line[1]), #reg start
int(line[2]), #reg end
int(line[2])-int(line[1]), # reg size
int(ref[where][4]), #trans start
int(ref[where][5]), #trans end
int(ref[where][5])-int(ref[where][4]),
int(ref[where][6]), #code start
int(ref[where][7]), #code end
int(ref[where][7])-int(ref[where][6]),
int1, #int trans
int2, #int code
])
except:
res.append([
gene,
line[6],
line[7],
int(line[1]),
int(line[2]),
'?', '?', '?', '?', '?', '?','?','?'
])
return res
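regIntersect is a helper defined elsewhere in the protocol. Judging from the "Inters. to Transcr. %" and "Inters. to Code %" columns written by exportComRegGen, it appears to report how much of the gene interval the CNV region covers; the real helper may differ. A minimal sketch under that assumption (hypothetical name, inclusive coordinates assumed):

def regIntersect_sketch(reg, target):
    """reg, target: [start, end] pairs; overlap expressed as % of the target length."""
    overlap = min(reg[1], target[1]) - max(reg[0], target[0]) + 1
    if overlap <= 0:
        return 0.0                                   # disjoint intervals
    return 100.0 * overlap / float(target[1] - target[0] + 1)

# Example: a CNV region covering about a quarter of a 2 kb transcript.
# regIntersect_sketch([1000, 2000], [1500, 3499])  ->  25.05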
exportFigExtRegs
Creates a figure for the extended regions using R
Output file: extRegs.pdf
Last build was: Sun Jan 8 19:45:02 2012.
Required CPU time: 00:00:3.74.
def exportFigExtRegs(frags):
"""Creates a figure for the extended regions using R"""
r.library('quantsmooth')
chrwidth = 10
chrspacing = 30
r.pdf(file='extRegs.pdf', height=10, width=12)
#maxarmlen = max(
maxlen = max([a[0][1] for a in frags['1']])
r.plot(1,type="n",xlim=[0,(chrspacing+chrwidth)*(len(frags)-1)],ylim=[0, maxlen],xaxt="n",yaxt="n", ann=False, frame_plot=False, tick_number=0, xlab="Autosomes")
keys = [int(k) for k in frags]
skeys = sort(keys)
tickat = list()
for cidx,chrom in enumerate([str(k) for k in skeys]):
offs = cidx*(chrspacing+chrwidth)
#maxy = max([a[0][1] for a in frags[chrom]])
tothits = sum([a[0][1]-a[0][0] for a in frags[chrom][0::2]])
maxlen = max([a[0][1] for a in frags[chrom]])
maxy = r.lengthChromosome(chrom, units = 'bases')[chrom]
r.lines([offs, offs], [0,maxy])
for fidx, frag in enumerate(frags[chrom]):
if frag[1]=='' and frag[2]=='':
continue
y = (frag[0][0]+float(frag[0][1]))/2
y = (y * maxy) / maxlen
l = frag [0][1]-frag[0][0]+1
lB = int(frag[1])
lT = int(frag[2])
Ts = sum([a=='T' for a in frag[3]])
Bs = sum([a=='B' for a in frag[3]])
#Ts_p = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
#Ts_n = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
#Bs_p = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
#Bs_n = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
Ts_p = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
Ts_n = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
Bs_p = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
Bs_n = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
Tgain = Ts_p - Ts_n
Bgain = Bs_p - Bs_n
#Tcol = 'red' if Tgain < 0 else 'green'
#Bcol = 'red' if Bgain < 0 else 'green'
Dcol = 'darkred' if Tgain - Bgain > 0 else 'darkgreen'
armlen = int(abs(lT - lB))*1.5
if max(lB, lT)>0:
if Dcol == 'darkred':
r.lines([offs +chrwidth-2 , armlen + offs +chrwidth-2], [maxy-y,maxy-y], col=Dcol)
else:
r.lines([offs -armlen -chrwidth+2, offs -chrwidth+2], [maxy-y,maxy-y], col=Dcol)
tickat.append(offs)
print chrom, tothits/float(maxy)
bleachv = 1-min((tothits/float(maxy))/0.08,1)
r.paintCytobands(chrom, pos=[offs+chrwidth/2, maxy], width=chrwidth, units="bases", orientation='v', legend=False, bleach=bleachv)
# r.axis(at=tickat, labels=range(1,23), cex_axis=1, side=range(1,23), mgp=[3,.5,0], xlab='Autosomes', main='Genomic regions affected by CNV')
r.axis(at=tickat, lty=0, labels=range(1,23), cex_axis=1, side=range(1,23), xlab='Autosomes', main='Genomic regions affected by CNV', mgp=[3,-0.8,0])
# if Bgain:
# #r.lines([-log10(l) + offs, 0 + offs], [y,y], col=Bcol)
# r.lines([offs - lB -1, 0 + offs -1], [y,y], col=Bcol)
# r.lines([offs - lB -1, 0 + offs -1], [y,y], col=Bcol)
# if Tgain:
# #r.lines([0 + offs, log10(l) + offs], [y,y], col=Tcol)
# r.lines([0 + offs +1 , lT + offs +1], [y,y], col=Tcol)
# r.lines([0 + offs +1 , lT + offs +1], [y,y], col=Tcol)
r.dev_off()
return('extRegs.pdf')
manualClean
Removes a list of CNVi-related genes produced elsewhere
Last build was: Sun Jan 8 03:46:55 2012.
Required CPU time: 00:00:3.48.
def manualClean(x):
"""Removes a list of CNVi-related genes produced elsewhere"""
no=[int(n) for n in open('manuallyRemovedCNVs.txt').read().split()]
cleandata = list()
f = open('cleanedCNVs.txt', 'w')
for idx, line in enumerate(x):
if not (idx+1 in no):
cleandata.append(line)
f.write(line)
return cleandata
samplesDistMats
Computes Tanimoto distance between samples
Last build was: Sun Jan 8 03:47:21 2012.
Required CPU time: 00:00:0.24.
def samplesDistMats(dataset):
"""Computes Tanimoto distance between samples"""
inters_mat = squareform(pdist(dataset, lambda x,y: sum(logical_and(x,y))))
for idx in range(0,dataset.shape[0]):
inters_mat[idx,idx]=sum(dataset,axis=1)[idx]
notIntersMat = squareform(pdist(dataset, lambda x,y: sum(~logical_and(x,y))))
tanimotoMat = squareform(pdist(dataset, lambda x,y: 1 - sum(logical_and(x,y))/float(sum(logical_or(x,y))) ))
return [inters_mat, notIntersMat, tanimotoMat]
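The Tanimoto distance computed by the last pdist call above is one minus the ratio between the number of genes hit by CNV in both samples and the number hit in either sample. A two-sample toy example of the same formula, for clarity only:

import numpy as np

x = np.array([1, 1, 0, 1, 0], dtype=bool)    # genes hit by CNV in sample A
y = np.array([1, 0, 0, 1, 1], dtype=bool)    # genes hit by CNV in sample B

inters = np.logical_and(x, y).sum()           # hit in both samples -> 2
union = np.logical_or(x, y).sum()             # hit in either sample -> 4
tanimoto = 1 - inters / float(union)          # -> 0.5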
exportPerRegion_clean
Dummy node to skip manual filtering phase
Output file: perRegion_clean.csv
Last build was: Sun Jan 8 19:45:02 2012.
Required CPU time: 00:00:0.14.
def exportPerRegion_clean(data):
"""Dummy node to skip manual filtering phase"""
f = open('perRegion_clean.csv','w')
sdict = idToNameDict()
f.write(data[0].replace('\t',',')+'\n')
for line in data[1:]:
l = line.replace(',','.').split()
l[6] = sdict[l[6]]
l = str(l).replace('\'', '').strip('[]')
f.write(l+'\n')
return 'perRegion_clean.csv'
exportCNVDiffMat
Exports to csv file
Output file: CNVgenes.csv
Last build was: Sun Jan 8 19:45:15 2012.
Required CPU time: 00:00:12.21.
def exportCNVDiffMat(cnvdiff, cnvdata, fisher, genes, samples):
"""Exports to tsv file"""
p = getPairings()
samples = list()
for a in p:
samples.extend([a[0], a[1]])
samples = samplesToNames(samples)
data = list()
for line in cnvdata:
data.append(line.split('\t'))
data = array(data)
vects, vs, vt = cnvdiff[0], cnvdiff[1], cnvdiff[2]
gdict = geneChrDict()
for i in range(len(samples),0,-1):
if mod(i,2)==0:
samples.insert(i, samples[i-1]+'-'+samples[i-2])
header =['gene', 'Chr',
'B gain', 'B loss', 'B CNV',
'T gain', 'T loss', 'T CNV',
'B CNV + T CNV',
'TvsB loss','TvsB gain', 'TvsBgain+TvsBloss', 'TvsB loss/gain p-value',
'max CNV', 'Reg. Max length',
'CNV samples (repeated if hit in multiple regions)']
header = hstack([header, samples])
fisher_abs = fisher[0]
fisher_diff = fisher[1]
f = open('CNVgenes.csv', 'w')
f.write(str(header).strip('[]').replace('\n','').replace('\' \'',',').strip('\'') + '\n')
for idx, vect in enumerate(vects):
#gene name
f.write(genes[idx])
#if genes[idx]=='ACOT11':
# import pdb; pdb.set_trace()
#chromosome
f.write(',' + str(gdict[genes[idx]]))
#cnv stats
cnvBgain = sum(vs[idx]>2)
cnvBloss = sum(vs[idx]<2)
cnvB = cnvBgain + cnvBloss
cnvTgain = sum(vt[idx]>2)
cnvTloss = sum(vt[idx]<2)
cnvT = cnvTgain + cnvTloss
f.write(', ' + str(cnvBgain) + ',' + str(cnvBloss) + ',' + str(cnvB))
f.write(', ' + str(cnvTgain) + ',' + str(cnvTloss) + ',' + str(cnvT))
f.write(', ' + str(cnvB+cnvT))
# f.write(', ' + str(cnvB) + ',' + str(cnvT) + ',' + str(cnvB+cnvT))
#fisher on B vs T cnvs
#f.write(','+str(fisher_abs[idx]))
#differential cnvs stats
f.write(str(',' +
str(sum(vect<0))+','+
str(sum(vect>0))+','+
str(sum(vect<0)+
sum(vect>0))+',')
.replace('\n',''))
#fisher on s vs t differential cnvs
f.write(',' + str(fisher_diff[idx]) + ',')
# max CNV
cns = hstack([vs[idx],vt[idx]])
maxcnv = cns[argmax(abs(2-cns))]
f.write(str(maxcnv) + ',')
# per gene additional info
info = extractPerGeneProperties(data, genes[idx])[1:]
for idx2, item in enumerate(info):
if not item=='nan':
if idx2 in [3,4,5]:
f.write(item.replace(',','').replace('\n','').replace('\'', '').replace('[','').replace(']','')+',')
#cnv per sample
f.write(str(zip(vs[idx],vt[idx],vect)).strip('[]').replace('(','').replace(')','').replace('\n','').replace('.0',''))
f.write('\n')
return 'CNVgenes.csv'
exportMergedFragments
Exports extended regions to tsv file
Output file: extRegions.tsv
Last build was: Sun Jan 8 19:45:15 2012.
Required CPU time: 00:00:0.09.
def exportMergedFragments(mf):
'''Exports extended regions to tsv file'''
f = open('extRegions.tsv', 'w')
f.write('Chr.\t'+
'ExtReg. Start\t'+
'ExtReg. End\t'+
'ExtReg. Length\t'+
'Max hits B\t'+
'Max hits T\t'+
'B/T\t'+
'CN\t'+
'"Genes"\n')
for chrom in mf:
for extreg in mf[chrom]:
f.write(
#chromosome
chrom+'\t'+
#reg start
str(extreg[0][0])+'\t'+
#reg end
str(extreg[0][1])+'\t'+
#reg length
str(extreg[0][1]-extreg[0][0]+1)+'\t'+
# max hits B
str(extreg[1])+'\t'+
# max hits T
str(extreg[2])+'\t'+
#BT
str(extreg[3]).strip('[]').replace("'",'').replace(',','')+'\t'+
#CN
str(extreg[4]).strip('[]').replace("'",'').replace(',','')+'\t'+
#genes
str(extreg[5]).replace('set','').replace('(','').replace(')','')
.replace('[','').replace(']','').replace("'",'')+'\n'
)
return 'extRegions.tsv'
exportMergedRegsTB
Exports to tsv file
Output file: mergedRegsTB.tsv
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.66.
def exportMergedRegsTB(mergedRegs):
"""Exports to tsv file"""
f = open('mergedRegs.tsv', 'w')
f.write('Chr.\tReg. Start\tReg. End\tReg. Length\tNum Samples\tCN-B\tCN-T\tnB\tnT\t"Genes"\n')
for key in sorted(mergedRegs.keys()):
for i in range(0, len(mergedRegs[key][0])):
# try:
try:
cnb = [cn for idx, cn in enumerate(mergedRegs[key][3][i])
if mergedRegs[key][4][i][idx]=='B']
cnt = [cn for idx, cn in enumerate(mergedRegs[key][3][i])
if mergedRegs[key][4][i][idx]=='T']
except:
import pdb; pdb.set_trace()
f.write(
# Chr
key+'\t'+
#Reg. Start e Reg. End
str(mergedRegs[key][0][i]).strip('[]').replace(', ','\t')+'\t'+
#Reg. Length
str(mergedRegs[key][0][i][1]-mergedRegs[key][0][i][0]+1)+'\t'+
#Num Samples
str(mergedRegs[key][1][i])+'\t'+
#CN-B
str(cnb).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
#CN-T
str(cnt).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
#nB
str(sum([st=='B' for st in mergedRegs[key][4][i]]))+'\t'+
#nT
str(sum([st=='T' for st in mergedRegs[key][4][i]]))+'\t'+
#Genes
str(mergedRegs[key][2][i]).replace('set','').
replace(')','').
replace('(','').
replace('[','').
replace(']','').
replace("'", '')+'\n'
)
# except: import pdb; pdb.set_trace()
return 'mergedRegsTB.tsv'
mergeFragments
Merges all contiguous non-void region fragments
Last build was: Sun Jan 8 19:44:58 2012.
Required CPU time: 00:00:0.37.
def mergeFragments(intRegs):
"""Merges all contiguous non-void region fragments"""
extreg=dict()
for chrom in intRegs:
extreg[chrom] = list()
currExtRegBT = list()
currExtRegCN = list()
currExtRegS = [Inf,0]
currExtRegG = set()
currExtRegMaxhitsB = 0
currExtRegMaxhitsT = 0
for idx, frag in enumerate(intRegs[chrom][4]):
a = intRegs[chrom][0][idx][0]
b = intRegs[chrom][0][idx][1]
#import pdb; pdb.set_trace()
if frag != []:
currExtRegMaxhitsT = max(currExtRegMaxhitsT, sum([a1=='T' for a1 in frag]))
currExtRegMaxhitsB = max(currExtRegMaxhitsB, sum([a1=='B' for a1 in frag]))
currExtRegBT.extend(frag)
currExtRegCN.extend(intRegs[chrom][3][idx])
currExtRegS = [min(currExtRegS[0], a), max(currExtRegS[1],b)]
currExtRegG = currExtRegG.union(intRegs[chrom][2][idx])
else:
extreg[chrom].append([currExtRegS, currExtRegMaxhitsB,
currExtRegMaxhitsT, currExtRegBT, currExtRegCN, currExtRegG])
extreg[chrom].append([[a,b], '', '', '', '', 'unknown'])
currExtRegCN=list()
currExtRegBT=list()
currExtRegG = set()
currExtRegS = [Inf,0]
currExtRegMaxhitsT = 0
currExtRegMaxhitsB = 0
# handle last fragment (will get no '[]' as stop signal)
if frag != []:
extreg[chrom].append([currExtRegS, currExtRegMaxhitsB,
currExtRegMaxhitsT, currExtRegBT, currExtRegCN, currExtRegG])
return extreg
computeFisher
Tests the hypothesis that a gene is hit by CNV in M samples out of N by chance
Last build was: Sun Jan 8 19:43:52 2012.
Required CPU time: 00:00:8.96.
def computeFisher(cnvect):
"""Tests the hyphotesis that a gene is hit by CNV in M samples out of N by chance"""
svec = cnvect[1]
tvec = cnvect[2]
diff = cnvect[0]
numsamp = len(svec[0])
fishval_diff = zeros(svec.shape[0])
fishval_abs = zeros(svec.shape[0])
for idx, row in enumerate(diff):
a11 = sum(row < 0)
a12 = numsamp-a11
a21 = sum(row > 0)
a22 = numsamp-a21
#fishval[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]), alternative='less')['p.value']
fishval_diff[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]))['p.value']
for idx, row in enumerate(svec):
a11 = sum(row != 2)
a12 = numsamp-a11
a21 = sum(tvec[idx] != 2)
a22 = numsamp-a21
fishval_abs[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]))['p.value']
return fishval_abs, fishval_diff
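Each 2x2 table passed to r.fisher_test contrasts the number of sample pairs showing one kind of event (e.g. a differential loss) against those that do not, versus the same counts for the opposite event. The same test can be cross-checked with SciPy; a minimal sketch with hypothetical counts (the protocol itself uses R's fisher.test through rpy):

from scipy.stats import fisher_exact

# Hypothetical gene: differential loss in 5 of 23 pairs, differential gain in 1 of 23.
a11, a12 = 5, 18      # pairs with loss vs. pairs without
a21, a22 = 1, 22      # pairs with gain vs. pairs without
oddsratio, pvalue = fisher_exact([[a11, a12], [a21, a22]])   # two-sided by default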
prepareInput
Converts data to a format suitable for PennCNV
Output files: 11s.tab , 11t.tab , 12s.tab , 12t.tab , 13s.tab , 13t.tab , 14Bt.tab , 14s.tab , 19s.tab , 19t.tab , 1s.tab , 1t.tab , 20s.tab , 20t.tab , 21s.tab , 21t.tab , 2s.tab , 2t.tab , 32s.tab , 32t.tab , 33s.tab , 33t1.tab , 34s.tab , 34t.tab , 36s.tab , 36t.tab , 37s.tab , 37t.tab , 3s.tab , 3t.tab , 43s.tab , 43t.tab , 44s.tab , 44t.tab , 45s.tab , 45t.tab , 4s.tab , 4t.tab , 5Bt.tab , 5s.tab , 5t.tab , 6s.tab , 6t.tab , 7s.tab , 7t.tab , 8s.tab , 8t.tab .
Last build was: Sun Jan 8 03:42:33 2012.
Required CPU time: 00:04:57.23.
def prepareInput(fname, ssheet):
"""Converts data to a format suitable for PennCNV"""
ss = [l.split('\t') for l in open(ssheet).readlines()][1:]
scolumns = range(4,200,3)
#a=open(fname).readline()
#samples = [line.replace('.GType','') for line in a.split(',')[3::3]]
samples = [s[0] for s in ss]
fnames = list()
for k, sample in enumerate(samples):
print 'Preparing sample ' + sample + '...',
#print 'SKIPPING!'
os.system('cut -f 1-3,' + str(scolumns[k]) + '-' + str(scolumns[k]+2) + ' --delimiter , "' + fname +'" > ' + sample + '.csv')
os.system('cat ' + sample + ".csv | tr ',' '\t' > " + sample + '.tab')
os.system('rm ' + sample + '.csv')
fnames.append(sample + '.tab')
return fnames
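prepareInput shells out to cut and tr to slice each sample's Log R Ratio / B Allele Frequency / genotype column triplet out of the combined comma-separated export. For reference only, a pure-Python sketch of the same column extraction (hypothetical helper; the protocol uses the shell pipeline above):

import csv

def extract_sample_sketch(fname, sample, first_col):
    """first_col: 1-based index of the sample's first column, as in scolumns above."""
    keep = [0, 1, 2, first_col - 1, first_col, first_col + 1]   # 0-based column indices
    out = sample + '.tab'
    with open(fname) as src, open(out, 'w') as dst:
        for row in csv.reader(src):
            dst.write('\t'.join(row[i] for i in keep) + '\n')
    return out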
exportFigCNVlenghts
Creates a per-sample figure of the percentage of the genome affected by CNV
Output files: cnvlenghts.pdf , cnvlenghts_log.pdf .
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.04.
def exportFigCNVlenghts(countCNVlenghts_o):
"""Creates a figure of the samples based % of genome affected by CNV"""
r.pdf('cnvlenghts_log.pdf')
y = [int(x[1])/1000000 for x in countCNVlenghts_o if not 'B' in x[0]]
l = [x[0] for x in countCNVlenghts_o if not 'B' in x[0]]
yidx = sorted(range(len(y)), key = y.__getitem__)
ys = sorted(y)
ls = [l[x] for x in yidx]
r.barplot(ys, horiz=True, names_arg=ls, log='x', cex_names=0.5, las=1)
r.dev_off()
r.pdf('cnvlenghts.pdf')
y = [int(x[1]) for x in countCNVlenghts_o if not 'B' in x[0]]
l = [x[0] for x in countCNVlenghts_o if not 'B' in x[0]]
yidx = sorted(range(len(y)), key = y.__getitem__)
ys = sorted(y)
ls = [l[x] for x in yidx]
r.barplot(ys, horiz=True, names_arg=ls, cex_names=0.5, las=1)
#r.lines(range(0,len(ys)), r.cumsum(ys))
#r.lines(r.cumsum(y), range(0,len(y)))
r.dev_off()
return ('cnvlenghts.pdf', 'cnvlenghts_log.pdf')
makeBed
Creates a BED file for use with the Genome Browser
Output file: pglfeb.csv.bed
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.42.
def makeBed(CNVinput):
"""Create Bed file for use with Genome Browser"""
os.system('perl ' + penncnv_path + '/visualize_cnv.pl -format bed -output "' + fname + '.bed" "' + CNVinput + '"')
return fname + '.bed'
CNVDiffMat
Builds a matrix of the B/T difference in CN for each gene across sample pairs
Last build was: Sun Jan 8 19:43:41 2012.
Required CPU time: 00:00:2.43.
def CNVDiffMat(fname, genes, allsamples):
"""Makes a matrix of B/T difference in CN between same genes of different samples"""
data=list()
f=fname
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
hashd = dict( zip( genes, range(0,len(genes)) ) )
p = getPairings()
sp = [x[0] for x in p]
tp = [x[1] for x in p]
s = list()
for sample in sp:
for line in data:
if sample == line[6]:
s.append(line)
t = list()
for sample in tp:
for line in data:
if sample == line[6]:
t.append(line)
samples = sp
s_vectors = np.zeros((len(samples),len(genes)))*nan
s_vectors_min =np.zeros((len(samples),len(genes)))*nan
s_vectors_num =np.zeros((len(samples),len(genes)))*nan
s_vectors_cn = np.zeros((len(samples),len(genes)))*nan
for s_idx, sample in enumerate(samples):
print s_idx, 'of', len(samples),',',
x = nonzero([l[6]==sample for l in s])[0]
sub_s = [s[pickme] for pickme in x]
sub_s = array(sub_s)
sample_genes = sub_s[:,10]
for gene in unique(sample_genes):
if isnan(s_vectors[s_idx][hashd[gene]]):
temp = array(sub_s[sub_s[:,10]==gene][:,5], int)
maxidx = argmax(temp)
minidx = argmin(temp)
absmax = temp[maxidx]
absmin = temp[minidx]
s_vectors[s_idx][hashd[gene]] = absmax
s_vectors_min[s_idx][hashd[gene]] = absmin
s_vectors_cn[s_idx][hashd[gene]] = assignGeneCN(temp)
s_vectors_num[s_idx][hashd[gene]] = len(temp)
print
s_vectors[isnan(s_vectors)] = 2
s_vectors_min[isnan(s_vectors_min)] = 2
s_vectors_cn[isnan(s_vectors_cn)] = 2
s_vectors_num[isnan(s_vectors_num)] = 0
samples = tp
t_vectors = np.zeros((len(samples),len(genes)))*nan
t_vectors_min = np.zeros((len(samples),len(genes)))*nan
t_vectors_num =np.zeros((len(samples),len(genes)))*nan
t_vectors_cn = np.zeros((len(samples),len(genes)))*nan
for t_idx, sample in enumerate(samples):
print t_idx, 'of', len(samples),',',
x = nonzero([l[6]==sample for l in t])[0]
sub_t = [t[pickme] for pickme in x]
sub_t = array(sub_t)
try:
sample_genes = sub_t[:,10]
except IndexError:
import pdb; pdb.set_trace()
for gene in unique(sample_genes):
#if gene == 'PC':
# import pdb; pdb.set_trace()
if isnan(t_vectors[t_idx][hashd[gene]]):
temp = array(sub_t[sub_t[:,10]==gene][:,5], int)
maxidx = argmax(temp)
absmax = temp[maxidx]
minidx = argmin(temp)
absmin = temp[minidx]
t_vectors[t_idx][hashd[gene]] = absmax
t_vectors_min[t_idx][hashd[gene]] = absmin
t_vectors_num[t_idx][hashd[gene]] = len(temp)
t_vectors_cn[t_idx][hashd[gene]] = assignGeneCN(temp)
print
t_vectors[isnan(t_vectors)]=2
t_vectors_min[isnan(t_vectors_min)] = 2
t_vectors_cn[isnan(t_vectors_cn)] = 2
t_vectors_num[isnan(t_vectors_num)] = 0
return [t_vectors_cn.T-s_vectors_cn.T, #0
s_vectors_cn.T, t_vectors_cn.T, #1,2
s_vectors_min.T, t_vectors_min.T, #3,4
s_vectors_num.T, t_vectors_num.T, #5,6
s_vectors.T, t_vectors.T] #7,8
exportComRegGen
Exports CompareRegionAndGenes output to tsv formatOutput file: CompareRegGenes.txt
Last build was: Sun Jan 8 03:47:10 2012.
Required CPU time: 00:00:0.08.
def exportComRegGen(data):
"""Exports CompareRegionAndGenes output to tsv format"""
f = open('CompareRegGenes.txt','w')
f.write('Gene\tSample\ts/t\tReg. start\tReg. end\tReg. size\tTanscript. start\t Transcript. end\tTranscript. size\tCode start\t Code end\tCode size\tInters. to Transcr. %\t Inters. to Code %')
for line in data:
if line[0]!='NOT_FOUND':
for item in line:
if type(item)==float:
f.write(str(item)+'\t')
f.write('\n')
return 'CompareRegGenes.txt'
exportMergedRegs
Exports to tsv fileOutput file: mergedRegsTB.tsv
Last build was: Sun Jan 8 03:47:16 2012.
Required CPU time: 00:00:0.13.
def exportMergedRegs(mergedRegs):
"""Exports to tsv file"""
f = open('mergedRegsTB.tsv', 'w')
f.write('B/T\tChr\tReg. Start\tReg. End\tNum Samples\tCN\t"Genes"\n')
for key in sorted(mergedRegs.keys()):
for i in range(0, len(mergedRegs[key][0])):
try:
f.write(('B' if key[0] == 's' else 'T') +'\t'+
key[1]+'\t'+
str(mergedRegs[key][0][i]).strip('[]').replace(', ','\t')+'\t'+
str(mergedRegs[key][1][i])+'\t'+
str(mergedRegs[key][3][i]).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
str(mergedRegs[key][2][i]).replace('set','').
replace(')','').
replace('(','').
replace('[','').
replace(']','').
replace("'", '')+'\n'
)
except: import pdb; pdb.set_trace()
return 'mergedRegsTB.tsv'
exportIntersMat_genes
Exports the output of intersMat_genes to csv formatOutput file: verbose_inters_mat.txt
Last build was: Sun Jan 8 03:47:20 2012.
Required CPU time: 00:00:0.08.
def exportIntersMat_genes(genes, samples, inters_names):
"""Exports the output of intersMat_genes to csv format"""
f = open('verbose_inters_mat.txt', 'w')
f.write(' ,'+','.join(map(lambda x:x[0]+x[1], samples)) + '\n')
for idx1 in range(0,len(inters_names)):
f.write(samples[idx1][0]+samples[idx1][1]+',')
for idx2 in range(0,len(inters_names)):
f.write(str(list(inters_names[idx1][idx2])).strip('[]').replace('\'','').replace(',','') + ',')
f.write('\n')
return 'verbose_inters_mat.txt'
joinPennCNVout
Puts together the different ouptuts (1 per sample) of PennCNVOutput file: pglfeb.csv_allNO14.out
Last build was: Sun Jan 8 03:45:59 2012.
Required CPU time: 00:00:0.11.
def joinPennCNVout(inputfiles):
"""Puts together the different ouptuts (1 per sample) of PennCNV"""
#log_ofile = open(fname + '_all.log', 'a')
if os.path.exists(fname + '_allNO14.out'):
os.remove(fname+'_allNO14.out')
out_ofile = open(fname + '_allNO14.out', 'a')
for ifile in inputfiles:
#log_ifname = ifile + '.log'
#log_ofile.write(open(log_ifname, 'r').read())
#if not '14B' in ifile and not '5B' in ifile and not '34' in ifile and not '33' in ifile:
#if not '14' in ifile and not '21' in ifile and not '14B' in ifile and not '5B' in ifile:
out_ofile.write(open(ifile, 'r').read())
out_ofile.close()
return fname + '_allNO14.out'
exportPerRegion
Exports to tsv fileOutput file: perRegion.csv
Last build was: Sun Jan 8 03:51:58 2012.
Required CPU time: 00:00:0.26.
def exportPerRegion(data):
"""Exports to tsv file"""
f = open('perRegion.csv','w')
sdict = idToNameDict()
f.write(data[0].replace('\t',',')+'\n')
for line in data[1:]:
l = line.replace(',','.').split()
l[6] = sdict[l[6]]
l = str(l).replace('\'', '').strip('[]')
f.write(l+'\n')
return 'perRegion.csv'
addGeneInfo
Annotate regions with intersecting genesOutput file: pglfeb.csv_allNO14.out.genes
Last build was: Sun Jan 8 03:46:51 2012.
Required CPU time: 00:00:51.95.
def addGeneInfo(fname):
"""Annotate regions with intersecting genes"""
#print 'SKIPPING'
#return fname + '.genes'
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt > "' + fname + '.genes1"')
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '.genes1" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt -expandleft 5m > "' + fname + '.genes2"')
os.system('perl ' + penncnv_path + '/scan_region.pl "' + fname + '.genes2" -refgene ' + annot_path + '/refGene.txt -reflink ' + annot_path + '/refLink.txt -expandright 5m > "' + fname + '.genes"')
os.system('rm "' + fname + '".genes1')
os.system('rm "' + fname + '".genes2')
#os.system('perl ' + penncnv_path + '/detect_cnv.pl -cctest -pfb ' + penncnv_path + '/lib/ho1v1.hg18.pfb -phenofile phenotype -cnv "' + fname + '" > "' + fname + '".CC')
return fname + '.genes'
sampleSheet
The name of the filename containing sample IDs and namesOutput file: sample_sheet.tsv
Last build was: Sun Jan 8 03:37:36 2012.
Required CPU time: 00:00:0.03.
def sampleSheet():
"""The name of the filename containing sample IDs and names"""
return('sample_sheet.tsv')
readFile
From now on the input does not come from filesLast build was: Sun Jan 8 03:46:51 2012.
Required CPU time: 00:00:0.06.
def readFile(fname):
"""From now on the input does not come from files"""
return open(fname).readlines()
distMatGfx
Produces an MDS visualization of the output of samplesDistMatsOutput files: t_tani_distrib.pdf , t_tani_mds.pdf .
Last build was: Sun Jan 8 03:51:56 2012.
Required CPU time: 00:00:0.25.
def distMatGfx(samples, mats):
"""Produces an MDS visualization of the output of samplesDistMats"""
tani = mats[2]
test_couples=[['13t','14t'], ['33t1','33t2'], ['34t','34tL'], ['5t', '5Bt']]
#s = getSampleNames()
s = samples
tani_t = list()
tani_t_mat = list()
tids = list()
print 'Adding background distances :'
for idx1, s1 in enumerate(s):
matrow = list()
if 't' in s1:
tids.append(s1)
for idx2, s2 in enumerate(s):
if 't' in s2:
matrow.append(tani[idx1,idx2])
if s1 != s2:
tani_t.append(tani[idx1,idx2])
#print s[idx1] + '-' + s[idx2]+',',
tani_t_mat.append(matrow)
#hist(tani_t,30)
test_dists=list()
sdict = idToNameDict()
for idx, item in enumerate(test_couples):
if item[0] not in s or item[1] not in s:
continue
s1 = s.index(item[0])
s2 = s.index(item[1])
d = tani[s1,s2]
test_dists.append(d)
#annotate('('+sdict[item[0]]+','+sdict[item[1]]+')', [d,10], [d,10*idx+20])
print d
mds=r.cmdscale(array(tani_t_mat))
#savefig('t_tani_distrib.pdf')
#figure()
#plot(mds[:,0], mds[:,1], '.')
for idx, item in enumerate(tids):
pass
#annotate(sdict[item], [mds[idx,0], mds[idx,1]])
#savefig('t_tani_mds.pdf')
return ['t_tani_distrib.pdf', 't_tani_mds.pdf']
intersectTBRegs
Intersects all CNV T regions against all CNV B regionsLast build was: Sun Jan 8 19:44:57 2012.
Required CPU time: 00:00:7.36.
def intersectTBRegs(getSampleNames, manualClean):
"""Intersects all CNV T regions against all CNV B regions"""
f,h = PennCNVParser()
manualClean = [f.split(l) for l in manualClean]
num = dict()
num['s'] = sum(['s' in s for s in getSampleNames])
num['t'] = sum(['t' in s for s in getSampleNames])
regsets = dict()
chrs = set([l[1] for l in manualClean])
for samp in getSampleNames:
for chrom in chrs:
sub = [l for l in manualClean if l[7]==samp and l[1] == chrom]
st = 'B' if 's' in samp else 'T'
regset = [[int(l[2]),int(l[3]), l[10], l[6], st] for l in sub]
if not chrom in regsets.keys():
regsets[chrom] = list()
if not sub == []:
regsets[chrom].append(regset)
a = dict()
for chrom in chrs:
# if chrom == '13':
# import pdb; pdb.set_trace()
a[chrom]=intersRegs([sorted(s) for s in regsets[chrom]])
return a
exportToCircos
Exports to a file suitable for use with CircosLast build was: Sun Jan 8 03:51:59 2012.
Required CPU time: 00:00:0.02.
def exportToCircos(samples, x):
"""Exports to a file suitable for use with Circos"""
return "tobefixed!"
x=[l.split('\t') for l in x][1:]
x=[[l[0],l[1],l[2],l[5],l[6]] for l in x]
fnames = list()
for sample in samples:
fname = 'circosInput_' + sample + '.txt'
fnames.append(fname)
f = open(fname, 'w')
subx = [l[0:4] for l in x if l[4]==sample]
f.writelines("hs%s\n" % str(item).strip('[]').replace('\'','').replace(',','') for item in subx)
return(fnames)
intersectRegs
Intersects all CNV regions of T and B samples separatelyLast build was: Sun Jan 8 03:47:15 2012.
Required CPU time: 00:00:5.19.
def intersectRegs(getSampleNames, manualClean):
"""Intersects all CNV regions of T and B samples separately"""
f,h = PennCNVParser()
manualClean = [f.split(l) for l in manualClean]
num = dict()
num['s'] = sum(['s' in s for s in getSampleNames])
num['t'] = sum(['t' in s for s in getSampleNames])
regsets = dict()
chrs = set([l[1] for l in manualClean])
for samp in getSampleNames:
for chrom in chrs:
sub = [l for l in manualClean if l[7]==samp and l[1] == chrom]
regset = [[int(l[2]),int(l[3]), l[10], l[6]] for l in sub]
if 's' in samp:
if not ('s',chrom) in regsets.keys():
regsets['s', chrom] = list()
regsets['s',chrom].append(regset)
else:
if not ('t',chrom) in regsets.keys():
regsets['t', chrom] = list()
regsets['t',chrom].append(regset)
a = dict()
for chrom in chrs:
a['s',chrom]=intersRegs([sorted(s) for s in regsets['s',chrom]])
a['t',chrom]=intersRegs([sorted(s) for s in regsets['t',chrom]])
return a
reformat2
Dummy filter to skip manual clean phaseLast build was: Sun Jan 8 03:51:58 2012.
Required CPU time: 00:00:1.83.
def reformat2(ifile):
'Dummy filter to skip manual clean phase'
f, header = PennCNVParser()
ofile = list()
ofile.append('\t'.join(header))
p = getPairings()
for line in ifile:
newline = (f.split(line)[1:-1])
try:
t=newline.pop(10)
except IndexError:
print line
if newline[6] in [x[0] for x in p]:
newline.insert(7,'s')
else:
newline.insert(7,'t')
genes = newline[10].split(',')
for gene in genes:
newline[10]=gene
ofile.append('\t'.join(newline))
return ofile
genoTypeCheck
Builds a matrix of % of genotype differences between each couple of samplesLast build was: Sun Jan 8 19:42:16 2012.
Required CPU time: 00:02:57.52.
def genoTypeCheck(fnames):
"""Builds a matrix of % of genotype differences between each couple of samples"""
data=[]
for ifile in fnames:
curgtype = list()
cursamp = open(ifile).readlines()
for line in cursamp:
line = line.split('\t')
curgtype.append(line[3])
data.append(curgtype)
data=array(data)
differences=list()
f = open('gtypecheck_small.txt', 'w')
sdict = idToNameDict()
for sample1 in data:
templist = list()
for sample2 in data:
templist.append(float(sum(array(sample1)!=array(sample2))) / len(sample1) * 100)
try:
print sdict[sample1[0].replace('.GType','')], 'vs', sdict[sample2[0].replace('.GType','')],':','\t', templist[-1], '\t'
#print templist[-1]
if templist[-1] < 20 and sdict[sample1[0].replace('.GType','')] != sdict[sample2[0].replace('.GType','')]:
f.write(sdict[sample1[0].replace('.GType','')] + ' vs ' + sdict[sample2[0].replace('.GType','')] + '\t' + str(templist[-1]) + '\n')
differences.append(templist)
f = open('gtypecheck.txt', 'w')
for idx, item in enumerate(differences):
f.write(sdict[data[idx][0].replace('.GType','')]+' ')
f.write('\n')
for idx, item in enumerate(differences):
f.write(' ' + sdict[data[idx][0].replace('.GType','')])
f.writelines(str(array(item)).strip('[]').replace('\n','')+'\n')
except:
print 'Skipping couple: ' + sample1[0] +', '+sample2[0]
return differences
getGeneNames
Extract the names of all annotated genesLast build was: Sun Jan 8 03:47:16 2012.
Required CPU time: 00:00:0.05.
def getGeneNames(f):
"""Extract the names of all annotated genes"""
data=list()
for line in f:
nline=line[0:-1]
t = nline.split()
data.extend(t[7].split(','))
data=array(data)
return unique(data)
countCNVlenghts
Computes percentage of genome affected by CNVLast build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.36.
def countCNVlenghts(getSampleNames, reformat):
"""Computes percentage of genome affected by CNV"""
di = idToNameDict()
y=[l.split('\t') for l in reformat]
glength=float(2867188341)
res = []
for s in getSampleNames:
try:
sub=[l for l in y if l[6]==s]
lens = [int(l[2])-int(l[1]) for l in sub]
res.append([di[s] , str(sum(lens)) , str(100*(sum(lens) / glength))])
except:
import pdb; pdb.set_trace()
for st in ['s', 't']:
sub = [l for l in y if l[7]==st]
lens = [int(l[2])-int(l[1]) for l in sub]
if st=='s':
label = 'Blood'
else:
label = 'Tumor'
print label + ' average: ' + str(sum(lens) /
(len(getSampleNames)/float(2))) +'\t' + str(100 * (sum(lens)/ (len(getSampleNames)/float(2)) / glength) )
return res
addStats
Adds descriptive statisticsLast build was: Sun Jan 8 03:47:17 2012.
Required CPU time: 00:00:0.99.
def addStats(fname):
"""Adds descriptive statistics"""
data=list()
f = fname
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
genes = unique(data[:,10])
p = getPairings()
s=list()
for samp in p[0]:
for raw in data:
if raw[6] == samp:
s.append(raw)
s=array(s)
t=list()
for samp in p[1]:
for raw in data:
if raw[6] == samp:
t.append(raw)
t=array(t)
gene_count = dict()
for gene in genes:
if gene == 'NOT_FOUND':
gene_count[gene]=(0,0,0,0)
else:
s_where = s[nonzero(s[:,10]==gene)]
t_where = t[nonzero(t[:,10]==gene)]
s_count = sum(s[:,10]==gene)
t_count = sum(t[:,10]==gene)
s_diffcount = len(unique(s_where[:,6]))
t_diffcount = len(unique(t_where[:,6]))
gene_count[gene]=(s_count, t_count, s_diffcount, t_diffcount)
of=list()
for idx, line in enumerate(data):
if idx==0:
of.append('\t'.join(line) + '\ts hits \tt hits\thits diff.\ts un. hits\tt un. hits\t un. hits diff.')
else:
currinfo = gene_count[line[10]]
of.append('\t'.join(line)+\
'\t'+str(currinfo[0])+\
'\t'+str(currinfo[1])+\
'\t'+str(abs(currinfo[0]-currinfo[1]))+\
'\t'+str(currinfo[2])+\
'\t'+str(currinfo[3])+\
'\t'+str(abs(currinfo[2]-currinfo[3])))
#'\t'+str(pvalue(currinfo[2], 8-currinfo[2], currinfo[3], 8-currinfo[3]).two_tail)+'\n')
return of
exportCNVlenghts
[documentation not available]Output file: cnv_lenghts.tsv
Last build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.02.
def exportCNVlenghts(countCNVl):
f = open('cnv_lenghts.tsv', 'w')
for line in countCNVl:
f.writelines(str(line).strip('[]').replace(',','\t').replace("'",'')+'\n')
return 'cnv_lenghts.tsv'
exportDistMats
Exports the output of samplesDistMats to csv formatOutput files: inters_mat.txt , notIntersMat.txt , tanimotoMat.txt .
Last build was: Sun Jan 8 19:43:38 2012.
Required CPU time: 00:00:0.03.
def exportDistMats(samples, mats):
"""Exports the output of samplesDistMats to csv format"""
intMat=mats[0]
notIntMat=mats[1]
tanimotoMat=mats[2]
f_intMat = open('intersMat.txt','w')
f_notIntMat = open('notIntersMat.txt','w')
f_tanimotoMat = open('tanimotoMat.txt','w')
f_intMat.write(' ,'+','.join(samples) + '\n')
f_notIntMat.write(' ,'+','.join(samples) + '\n')
f_tanimotoMat.write(' ,'+','.join(samples) + '\n')
for idx, item in enumerate(intMat):
f_intMat.write(samples[idx]+',')
f_intMat.write(str(list(intMat[idx])).strip('[]').replace('.0','').replace(' ','')+'\n')
f_notIntMat.write(samples[idx]+',')
f_notIntMat.write(str(list(notIntMat[idx])).strip('[]').replace('.0','').replace(' ','')+'\n')
f_tanimotoMat.write(samples[idx]+',')
f_tanimotoMat.write(str(list(tanimotoMat[idx])).strip('[]').replace(' ','')+'\n')
return ['inters_mat.txt', 'notIntersMat.txt', 'tanimotoMat.txt']
intersMat_genes
Produces a list of common genes between each couple of samplesLast build was: Sun Jan 8 03:47:19 2012.
Required CPU time: 00:00:0.38.
def intersMat_genes(dataset, fname, genes, samples):
"""Produces a list of common genes between each couple of samples"""
inters_names = list()
for idx1, geneline1 in enumerate(dataset):
inters_names_t = list()
for idx2, geneline2 in enumerate(dataset):
inters_names_t.append(genes[nonzero(logical_and(geneline1, geneline2))])
inters_names.append(inters_names_t)
return inters_names
reformat
Change format from region-centric to gene-centricLast build was: Sun Jan 8 03:46:56 2012.
Required CPU time: 00:00:0.69.
def reformat(ifile):
"""Change format from region-centric to gene-centric"""
f, header = PennCNVParser()
ofile = list()
ofile.append('\t'.join(header))
p = getPairings()
for line in ifile:
newline = (f.split(line)[1:-1])
try:
t=newline.pop(10)
except IndexError:
print line
if newline[6] in [x[0] for x in p]:
newline.insert(7,'s')
else:
newline.insert(7,'t')
genes = newline[10].split(',')
for gene in genes:
newline[10]=gene
ofile.append('\t'.join(newline))
return ofile
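The conversion simply repeats each region line once per gene listed in its comma-separated gene field. A toy illustration with an invented, simplified column layout (the real column indices come from PennCNVParser, which is defined elsewhere in the protocol):
# One region-centric row carrying two genes becomes two gene-centric rows.
region_row = ['chr1', '1000', '2000', '11s', 'GENE_A,GENE_B']   # invented layout

gene_rows = []
for gene in region_row[-1].split(','):
    newline = list(region_row)
    newline[-1] = gene
    gene_rows.append('\t'.join(newline))

for row in gene_rows:
    print(row)
# chr1    1000    2000    11s    GENE_A
# chr1    1000    2000    11s    GENE_B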
CNVboolVects
Creates a matrix of presence/absence of CNV in each sample for each gene (out of all the genes that are affected by CNV in at least 1 sample)
Last build was: Sun Jan 8 03:47:18 2012.
Required CPU time: 00:00:0.35.
def CNVboolVects(raw, allsamples):
"""Creates a matrix of presence/absence of CNV in each sample for each gene (out of all the gene that are affected by CNV in at least 1 sample"""
data=list()
f=raw
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
genes = unique(data[1:-1,10])
hashd = dict( zip( genes, range( 0,len(genes) ) ) )
#samples = getSampleNames()
samples = allsamples
s=data
vectors = np.zeros((len(samples),len(genes)))
for s_idx, sample in enumerate(samples):
sample_genes = s[s[:,6]==sample][:,10]
for gene in sample_genes:
vectors[s_idx][hashd[gene]]=1
return vectors
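The matrix is built by hashing each gene name to a column index and writing a 1 wherever a sample's CNV lines mention that gene. A compact sketch on toy data (sample IDs and gene names are invented):
import numpy as np

samples = ['11s', '11t']                                           # invented
hits = {'11s': ['GENE_A', 'GENE_C'], '11t': ['GENE_A', 'GENE_B']}  # invented
genes = sorted(set(g for gl in hits.values() for g in gl))
hashd = dict(zip(genes, range(len(genes))))                        # gene -> column

vectors = np.zeros((len(samples), len(genes)))
for s_idx, sample in enumerate(samples):
    for gene in hits[sample]:
        vectors[s_idx][hashd[gene]] = 1

print(vectors)   # [[1. 0. 1.]
                 #  [1. 1. 0.]]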
LogR_BAF_FileName
The name of the file containing Log R Ratio and B Allele Frequency data
Output file: pglfeb.csv
Last build was: Sun Jan 8 03:37:36 2012.
Required CPU time: 00:00:0.03.
def LogR_BAF_FileName():
"""The name of the file containing Log R Ratio and B Allele Frequency data"""
return(fname)
clustergram
Calls R to make a clustergram
Output files: clustergram_001_108.png , clustergram_005_248.png , clustergram_05_1606.png , heatmap_001_108.png , heatmap_005_248.png , heatmap_05_1606.png .
Last build was: Sun Jan 8 19:44:49 2012.
Required CPU time: 00:00:57.72.
def clustergram(diffmat, fisher, genes, samples):
"""Calls R to make a clustergram"""
#samplesNoLetter = [item.replace('t','').replace('s','') for item in samples]
#samples = []
#[samples.append(i) for i in samplesNoLetter if not i in samples]
p=getPairings()
sdict = idToNameDict()
s2 = list()
for i in range(0,len(p)):
s2.append(sdict[p[i][0]]+' vs '+sdict[p[i][1]])
samples = s2
d=diffmat[0]
gdict = geneChrDict()
#import pdb; pdb.set_trace()
thresholds = [0.01, 0.05, 0.5]
cfnames = list()
hfnames = list()
for t in thresholds:
ok_genes=list()
ok_gnames=list()
for idx, gene_row in enumerate(d):
if fisher[1][idx] < t:
if str.lower(genes[idx]) != 'loc100130872-spon2' and str.lower(genes[idx]) != 'loc647859':
ok_genes.append(gene_row)
ok_gnames.append(genes[idx])
print 'Selected genes: ' + str(len(ok_gnames))
for idx, gene in enumerate(ok_gnames):
if len(gdict[gene])==1:
chr = '0'+ gdict[gene]
else:
chr = gdict[gene]
ok_gnames[idx]= chr + ' ' + gene
ok_genes=array(ok_genes)
r.library('gplots')
colors = r.redgreen(len(unique(ok_genes)))
colors = colors[::-1]
#colors[2]='#880000'
colors[3]='#880000'
#return ok_genes, samples, ok_gnames, colors
cfnames.append('clustergram_' + str(t).replace('.','') + '_' + str(len(ok_gnames)) + '.png')
hfnames.append('heatmap_' + str(t).replace('.','') + '_' + str(len(ok_gnames)) + '.png')
callRforClustergram(cfnames[-1], ok_genes, samples, ok_gnames, colors)
makeHeatmap(hfnames[-1], ok_genes, samples, ok_gnames, colors)
return hstack([cfnames, hfnames])
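Before calling R, the node keeps only the genes whose Fisher p-value falls below each threshold. A small sketch of just that filtering step (gene names, p-values and the difference matrix below are invented):
import numpy as np

genes = np.array(['GENE_A', 'GENE_B', 'GENE_C'])     # invented
pvals = np.array([0.004, 0.20, 0.03])                # invented Fisher p-values
d = np.array([[0, 1, -1], [1, 0, 0], [-1, -1, 0]])   # one row per gene

for t in [0.01, 0.05, 0.5]:
    keep = pvals < t
    print('%s: %s (%d rows kept)' % (t, [str(g) for g in genes[keep]], d[keep].shape[0]))
# 0.01: ['GENE_A'] (1 rows kept)
# 0.05: ['GENE_A', 'GENE_C'] (2 rows kept)
# 0.5: ['GENE_A', 'GENE_B', 'GENE_C'] (3 rows kept)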
PennCNV
Runs PennCNV to detect CNV regions
Output files: pglfeb.csv_11s.tab.out , pglfeb.csv_11t.tab.out , pglfeb.csv_12s.tab.out , pglfeb.csv_12t.tab.out , pglfeb.csv_13s.tab.out , pglfeb.csv_13t.tab.out , pglfeb.csv_14Bt.tab.out , pglfeb.csv_14s.tab.out , pglfeb.csv_19s.tab.out , pglfeb.csv_19t.tab.out , pglfeb.csv_1s.tab.out , pglfeb.csv_1t.tab.out , pglfeb.csv_20s.tab.out , pglfeb.csv_20t.tab.out , pglfeb.csv_21s.tab.out , pglfeb.csv_21t.tab.out , pglfeb.csv_2s.tab.out , pglfeb.csv_2t.tab.out , pglfeb.csv_32s.tab.out , pglfeb.csv_32t.tab.out , pglfeb.csv_33s.tab.out , pglfeb.csv_33t1.tab.out , pglfeb.csv_34s.tab.out , pglfeb.csv_34t.tab.out , pglfeb.csv_36s.tab.out , pglfeb.csv_36t.tab.out , pglfeb.csv_37s.tab.out , pglfeb.csv_37t.tab.out , pglfeb.csv_3s.tab.out , pglfeb.csv_3t.tab.out , pglfeb.csv_43s.tab.out , pglfeb.csv_43t.tab.out , pglfeb.csv_44s.tab.out , pglfeb.csv_44t.tab.out , pglfeb.csv_45s.tab.out , pglfeb.csv_45t.tab.out , pglfeb.csv_4s.tab.out , pglfeb.csv_4t.tab.out , pglfeb.csv_5Bt.tab.out , pglfeb.csv_5s.tab.out , pglfeb.csv_5t.tab.out , pglfeb.csv_6s.tab.out , pglfeb.csv_6t.tab.out , pglfeb.csv_7s.tab.out , pglfeb.csv_7t.tab.out , pglfeb.csv_8s.tab.out , pglfeb.csv_8t.tab.out .
Last build was: Sun Jan 8 19:10:54 2012.
Required CPU time: 00:52:42.29.
def PennCNV(inputfiles):
"""Runs PennCNV to detect CNV regions"""
ofiles = list()
for ifile in inputfiles:
cmdstr = 'perl ' + penncnv_path + \
'/detect_cnv.pl -test -hmm ' + \
penncnv_path + '/lib/hhall.hmm -pfb ' + \
penncnv_path + '/lib/ho1v1.hg18.pfb -gcmodelfile ' + \
penncnv_path + '/lib/ho1v1.hg18.gcmodel ' + \
ifile + \
' -log "' + ifile + '.log" ' + \
'-out "' + fname + '_' + ifile + '.out"'
print 'Executing: ' + cmdstr
#print 'WARNING: skipping PennCNV call! Uncomment to really call.'
os.system(cmdstr)
#ofiles.append(lp.newFile(fname + '_' + ifile + '.out'))
ofiles.append(fname + '_' + ifile + '.out')
return ofiles
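The node shells out to PennCNV's detect_cnv.pl once per sample file. Purely for illustration, the sketch below reconstructs the command string built above using placeholder paths (the real penncnv_path and fname are configured elsewhere in the protocol):
penncnv_path = '/opt/PennCNV'     # placeholder
fname = 'pglfeb.csv'              # placeholder
ifile = '11s.tab'                 # placeholder

cmdstr = ('perl ' + penncnv_path + '/detect_cnv.pl -test' +
          ' -hmm ' + penncnv_path + '/lib/hhall.hmm' +
          ' -pfb ' + penncnv_path + '/lib/ho1v1.hg18.pfb' +
          ' -gcmodelfile ' + penncnv_path + '/lib/ho1v1.hg18.gcmodel ' +
          ifile +
          ' -log "' + ifile + '.log"' +
          ' -out "' + fname + '_' + ifile + '.out"')
print(cmdstr)
# perl /opt/PennCNV/detect_cnv.pl -test -hmm /opt/PennCNV/lib/hhall.hmm
#   -pfb /opt/PennCNV/lib/ho1v1.hg18.pfb -gcmodelfile /opt/PennCNV/lib/ho1v1.hg18.gcmodel
#   11s.tab -log "11s.tab.log" -out "pglfeb.csv_11s.tab.out"   (one line, wrapped here)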
getSampleNames
Extracts sample names
Last build was: Sun Jan 8 03:47:10 2012.
Required CPU time: 00:00:0.03.
def getSampleNames(fname):
"""Extracts sample names"""
b=open(fname).readlines()
t=list()
p = getPairedSampleIDs()
for l in b[1:]:
s = l.split()[0]
#if s in p:
t.append(s)
return t
CompareRegionAndGenes
Computes intersections between regions and genes
Last build was: Sun Jan 8 03:47:09 2012.
Required CPU time: 00:00:13.22.
def CompareRegionAndGenes(data):
"""Computes intersections between regions and genes"""
ref = open(penncnv_path+'/ucsc_hg18/refGene.txt').readlines()
for idx, line in enumerate(ref):
ref[idx] = line.split()
refgenes = [ref[i][12] for i in range(0, len(ref))]
res = list()
for idx, line in enumerate(data[1:]):
if mod(idx, 100)==0:
print 'Intersecting: '+str(idx)+' of '+str(len(data)-1)
line = line.split('\t')
gene = line[10]
try:
where = refgenes.index(gene)
int1 = regIntersect([int(line[1]), int(line[2])], [int(ref[where][4]), int(ref[where][5])])
int2 = regIntersect([int(line[1]), int(line[2])], [int(ref[where][6]), int(ref[where][7])])
#res.append(['***********************************************'])
res.append([
gene,
line[6], #sample
line[7], #s/t
int(line[1]), #reg start
int(line[2]), #reg end
int(line[2])-int(line[1]), # reg size
int(ref[where][4]), #trans start
int(ref[where][5]), #trans end
int(ref[where][5])-int(ref[where][4]),
int(ref[where][6]), #code start
int(ref[where][7]), #code end
int(ref[where][7])-int(ref[where][6]),
int1, #int trans
int2, #int code
])
except:
res.append([
gene,
line[6],
line[7],
int(line[1]),
int(line[2]),
'?', '?', '?', '?', '?', '?','?','?'
])
return res
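regIntersect, defined elsewhere in the protocol, measures how much a CNV region overlaps a gene's transcript or coding interval. As a hedged stand-in, assuming it returns the size of the intersection (the real return value may differ, e.g. a percentage), an interval-overlap helper could look like this:
# Hypothetical stand-in for regIntersect: overlap length of two
# closed intervals [start, end], or 0 when they do not overlap.
def reg_intersect(a, b):
    lo = max(a[0], b[0])
    hi = min(a[1], b[1])
    return max(0, hi - lo)

print(reg_intersect([1000, 5000], [4000, 9000]))   # 1000
print(reg_intersect([1000, 5000], [6000, 9000]))   # 0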
exportFigExtRegs
Creates a figure for the extended regions using R
Output file: extRegs.pdf
Last build was: Sun Jan 8 19:45:02 2012.
Required CPU time: 00:00:3.74.
def exportFigExtRegs(frags):
"""Creates a figure for the extended regions using R"""
r.library('quantsmooth')
chrwidth = 10
chrspacing = 30
r.pdf(file='extRegs.pdf', height=10, width=12)
#maxarmlen = max(
maxlen = max([a[0][1] for a in frags['1']])
r.plot(1,type="n",xlim=[0,(chrspacing+chrwidth)*(len(frags)-1)],ylim=[0, maxlen],xaxt="n",yaxt="n", ann=False, frame_plot=False, tick_number=0, xlab="Autosomes")
keys = [int(k) for k in frags]
skeys = sort(keys)
tickat = list()
for cidx,chrom in enumerate([str(k) for k in skeys]):
offs = cidx*(chrspacing+chrwidth)
#maxy = max([a[0][1] for a in frags[chrom]])
tothits = sum([a[0][1]-a[0][0] for a in frags[chrom][0::2]])
maxlen = max([a[0][1] for a in frags[chrom]])
maxy = r.lengthChromosome(chrom, units = 'bases')[chrom]
r.lines([offs, offs], [0,maxy])
for fidx, frag in enumerate(frags[chrom]):
if frag[1]=='' and frag[2]=='':
continue
y = (frag[0][0]+float(frag[0][1]))/2
y = (y * maxy) / maxlen
l = frag[0][1]-frag[0][0]+1
lB = int(frag[1])
lT = int(frag[2])
Ts = sum([a=='T' for a in frag[3]])
Bs = sum([a=='B' for a in frag[3]])
Ts_p = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
Ts_n = sum([a=='T' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
Bs_p = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])>2])
Bs_n = sum([a=='B' for idx,a in enumerate(frag[3]) if int(frag[4][idx])<2])
Tgain = Ts_p - Ts_n
Bgain = Bs_p - Bs_n
#Tcol = 'red' if Tgain < 0 else 'green'
#Bcol = 'red' if Bgain < 0 else 'green'
Dcol = 'darkred' if Tgain - Bgain > 0 else 'darkgreen'
armlen = int(abs(lT - lB))*1.5
if max(lB, lT)>0:
if Dcol == 'darkred':
r.lines([offs +chrwidth-2 , armlen + offs +chrwidth-2], [maxy-y,maxy-y], col=Dcol)
else:
r.lines([offs -armlen -chrwidth+2, offs -chrwidth+2], [maxy-y,maxy-y], col=Dcol)
tickat.append(offs)
print chrom, tothits/float(maxy)
bleachv = 1-min((tothits/float(maxy))/0.08,1)
r.paintCytobands(chrom, pos=[offs+chrwidth/2, maxy], width=chrwidth, units="bases", orientation='v', legend=False, bleach=bleachv)
# r.axis(at=tickat, labels=range(1,23), cex_axis=1, side=range(1,23), mgp=[3,.5,0], xlab='Autosomes', main='Genomic regions affected by CNV')
r.axis(at=tickat, lty=0, labels=range(1,23), cex_axis=1, side=range(1,23), xlab='Autosomes', main='Genomic regions affected by CNV', mgp=[3,-0.8,0])
# if Bgain:
# #r.lines([-log10(l) + offs, 0 + offs], [y,y], col=Bcol)
# r.lines([offs - lB -1, 0 + offs -1], [y,y], col=Bcol)
# r.lines([offs - lB -1, 0 + offs -1], [y,y], col=Bcol)
# if Tgain:
# #r.lines([0 + offs, log10(l) + offs], [y,y], col=Tcol)
# r.lines([0 + offs +1 , lT + offs +1], [y,y], col=Tcol)
# r.lines([0 + offs +1 , lT + offs +1], [y,y], col=Tcol)
r.dev_off()
return('extRegs.pdf')
manualClean
Removes a list of CNVi-related genes produced elsewhere
Last build was: Sun Jan 8 03:46:55 2012.
Required CPU time: 00:00:3.48.
def manualClean(x):
"""Removes a list of CNVi-related genes produced elsewhere"""
no=[int(n) for n in open('manuallyRemovedCNVs.txt').read().split()]
cleandata = list()
f = open('cleanedCNVs.txt', 'w')
for idx, line in enumerate(x):
if not (idx+1 in no):
cleandata.append(line)
f.write(line)
return cleandata
samplesDistMats
Computes Tanimoto distance between samples
Last build was: Sun Jan 8 03:47:21 2012.
Required CPU time: 00:00:0.24.
def samplesDistMats(dataset):
"""Computes Tanimoto distance between samples"""
inters_mat = squareform(pdist(dataset, lambda x,y: sum(logical_and(x,y))))
for idx in range(0,dataset.shape[0]):
inters_mat[idx,idx]=sum(dataset,axis=1)[idx]
notIntersMat = squareform(pdist(dataset, lambda x,y: sum(~logical_and(x,y))))
tanimotoMat = squareform(pdist(dataset, lambda x,y: 1 - sum(logical_and(x,y))/float(sum(logical_or(x,y))) ))
return [inters_mat, notIntersMat, tanimotoMat]
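The Tanimoto (Jaccard) distance used here is 1 minus the ratio between the genes hit in both samples and the genes hit in at least one of them. A worked example on two invented boolean vectors:
import numpy as np

x = np.array([1, 1, 0, 1], dtype=bool)   # invented CNV hits, sample 1
y = np.array([1, 0, 0, 1], dtype=bool)   # invented CNV hits, sample 2

shared = np.logical_and(x, y).sum()      # 2 genes hit in both samples
union = np.logical_or(x, y).sum()        # 3 genes hit in at least one sample
print(1 - shared / float(union))         # 1 - 2/3 = 0.333...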
exportPerRegion_clean
Dummy node to skip manual filtering phase
Output file: perRegion_clean.csv
Last build was: Sun Jan 8 19:45:02 2012.
Required CPU time: 00:00:0.14.
def exportPerRegion_clean(data):
"""Dummy node to skip manual filtering phase"""
f = open('perRegion_clean.csv','w')
sdict = idToNameDict()
f.write(data[0].replace('\t',',')+'\n')
for line in data[1:]:
l = line.replace(',','.').split()
l[6] = sdict[l[6]]
l = str(l).replace('\'', '').strip('[]')
f.write(l+'\n')
return 'perRegion_clean.csv'
exportCNVDiffMat
Exports to tsv file
Output file: CNVgenes.csv
Last build was: Sun Jan 8 19:45:15 2012.
Required CPU time: 00:00:12.21.
def exportCNVDiffMat(cnvdiff, cnvdata, fisher, genes, samples):
"""Exports to tsv file"""
p = getPairings()
samples = list()
for a in p:
samples.extend([a[0], a[1]])
samples = samplesToNames(samples)
data = list()
for line in cnvdata:
data.append(line.split('\t'))
data = array(data)
vects, vs, vt = cnvdiff[0], cnvdiff[1], cnvdiff[2]
gdict = geneChrDict()
for i in range(len(samples),0,-1):
if mod(i,2)==0:
samples.insert(i, samples[i-1]+'-'+samples[i-2])
header =['gene', 'Chr',
'B gain', 'B loss', 'B CNV',
'T gain', 'T loss', 'T CNV',
'B CNV + T CNV',
'TvsB loss','TvsB gain', 'TvsBgain+TvsBloss', 'TvsB loss/gain p-value',
'max CNV', 'Reg. Max length',
'CNV samples (repeated if hit in multiple regions)']
header = hstack([header, samples])
fisher_abs = fisher[0]
fisher_diff = fisher[1]
f = open('CNVgenes.csv', 'w')
f.write(str(header).strip('[]').replace('\n','').replace('\' \'',',').strip('\'') + '\n')
for idx, vect in enumerate(vects):
#gene name
f.write(genes[idx])
#if genes[idx]=='ACOT11':
# import pdb; pdb.set_trace()
#chromosome
f.write(',' + str(gdict[genes[idx]]))
#cnv stats
cnvBgain = sum(vs[idx]>2)
cnvBloss = sum(vs[idx]<2)
cnvB = cnvBgain + cnvBloss
cnvTgain = sum(vt[idx]>2)
cnvTloss = sum(vt[idx]<2)
cnvT = cnvTgain + cnvTloss
f.write(', ' + str(cnvBgain) + ',' + str(cnvBloss) + ',' + str(cnvB))
f.write(', ' + str(cnvTgain) + ',' + str(cnvTloss) + ',' + str(cnvT))
f.write(', ' + str(cnvB+cnvT))
# f.write(', ' + str(cnvB) + ',' + str(cnvT) + ',' + str(cnvB+cnvT))
#fisher on B vs T cnvs
#f.write(','+str(fisher_abs[idx]))
#differential cnvs stats
f.write(str(',' +
str(sum(vect<0))+','+
str(sum(vect>0))+','+
str(sum(vect<0)+
sum(vect>0))+',')
.replace('\n',''))
#fisher on s vs t differential cnvs
f.write(',' + str(fisher_diff[idx]) + ',')
# max CNV
cns = hstack([vs[idx],vt[idx]])
maxcnv = cns[argmax(abs(2-cns))]
f.write(str(maxcnv) + ',')
# per gene additional info
info = extractPerGeneProperties(data, genes[idx])[1:]
for idx2, item in enumerate(info):
if not item=='nan':
if idx2 in [3,4,5]:
f.write(item.replace(',','').replace('\n','').replace('\'', '').replace('[','').replace(']','')+',')
#cnv per sample
f.write(str(zip(vs[idx],vt[idx],vect)).strip('[]').replace('(','').replace(')','').replace('\n','').replace('.0',''))
f.write('\n')
return 'CNVgenes.csv'
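The gain/loss columns of this export come from comparing the per-gene copy-number vectors against the diploid value 2. A tiny sketch with invented copy numbers for one gene across the paired B and T samples:
import numpy as np

vs = np.array([2, 1, 3, 2])   # invented CN calls for one gene, B-group samples
vt = np.array([3, 1, 4, 2])   # invented CN calls for the same gene, T-group samples

print('B gain %d, B loss %d' % (np.sum(vs > 2), np.sum(vs < 2)))   # B gain 1, B loss 1
print('T gain %d, T loss %d' % (np.sum(vt > 2), np.sum(vt < 2)))   # T gain 2, T loss 1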
exportMergedFragments
Exports extended regions to tsv file
Output file: extRegions.tsv
Last build was: Sun Jan 8 19:45:15 2012.
Required CPU time: 00:00:0.09.
def exportMergedFragments(mf):
'''Exports extended regions to tsv file'''
f = open('extRegions.tsv', 'w')
f.write('Chr.\t'+
'ExtReg. Start\t'+
'ExtReg. End\t'+
'ExtReg. Length\t'+
'Max hits B\t'+
'Max hits T\t'+
'B/T\t'+
'CN\t'+
'"Genes"\n')
for chrom in mf:
for extreg in mf[chrom]:
f.write(
#chromosome
chrom+'\t'+
#reg start
str(extreg[0][0])+'\t'+
#reg end
str(extreg[0][1])+'\t'+
#reg length
str(extreg[0][1]-extreg[0][0]+1)+'\t'+
# max hits B
str(extreg[1])+'\t'+
# max hits T
str(extreg[2])+'\t'+
#BT
str(extreg[3]).strip('[]').replace("'",'').replace(',','')+'\t'+
#CN
str(extreg[4]).strip('[]').replace("'",'').replace(',','')+'\t'+
#genes
str(extreg[5]).replace('set','').replace('(','').replace(')','')
.replace('[','').replace(']','').replace("'",'')+'\n'
)
return 'extRegions.tsv'
exportMergedRegsTB
Exports to tsv file
Output file: mergedRegsTB.tsv
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.66.
def exportMergedRegsTB(mergedRegs):
"""Exports to tsv file"""
f = open('mergedRegsTB.tsv', 'w')
f.write('Chr.\tReg. Start\tReg. End\tReg. Length\tNum Samples\tCN-B\tCN-T\tnB\tnT\t"Genes"\n')
for key in sorted(mergedRegs.keys()):
for i in range(0, len(mergedRegs[key][0])):
# try:
try:
cnb = [cn for idx, cn in enumerate(mergedRegs[key][3][i])
if mergedRegs[key][4][i][idx]=='B']
cnt = [cn for idx, cn in enumerate(mergedRegs[key][3][i])
if mergedRegs[key][4][i][idx]=='T']
except:
import pdb; pdb.set_trace()
f.write(
# Chr
key+'\t'+
#Reg. Start and Reg. End
str(mergedRegs[key][0][i]).strip('[]').replace(', ','\t')+'\t'+
#Reg. Length
str(mergedRegs[key][0][i][1]-mergedRegs[key][0][i][0]+1)+'\t'+
#Num Samples
str(mergedRegs[key][1][i])+'\t'+
#CN-B
str(cnb).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
#CN-T
str(cnt).
replace('[','').
replace(']','').
replace("'", '')+'\t'+
#nB
str(sum([st=='B' for st in mergedRegs[key][4][i]]))+'\t'+
#nT
str(sum([st=='T' for st in mergedRegs[key][4][i]]))+'\t'+
#Genes
str(mergedRegs[key][2][i]).replace('set','').
replace(')','').
replace('(','').
replace('[','').
replace(']','').
replace("'", '')+'\n'
)
# except: import pdb; pdb.set_trace()
return 'mergedRegsTB.tsv'
mergeFragments
Merges all contiguous non-void region fragments
Last build was: Sun Jan 8 19:44:58 2012.
Required CPU time: 00:00:0.37.
def mergeFragments(intRegs):
"""Merges all contiguous non-void region fragments"""
extreg=dict()
for chrom in intRegs:
extreg[chrom] = list()
currExtRegBT = list()
currExtRegCN = list()
currExtRegS = [Inf,0]
currExtRegG = set()
currExtRegMaxhitsB = 0
currExtRegMaxhitsT = 0
for idx, frag in enumerate(intRegs[chrom][4]):
a = intRegs[chrom][0][idx][0]
b = intRegs[chrom][0][idx][1]
#import pdb; pdb.set_trace()
if frag != []:
currExtRegMaxhitsT = max(currExtRegMaxhitsT, sum([a1=='T' for a1 in frag]))
currExtRegMaxhitsB = max(currExtRegMaxhitsB, sum([a1=='B' for a1 in frag]))
currExtRegBT.extend(frag)
currExtRegCN.extend(intRegs[chrom][3][idx])
currExtRegS = [min(currExtRegS[0], a), max(currExtRegS[1],b)]
currExtRegG = currExtRegG.union(intRegs[chrom][2][idx])
else:
extreg[chrom].append([currExtRegS, currExtRegMaxhitsB,
currExtRegMaxhitsT, currExtRegBT, currExtRegCN, currExtRegG])
extreg[chrom].append([[a,b], '', '', '', '', 'unknown'])
currExtRegCN=list()
currExtRegBT=list()
currExtRegG = set()
currExtRegS = [Inf,0]
currExtRegMaxhitsT = 0
currExtRegMaxhitsB = 0
# handle last fragment (will get no '[]' as stop signal)
if frag != []:
extreg[chrom].append([currExtRegS, currExtRegMaxhitsB,
currExtRegMaxhitsT, currExtRegBT, currExtRegCN, currExtRegG])
return extreg
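The merge walks the ordered fragments of a chromosome and keeps extending the current region while fragments carry at least one hit, closing it when an empty fragment is met. A simplified sketch of that idea on invented intervals (the real node also accumulates B/T labels, copy numbers and gene sets, and records the empty fragments as 'unknown' regions):
# Each fragment is ((start, end), hits); an empty hit list closes the region.
frags = [((100, 200), ['T']), ((201, 300), ['B', 'T']),
         ((301, 400), []), ((401, 500), ['B'])]               # invented

extended, cur = [], None
for (start, end), hits in frags:
    if hits:
        cur = (start, end) if cur is None else (cur[0], end)
    elif cur is not None:
        extended.append(cur)
        cur = None
if cur is not None:        # the last fragment gets no empty-fragment stop signal
    extended.append(cur)

print(extended)            # [(100, 300), (401, 500)]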
computeFisher
Tests the hypothesis that a gene is hit by CNV in M samples out of N by chance
Last build was: Sun Jan 8 19:43:52 2012.
Required CPU time: 00:00:8.96.
def computeFisher(cnvect):
"""Tests the hyphotesis that a gene is hit by CNV in M samples out of N by chance"""
svec = cnvect[1]
tvec = cnvect[2]
diff = cnvect[0]
numsamp = len(svec[0])
fishval_diff = zeros(svec.shape[0])
fishval_abs = zeros(svec.shape[0])
for idx, row in enumerate(diff):
a11 = sum(row < 0)
a12 = numsamp-a11
a21 = sum(row > 0)
a22 = numsamp-a21
#fishval[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]), alternative='less')['p.value']
fishval_diff[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]))['p.value']
for idx, row in enumerate(svec):
a11 = sum(row != 2)
a12 = numsamp-a11
a21 = sum(tvec[idx] != 2)
a22 = numsamp-a21
fishval_abs[idx]=r.fisher_test(array([[a11,a12],[a21,a22]]))['p.value']
return fishval_abs, fishval_diff
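For each gene the node builds a 2x2 table (pairs with and without the event in the two groups) and runs Fisher's exact test through rpy. The same table can be checked with SciPy; this is not what the protocol runs, only an equivalent illustration with invented counts:
from scipy.stats import fisher_exact

a11, a12 = 6, 2    # pairs with a loss / without a loss   (invented)
a21, a22 = 1, 7    # pairs with a gain / without a gain   (invented)
odds_ratio, p_value = fisher_exact([[a11, a12], [a21, a22]])
print(p_value)     # two-sided p-value, matching R's fisher.test default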
prepareInput
Converts data to a format suitable for PennCNV
Output files: 11s.tab , 11t.tab , 12s.tab , 12t.tab , 13s.tab , 13t.tab , 14Bt.tab , 14s.tab , 19s.tab , 19t.tab , 1s.tab , 1t.tab , 20s.tab , 20t.tab , 21s.tab , 21t.tab , 2s.tab , 2t.tab , 32s.tab , 32t.tab , 33s.tab , 33t1.tab , 34s.tab , 34t.tab , 36s.tab , 36t.tab , 37s.tab , 37t.tab , 3s.tab , 3t.tab , 43s.tab , 43t.tab , 44s.tab , 44t.tab , 45s.tab , 45t.tab , 4s.tab , 4t.tab , 5Bt.tab , 5s.tab , 5t.tab , 6s.tab , 6t.tab , 7s.tab , 7t.tab , 8s.tab , 8t.tab .
Last build was: Sun Jan 8 03:42:33 2012.
Required CPU time: 00:04:57.23.
def prepareInput(fname, ssheet):
"""Converts data to a format suitable for PennCNV"""
ss = [l.split('\t') for l in open(ssheet).readlines()][1:]
scolumns = range(4,200,3)
#a=open(fname).readline()
#samples = [line.replace('.GType','') for line in a.split(',')[3::3]]
samples = [s[0] for s in ss]
fnames = list()
for k, sample in enumerate(samples):
print 'Preparing sample ' + sample + '...',
#print 'SKIPPING!'
os.system('cut -f 1-3,' + str(scolumns[k]) + '-' + str(scolumns[k]+2) + ' --delimiter , "' + fname +'" > ' + sample + '.csv')
os.system('cat ' + sample + ".csv | tr ',' '\t' > " + sample + '.tab')
os.system('rm ' + sample + '.csv')
fnames.append(sample + '.tab')
return fnames
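Each sample occupies three consecutive columns of the big comma-separated export (scolumns = range(4, 200, 3) above), which are pulled out with cut and converted to tabs with tr. Purely as an illustration of the commands issued for the first sample (file and sample names are placeholders):
fname, sample, k = 'pglfeb.csv', '11s', 0        # placeholders
scolumns = range(4, 200, 3)
col = scolumns[k]                                # columns 4-6 for the first sample

print('cut -f 1-3,' + str(col) + '-' + str(col + 2) +
      ' --delimiter , "' + fname + '" > ' + sample + '.csv')
print('cat ' + sample + ".csv | tr ',' '\\t' > " + sample + '.tab')
# cut -f 1-3,4-6 --delimiter , "pglfeb.csv" > 11s.csv
# cat 11s.csv | tr ',' '\t' > 11s.tab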
exportFigCNVlenghts
Creates a figure of the percentage of the genome affected by CNV in each sample
Output files: cnvlenghts.pdf , cnvlenghts_log.pdf .
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.04.
def exportFigCNVlenghts(countCNVlenghts_o):
"""Creates a figure of the samples based % of genome affected by CNV"""
r.pdf('cnvlenghts_log.pdf')
y = [int(x[1])/1000000 for x in countCNVlenghts_o if not 'B' in x[0]]
l = [x[0] for x in countCNVlenghts_o if not 'B' in x[0]]
yidx = sorted(range(len(y)), key = y.__getitem__)
ys = sorted(y)
ls = [l[x] for x in yidx]
r.barplot(ys, horiz=True, names_arg=ls, log='x', cex_names=0.5, las=1)
r.dev_off()
r.pdf('cnvlenghts.pdf')
y = [int(x[1]) for x in countCNVlenghts_o if not 'B' in x[0]]
l = [x[0] for x in countCNVlenghts_o if not 'B' in x[0]]
yidx = sorted(range(len(y)), key = y.__getitem__)
ys = sorted(y)
ls = [l[x] for x in yidx]
r.barplot(ys, horiz=True, names_arg=ls, cex_names=0.5, las=1)
#r.lines(range(0,len(ys)), r.cumsum(ys))
#r.lines(r.cumsum(y), range(0,len(y)))
r.dev_off()
return ('cnvlenghts.pdf', 'cnvlenghts_log.pdf')
makeBed
Creates a BED file for use with the Genome Browser
Output file: pglfeb.csv.bed
Last build was: Sun Jan 8 19:45:16 2012.
Required CPU time: 00:00:0.42.
def makeBed(CNVinput):
"""Create Bed file for use with Genome Browser"""
os.system('perl ' + penncnv_path + '/visualize_cnv.pl -format bed -output "' + fname + '.bed" "' + CNVinput + '"')
return fname + '.bed'
CNVDiffMat
Makes a matrix of B/T difference in CN between same genes of different samples
Last build was: Sun Jan 8 19:43:41 2012.
Required CPU time: 00:00:2.43.
def CNVDiffMat(fname, genes, allsamples):
"""Makes a matrix of B/T difference in CN between same genes of different samples"""
data=list()
f=fname
for line in f:
line=line[0:-1]
data.append(line.split('\t'))
data=array(data)
hashd = dict( zip( genes, range(0,len(genes)) ) )
p = getPairings()
sp = [x[0] for x in p]
tp = [x[1] for x in p]
s = list()
for sample in sp:
for line in data:
if sample == line[6]:
s.append(line)
t = list()
for sample in tp:
for line in data:
if sample == line[6]:
t.append(line)
samples = sp
s_vectors = np.zeros((len(samples),len(genes)))*nan
s_vectors_min =np.zeros((len(samples),len(genes)))*nan
s_vectors_num =np.zeros((len(samples),len(genes)))*nan
s_vectors_cn = np.zeros((len(samples),len(genes)))*nan
for s_idx, sample in enumerate(samples):
print s_idx, 'of', len(samples),',',
x = nonzero([l[6]==sample for l in s])[0]
sub_s = [s[pickme] for pickme in x]
sub_s = array(sub_s)
sample_genes = sub_s[:,10]
for gene in unique(sample_genes):
if isnan(s_vectors[s_idx][hashd[gene]]):
temp = array(sub_s[sub_s[:,10]==gene][:,5], int)
maxidx = argmax(temp)
minidx = argmin(temp)
absmax = temp[maxidx]
absmin = temp[minidx]
s_vectors[s_idx][hashd[gene]] = absmax
s_vectors_min[s_idx][hashd[gene]] = absmin
s_vectors_cn[s_idx][hashd[gene]] = assignGeneCN(temp)
s_vectors_num[s_idx][hashd[gene]] = len(temp)
s_vectors[isnan(s_vectors)] = 2
s_vectors_min[isnan(s_vectors_min)] = 2
s_vectors_cn[isnan(s_vectors_cn)] = 2
s_vectors_num[isnan(s_vectors_num)] = 0
samples = tp
t_vectors = np.zeros((len(samples),len(genes)))*nan
t_vectors_min = np.zeros((len(samples),len(genes)))*nan
t_vectors_num =np.zeros((len(samples),len(genes)))*nan
t_vectors_cn = np.zeros((len(samples),len(genes)))*nan
for t_idx, sample in enumerate(samples):
print t_idx, 'of', len(samples),',',
x = nonzero([l[6]==sample for l in t])[0]
sub_t = [t[pickme] for pickme in x]
sub_t = array(sub_t)
try:
sample_genes = sub_t[:,10]
except IndexError:
import pdb; pdb.set_trace()
for gene in unique(sample_genes):
#if gene == 'PC':
# import pdb; pdb.set_trace()
if isnan(t_vectors[t_idx][hashd[gene]]):
temp = array(sub_t[sub_t[:,10]==gene][:,5], int)
maxidx = argmax(temp)
absmax = temp[maxidx]
minidx = argmin(temp)
absmin = temp[minidx]
t_vectors[t_idx][hashd[gene]] = absmax
t_vectors_min[t_idx][hashd[gene]] = absmin
t_vectors_num[t_idx][hashd[gene]] = len(temp)
t_vectors_cn[t_idx][hashd[gene]] = assignGeneCN(temp)
t_vectors[isnan(t_vectors)]=2
t_vectors_min[isnan(t_vectors_min)] = 2
t_vectors_cn[isnan(t_vectors_cn)] = 2
t_vectors_num[isnan(t_vectors_num)] = 0
return [t_vectors_cn.T-s_vectors_cn.T, #0
s_vectors_cn.T, t_vectors_cn.T, #1,2
s_vectors_min.T, t_vectors_min.T, #3,4
s_vectors_num.T, t_vectors_num.T, #5,6
s_vectors.T, t_vectors.T] #7,8
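Element 0 of the returned list is the per-gene, per-pair difference between the T and B copy-number assignments (genes end up on the rows after the transposition). A toy version with invented values:
import numpy as np

# rows = paired samples, columns = genes; 2 = diploid (no CNV call)
s_vectors_cn = np.array([[2, 1, 2],
                         [3, 2, 2]], dtype=float)   # invented B-group CNs
t_vectors_cn = np.array([[2, 3, 2],
                         [1, 2, 4]], dtype=float)   # invented T-group CNs

diff = t_vectors_cn.T - s_vectors_cn.T              # genes on rows, pairs on columns
print(diff)
# [[ 0. -2.]
#  [ 2.  0.]
#  [ 0.  2.]]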