"""
05 Jul 2013
"""
from pytadbit.imp.CONFIG import CONFIG, NROUNDS, STEPS, LSTEPS
from pytadbit.imp.structuralmodels import StructuralModels
from pytadbit.imp.impmodel import IMPmodel
from scipy import polyfit
from math import fabs, pow as power
from cPickle import load, dump
import multiprocessing as mu
from sys import stdout
from os.path import exists
import IMP.core
import IMP.algebra
import IMP.display
from IMP.container import ListSingletonContainer
from IMP import Model
from IMP import FloatKey
try:
import IMP.kernel
except ImportError:
pass
IMP.set_check_level(IMP.NONE)
IMP.set_log_level(IMP.SILENT)
[docs]def generate_3d_models(zscores, resolution, start=1, n_models=5000, n_keep=1000,
close_bins=1, n_cpus=1, keep_all=False, verbose=0,
outfile=None, config=CONFIG['dmel_01'], values=None):
"""
This function generates three-dimensional models starting from Hi-C data.
The final analysis will be performed on the n_keep top models.
:param zscores: the dictionary of the Z-score values calculated from the
Hi-C pairwise interactions
:param resolution: number of nucleotides per Hi-C bin. This will be the
number of nucleotides in each model's particle
:param 5000 n_models: number of models to generate
:param 1000 n_keep: number of models used in the final analysis (usually
the top 20% of the generated models). The models are ranked according to
their objective function value (the lower the better)
:param False keep_all: whether or not to keep the discarded models (if
True, models will be stored under StructuralModels.bad_models)
:param 1 close_bins: number of particles away (i.e. the bin number
difference) a particle pair must be in order to be considered as
neighbors (e.g. 1 means consecutive particles)
:param n_cpus: number of CPUs to use
:param False verbose: if set to True, information about the distance, force
and Z-score between particles will be printed
:param None values: the normalized Hi-C data in a list of lists (equivalent
to a square matrix)
:param CONFIG['dmel_01'] config: a dictionary containing the standard
parameters used to generate the models. The dictionary should contain
the keys kforce, lowrdist, maxdist, upfreq and lowfreq. Examples can be
seen by doing:
::
from pytadbit.imp.CONFIG import CONFIG
where CONFIG is a dictionary of dictionaries to be passed to this function:
:::
CONFIG = {
'dmel_01': {
# Paramaters for the Hi-C dataset from:
'reference' : 'victor corces dataset 2013',
# Force applied to the restraints inferred to neighbor particles
'kforce' : 5,
# Minimum distance between two non-bonded particles
'lowrdist' : 100,
# Maximum experimental contact distance
'maxdist' : 600, # OPTIMIZATION: 500-1200
# Maximum threshold used to decide which experimental values have to be
# included in the computation of restraints. Z-score values greater than upfreq
# and less than lowfreq will be included, while all the others will be rejected
'upfreq' : 0.3, # OPTIMIZATION: min/max Z-score
# Minimum thresholds used to decide which experimental values have to be
# included in the computation of restraints. Z-score values bigger than upfreq
# and less that lowfreq will be include, whereas all the others will be rejected
'lowfreq' : -0.7 # OPTIMIZATION: min/max Z-score
# Space occupied by a nucleotide (nm)
'scale' : 0.005
}
}
:returns: a TheeDeeModels object
"""
# Main config parameters
global CONFIG
CONFIG = config
# Particles initial radius
global RADIUS
RADIUS = resolution * CONFIG['scale']
CONFIG['lowrdist'] = RADIUS * 2.
# get SLOPE and regression for all particles of the z-score data
global SLOPE, INTERCEPT
zsc_vals = [zscores[i][j] for i in zscores for j in zscores[i]]
zmin = min(zsc_vals)
zmax = max(zsc_vals)
SLOPE, INTERCEPT = polyfit([zmin, zmax], [CONFIG['maxdist'],
CONFIG['lowrdist']], 1)
# get SLOPE and regression for neighbors of the z-score data
global NSLOPE, NINTERCEPT
xarray = [zscores[i][j] for i in zscores for j in zscores[i]
if abs(int(i) - int(j)) <= (close_bins + 1)]
yarray = [RADIUS * 2 for _ in xrange(len(xarray))]
NSLOPE, NINTERCEPT = polyfit(xarray, yarray, 1)
# zsc = set([int (k) for k in zscores.keys()] +
# reduce(lambda x, y: x + y,
# [[int (k) for k in j.keys()] for j in zscores.values()]
# ))
global LOCI, NLOCI
LOCI = range(max([int (k) for k in zscores.keys()] +
reduce(lambda x, y: x + y,
[[int (k) for k in j.keys()]
for j in zscores.values()])) + 1)
NLOCI = len(LOCI)
# Z-scores
global PDIST
PDIST = zscores
# random inital number
global START
START = start
models, bad_models = multi_process_model_generation(
n_cpus, n_models, n_keep, keep_all, verbose)
if outfile:
if exists(outfile):
old_models, old_bad_models = load(open(outfile))
else:
old_models, old_bad_models = {}, {}
models.update(old_models)
bad_models.update(old_bad_models)
out = open(outfile, 'w')
dump((models, bad_models), out)
out.close()
else:
return StructuralModels(NLOCI, models, bad_models, resolution,
original_data=values, zscores=zscores,
config=CONFIG)
def multi_process_model_generation(n_cpus, n_models, n_keep, keep_all, verbose):
"""
Parallelize the
:func:`pytadbit.imp.imp_model.StructuralModels.generate_IMPmodel`.
:param n_cpus: number of CPUs to use
:param n_models: number of models to generate
"""
pool = mu.Pool(n_cpus)
jobs = {}
for rand_init in xrange(START, n_models + START):
jobs[rand_init] = pool.apply_async(_do_it, args=(rand_init, verbose))
pool.close()
pool.join()
results = []
for rand_init in xrange(START, n_models + START):
results.append((rand_init, jobs[rand_init].get()))
models = {}
bad_models = {}
for i, (_, m) in enumerate(
sorted(results, key=lambda x: x[1]['objfun'])[:n_keep]):
models[i] = m
if keep_all:
for i, (_, m) in enumerate(
sorted(results, key=lambda x: x[1]['objfun'])[n_keep:]):
bad_models[i+n_keep] = m
return models, bad_models
def _do_it(num, verbose):
"""
Workaround in order pass to the multiprocessing queue a pickable object.
"""
return generate_IMPmodel(num, verbose)
def generate_IMPmodel(rand_init, verbose=0):
"""
:param rand_init: random number kept as model key, for reproducibility.
:returns: a model, that is a dictionary with the log of the objective
function value optimization, and the coordinates of each particles.
"""
IMP.random_number_generator.seed(rand_init)
log_energies = []
model = {'rk' : IMP.FloatKey("radius"),
'model' : Model(),
'ps' : None,
'pps' : None}
model['ps'] = ListSingletonContainer(IMP.core.create_xyzr_particles(
model['model'], NLOCI, RADIUS, 100000))
model['ps'].set_name("")
# initialize each particles
for i in range(0, NLOCI):
p = model['ps'].get_particle(i)
p.set_name(str(LOCI[i]))
# radius = diameter/2 (0.01/2)
# computed following the relationship with the 30nm vs 40nm fiber
newrk = RADIUS
p.set_value(model['rk'], newrk)
# Restraints between pairs of LOCI proportional to the PDIST
try:
# model['pps'] = IMP.ParticlePairs() # this was used for IMP
# version: 847e65d44da7d06718bcad366b09264c818752d5
model['pps'] = IMP.ParticlePairs() # this was used for IMP
except AttributeError:
model['pps'] = IMP.kernel.ParticlePairsTemp()
# CALL BIG FUNCTION
addAllHarmonics(model, verbose=verbose)
# Setup an excluded volume restraint between a bunch of particles
# with radius
r = IMP.core.ExcludedVolumeRestraint(model['ps'], CONFIG['kforce'])
model['model'].add_restraint(r)
if verbose == 3:
print "Total number of restraints: %i" % (
model['model'].get_number_of_restraints())
# Set up optimizer
lo = IMP.core.ConjugateGradients()
lo.set_model(model['model'])
o = IMP.core.MonteCarloWithLocalOptimization(lo, LSTEPS)
o.set_return_best(True)
fk = IMP.core.XYZ.get_xyz_keys()
ptmp = model['ps'].get_particles()
mov = IMP.core.NormalMover(ptmp, fk, 0.25)
o.add_mover(mov)
# o.add_optimizer_state(log)
# Optimizer's parameters
if verbose == 3:
print "nrounds: %i, steps: %i, lsteps: %i" % (NROUNDS, STEPS, LSTEPS)
# Start optimization and save an VRML after 100 MC moves
log_energies.append(model['model'].evaluate(False))
if verbose == 3:
print "Start", log_energies[-1]
#"""simulated_annealing: preform simulated annealing for at most nrounds
# iterations. The optimization stops if the score does not change more than
# a value defined by endLoopValue and for stopCount iterations.
# @param endLoopCount = Counter that increments if the score of two models
# did not change more than a value
# @param stopCount = Maximum values of iteration during which the score
# did not change more than a specific value
# @paramendLoopValue = Threshold used to compute the value that defines
# if the endLoopCounter should be incremented or not"""
# IMP.fivec.simulatedannealing.partial_rounds(m, o, nrounds, steps)
endLoopCount = 0
stopCount = 10
endLoopValue = 0.00001
# alpha is a parameter that takes into account the number of particles in
# the model (NLOCI).
# The multiplier (in this case is 1.0) is used to give a different weight
# to the number of particles
alpha = 1.0 * NLOCI
# During the firsts hightemp iterations, do not stop the optimization
hightemp = int(0.025 * NROUNDS)
for i in range(0, hightemp):
temperature = alpha * (1.1 * NROUNDS - i) / NROUNDS
o.set_kt(temperature)
log_energies.append(o.optimize(STEPS))
if verbose == 3:
print i, log_energies[-1], o.get_kt()
# After the firsts hightemp iterations, stop the optimization if the score
# does not change by more than a value defined by endLoopValue and
# for stopCount iterations
lownrj = log_energies[-1]
for i in range(hightemp, NROUNDS):
temperature = alpha * (1.1 * NROUNDS - i) / NROUNDS
o.set_kt(temperature)
log_energies.append(o.optimize(STEPS))
if verbose == 3:
print i, log_energies[-1], o.get_kt()
# Calculate the score variation and check if the optimization
# can be stopped or not
if lownrj > 0:
deltaE = fabs((log_energies[-1] - lownrj) / lownrj)
else:
deltaE = log_energies[-1]
if (deltaE < endLoopValue and endLoopCount == stopCount):
break
elif (deltaE < endLoopValue and endLoopCount < stopCount):
endLoopCount += 1
lownrj = log_energies[-1]
else:
endLoopCount = 0
lownrj = log_energies[-1]
#"""simulated_annealing_full: preform simulated annealing for nrounds
# iterations"""
# # IMP.fivec.simulatedannealing.full_rounds(m, o, nrounds, steps)
# alpha = 1.0 * NLOCI
# for i in range(0,nrounds):
# temperature = alpha * (1.1 * nrounds - i) / nrounds
# o.set_kt(temperature)
# e = o.optimize(steps)
# print str(i) + " " + str(e) + " " + str(o.get_kt())
# Print the IMP score of the final model
log_energies.append(model['model'].evaluate(False))
if verbose >=1:
if verbose >= 2 or not rand_init % 100:
print 'Model %s IMP Objective Function: %s' % (
rand_init, log_energies[-1])
x, y, z, radius = (FloatKey("x"), FloatKey("y"),
FloatKey("z"), FloatKey("radius"))
result = IMPmodel({'log_objfun' : log_energies,
'objfun' : log_energies[-1],
'x' : [],
'y' : [],
'z' : [],
'radius' : None,
'cluster' : 'Singleton',
'rand_init' : rand_init})
for part in model['ps'].get_particles():
result['x'].append(part.get_value(x))
result['y'].append(part.get_value(y))
result['z'].append(part.get_value(z))
if verbose == 3:
print (part.get_name(), part.get_value(x), part.get_value(y),
part.get_value(z), part.get_value(radius))
# gets radius from last particle, assuming that all are the same
result['radius'] = part.get_value(radius)
return result
def addAllHarmonics(model, verbose=False):
"""
Add harmonics to all pair of particles.
"""
for i in range(0, NLOCI):
p1 = model['ps'].get_particle(i)
x = p1.get_name()
num_loci1 = int(x)
for j in range(i+1, NLOCI):
p2 = model['ps'].get_particle(j)
y = p2.get_name()
num_loci2 = int(y)
log = addHarmonicPair(model, p1, p2, x, y, j, num_loci1, num_loci2)
if verbose:
stdout.write(log)
def addHarmonicPair(model, p1, p2, x, y, j, num_loci1, num_loci2):
"""
add harmonic to a given pair of particles
:param model: a model dictionary that contains IMP model, singleton
containers...
:param p1: first particle
:param p2: second particle
:param x: first particle name
:param y: second particle name
:param j: id of second particle
:param num_loci1: index of the first particle
:param num_loci2: index of the second particle
"""
seqdist = num_loci2 - num_loci1
log = ''
# SHORT RANGE DISTANCE BETWEEN TWO CONSECUTIVE LOCI
if (seqdist == 1):
if (x in PDIST and y in PDIST[x]
and float(PDIST[x][y]) > CONFIG['upfreq']):
kforce1 = CONFIG['kforce']
log += addHarmonicNeighborsRestraints(model, p1, p2, kforce1)
#print "harmo1\t%s\t%s\t%f\t%f" % ( x, y, dist1, kforce1)
else:
kforce1 = CONFIG['kforce']
dist1 = (p1.get_value(model['rk']) + p2.get_value(model['rk']))
log += addHarmonicUpperBoundRestraints(model, p1, p2,
dist1, kforce1)
#print "upper1\t%s\t%s\t%f\t%f" % ( x, y, dist1, kforce1)
# SHORT RANGE DISTANCE BETWEEN TWO SEQDIST = 2
elif (seqdist == 2):
# if (x in PDIST and y in PDIST[x] and float(PDIST[x][y]) > CONFIG['upfreq']):
# kforce2 = CONFIG['kforce']
# log += addHarmonicNeighborsRestraints(model, p1, p2, kforce2)
# else:
# p3 = model['ps'].get_particle(j-1)
# kforce2 = CONFIG['kforce']
# dist2 = (p1.get_value(model['rk']) + p2.get_value(model['rk'])
# + 2.0 * p3.get_value(model['rk']))
# log += addHarmonicUpperBoundRestraints(model, p1, p2,
# dist2, kforce2)
# #print "upper2\t%s\t%s\t%f\t%f" % ( x, y, dist2, kforce2)
p3 = model['ps'].get_particle(j-1)
kforce2 = CONFIG['kforce']
dist2 = (p1.get_value(model['rk']) + p2.get_value(model['rk'])
+ 2.0 * p3.get_value(model['rk']))
log += addHarmonicUpperBoundRestraints(model, p1, p2,
dist2, kforce2)
#print "upper2\t%s\t%s\t%f\t%f" % ( x, y, dist2, kforce2)
else:
# LONG RANGE DISTANCE DISTANCE BETWEEN TWO NON-CONSECUTIVE LOCI
if (x in PDIST and y in PDIST[x]):
# FREQUENCY > UPFREQ
if (float(PDIST[x][y]) > CONFIG['upfreq']):
kforce3 = kForce(float(PDIST[x][y]))
log += addHarmonicRestraints(model, p1, p2,
float(PDIST[x][y]), kforce3)
#print "harmo3\t%s\t%s\t%f\t%f" % ( x, y, dist3, kforce3)
# FREQUENCY > LOW THIS HAS TO BE THE THRESHOLD FOR
# "PHYSICAL INTERACTIONS"
elif (float(PDIST[x][y]) < CONFIG['lowfreq']):
kforce3 = kForce(float(PDIST[x][y]))
log += addHarmonicLowerBoundRestraints(model, p1, p2,
float(PDIST[x][y]),
kforce3)
#print "lower3\t%s\t%s\t%f\t%f" % ( x, y, dist3, kforce3)
else:
return log
# X IN PDIST BY Y NOT IN PDIST[X]
elif (x in PDIST): # and y not in PDIST[x]):
if (num_loci2 > num_loci1):
prev_num = num_loci2 - 1
pnext_num = num_loci2 + 1
else:
prev_num = num_loci1 - 1
pnext_num = num_loci1 + 1
prev = str(prev_num)
pnext = str(pnext_num)
if (prev in PDIST[x] and pnext in PDIST[x]):
virt_freq = (float(PDIST[x][prev]) +
float(PDIST[x][pnext])) / 2.0
if (virt_freq > CONFIG['upfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
#print "harmo4\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower4\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
elif (pnext in PDIST[x]):
virt_freq = float(PDIST[x][pnext])
if (virt_freq > CONFIG['upfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
#print "harmo5\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower5\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
elif (prev in PDIST[x]):
virt_freq = float(PDIST[x][prev])
if (virt_freq > CONFIG['upfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
#print "harmo6\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower6\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
else:
return log
# MISSING DATA (X)
else:
if (num_loci2 > num_loci1):
xprev_num = num_loci1 - 1
xpnext_num = num_loci1 + 1
prev_num = num_loci2 - 1
pnext_num = num_loci2 + 1
else:
xprev_num = num_loci2 - 1
xpnext_num = num_loci2 + 1
prev_num = num_loci1 - 1
pnext_num = num_loci1 + 1
xprev = str(xprev_num)
xpnext = str(xpnext_num)
prev = str(prev_num)
pnext = str(pnext_num)
# CASE 1
if (xprev in PDIST and xpnext in PDIST):
if (y in PDIST[xprev] and y in PDIST[xpnext]):
virt_freq = (float(PDIST[xprev][y]) +
float(PDIST[xpnext][y]) ) / 2.0
kforce4 = 0.5 * kForce(virt_freq)
elif (y in PDIST[xprev]):
virt_freq = float(PDIST[xprev][y])
kforce4 = 0.5 * kForce(virt_freq)
elif (y in PDIST[xpnext]):
virt_freq = float(PDIST[xpnext][y])
kforce4 = 0.5 * kForce(virt_freq)
else:
return log
if (virt_freq > CONFIG['upfreq']):
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower7\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
# CASE 2
elif (xprev in PDIST and y in PDIST[xprev]):
virt_freq = float(PDIST[xprev][y])
if (virt_freq > CONFIG['upfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
#print "harmo8\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower8\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
# CASE 3
elif (xpnext in PDIST and y in PDIST[xpnext]):
virt_freq = float(PDIST[xpnext][y])
if (virt_freq > CONFIG['upfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicRestraints(model, p1, p2,
virt_freq, kforce4)
#print "harmo9\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
elif (virt_freq < CONFIG['lowfreq']):
kforce4 = 0.5 * kForce(virt_freq)
log += addHarmonicLowerBoundRestraints(model, p1, p2,
virt_freq, kforce4)
#print "lower9\t%s\t%s\t%f\t%f" % ( x, y, dist4, kforce4)
else:
return log
else:
return log
return log
def distConseq12(freq):
"""
Function mapping the Z-scores into distances for neighbor fragments
"""
return (NSLOPE * freq) + NINTERCEPT
def distance(freq):
"""
Function mapping the Z-scores into distances for non-neighbor fragments
"""
return (SLOPE * freq) + INTERCEPT
def addHarmonicNeighborsRestraints(model, p1, p2, kforce):
dist = distConseq12(PDIST[p1.get_name()][p2.get_name()])
p = IMP.ParticlePair(p1, p2)
model['pps'].append(p)
dr = IMP.core.DistanceRestraint(IMP.core.Harmonic(dist, kforce),p1, p2)
model['model'].add_restraint(dr)
return "addHn\t%s\t%s\t%f\t%f\n" % (p1.get_name(), p2.get_name(),
dist, kforce)
def addHarmonicUpperBoundRestraints(model, p1, p2, dist, kforce):
#dist = (p1.get_value(rk) + p2.get_value(rk))
p = IMP.ParticlePair(p1, p2)
model['pps'].append(p)
dr = IMP.core.DistanceRestraint(IMP.core.HarmonicUpperBound(dist, kforce),
p1, p2)
model['model'].add_restraint(dr)
return "addHu\t%s\t%s\t%f\t%f\n" % (p1.get_name(), p2.get_name(),
dist, kforce)
def addHarmonicRestraints(model, p1, p2, freq, kforce):
dist = distance(freq)
p = IMP.ParticlePair(p1, p2)
model['pps'].append(p)
dr = IMP.core.DistanceRestraint(IMP.core.Harmonic(dist, kforce),p1, p2)
model['model'].add_restraint(dr)
return "addHa\t%s\t%s\t%f\t%f\n" % (p1.get_name(), p2.get_name(),
dist, kforce)
def addHarmonicLowerBoundRestraints(model, p1, p2, freq, kforce):
dist = distance(freq)
p = IMP.ParticlePair(p1, p2)
model['pps'].append(p)
dr = IMP.core.DistanceRestraint(IMP.core.HarmonicLowerBound(dist, kforce),
p1, p2)
model['model'].add_restraint(dr)
return "addHl\t%s\t%s\t%f\t%f\n" % (p1.get_name(), p2.get_name(),
dist, kforce)
def kForce(freq):
"""
Function to assign to each restraint a force proportional to the underlying
experimental value.
"""
return power(fabs(freq), 0.5 )