In [4]:
#Necessary libraries to import
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import MDAnalysis as md
import seaborn as sns
import scipy as sci
In [32]:
#SUM_HILLS
# sum_hills can be used to check the progress of the free energy surface (fes) calculation as a function of 1 collective varible (CV)
# This cell plots the various fes obtained at different points of the simulation to understand its development
# To obtain these files: launch sum_hills with plumed using as input your HILLS file.
# An example is found below. We used the following line to generate the files found in data/post-processing/fes
#
# plumed sum_hills --hills HILLS_800ns --mintozero --stride 80000
# We generated 10 files that go from fes0.dat to fes9.dat... Therefore, the range in this loop goes from [0 to 10).
# Adjust the range as neccessary for your amount of fes files
# Select the range of the x and y axis:
x_min, x_max = 0, 3.5
y_min, y_max = 0, 70
stride = 80000
for i in range(10):
data = np.loadtxt('./data/post-processing/fes/fes_{0}.dat'.format(i), comments='#', unpack=True)
plt.plot(data[0], data[1], label="{0}ns".format(int((i+1)*stride/1000)))
plt.axis([x_min,x_max,y_min,y_max])
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('CV1')
plt.legend()
plt.show()
In [38]:
# PLOT A 1D or a 2D FES
############################################
# Select here plot type ('simple' for a 1D plot, 'surface' for a 3D surface, or 'contour' for a 2D contour-plot)
# Select 'simple' to plot the fes as a function of 1 collective variable (CV)
# Select 'surface' or 'contour' to plot the fes a a function of 2 cv
plot_type = 'contour'
############################################
# Minimum value of the free energy (if you used --mintozero option in the sum_hills command then set to 0)
min_fes = 0
# Maximum value of the free energy in the explored phase space
# here it is recommended to select a value that is around 5 kJ/mol more from your highest energy region of interest
max_fes = 70
# Contour plot resolution, default kJ/mol
contour_stride = 5
#location of the data file
data = np.loadtxt("./data/post-processing/2D-distance.dat", comments="#", unpack=True)
nbins_cv1 = int(np.loadtxt("./data/post-processing/2D-distance.dat", unpack=True,
skiprows=3, max_rows=1, comments=None, dtype=str, usecols=(3)))
cv1 = data[0]
if plot_type == "surface" or plot_type == "contour":
cv2 = data[1]
energy = data[2]
e_lims = np.reshape(energy, (-1, nbins_cv1))
cv1_lims = np.linspace(np.min(cv1), np.max(cv1), e_lims.shape[1])
cv2_lims = np.linspace(np.min(cv2), np.max(cv2), e_lims.shape[0])
else:
energy = data[1]
e_lims = energy
if plot_type == "surface":
fig = go.Figure(data=[go.Surface(z=e_lims, x=cv1_lims, y=cv2_lims, cmin=min_fes, cmax=max_fes)])
fig.update_layout(
scene = dict(zaxis = dict(nticks=4, range=[min_fes,max_fes],),))
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
highlightcolor="limegreen", project_z=True, size=0.05))
fig.show()
elif plot_type == "contour":
fig = go.Figure(data=go.Contour(z=e_lims, x=cv1_lims, y=cv2_lims,
contours=dict(start=min_fes, end=max_fes, size=contour_stride),
colorbar=dict(title='Energy [kJ/mol]')))
fig.update_layout(xaxis_title='CV1', yaxis_title='CV2')
fig.show()
else:
plt.plot(cv1, energy)
plt.show()
In [7]:
# This cell performs frame selections that are within a region of the 1D or 2D CV space from your simulation trajectory
# Very Important! The trajectory and the COLVAR file MUST have the same amount of data points (i.e, same stride)
# Switch mode to '2D' if you want to select frames that lie in between 2D CV values
mode = '1D'
# Load the trajectory and topology
# In this case we upload an initial coordinate file .pdb, and a trajectoty file .trr
# These two files must have the same amount of atoms
# Please check MDAnalysis.Universe documentation to know which type of files can be used for input and output.
u = md.Universe("data/post-processing/start.pdb", "data/post-processing/trj.trr")
# Load the COLVAR file
colvar_data = np.loadtxt("data/post-processing/COLVAR", comments='#', unpack=True)
# Define the selection criteria
# Example: Extract frames where cv1 is between [0.36, 0.42] and cv2 is between [2.19, 2.95]
# If you select 1D computation ignore the cv2 ranges
cv1_min, cv1_max = 0.36, 0.42
cv2_min, cv2_max = 2.19,2.95
# Get the CV values from the COLVAR file
# The column index for your first and second (if used) CV as saved in the COLVAR file.
# (remember that python index start from 0)
cv1_column = 1
cv2_column = 2
cv1 = colvar_data[:, cv1_column]
if mode == '2D':
cv2 = colvar_data[:, cv2_column]
selection_indices = np.where((cv1 > cv1_min) & (cv1 < cv1_max))
if mode == '2D':
selection_indices = np.where((cv1 > cv1_min) & (cv1 < cv1_max) & (cv2 > cv2_min) & (cv2 < cv2_max))
for i in selection_indices[0]:
u.trajectory[i]
with md.Writer("extracted_frames.xtc", u.atoms.n_atoms) as W:
W.write(u)
In [10]:
# Here we plot the estimate of the difference in Gibbs free energy (ΔG) between the unbound an bound states.
# This is done by solving a discrete version of equation [15] in Limongelli et al. publication (10.1073/pnas.1303186110)
# In this code, the integral is solved using the trapezoidal approximation
# ref should be a point that represents the unbound state iso-energetic region. It is used as a reference value.
# In this case we give as input 2.75 nano-meters
ref = 2.75
# Select if you want to calculate the ΔG using '1D' or '2D' fes
mode == '1D'
# Define the selection criteria of where would you like to compute the delta in relation to the reference value
# Example: Compute the ΔG for the region of cv1 between [0.36, 1.5] and cv2 between [2.19, 2.95]
# If you select a '1D' computation ignore the cv2 ranges
cv1_min, cv1_max = 0.36, 0.42
cv2_min, cv2_max = 2.19, 2.95
Gt = []
beta = (8.314*10**(-3)*300)**-1
# Select your range of fes data files to process in this estimate
# In our case, we have 10 fes files there fore our range goes from [0 to 11)
for j in range(0,11):
data = np.loadtxt("data/post-processing/fes/fes_{0}.dat".format(j), comments="#", unpack=True)
cv1 = data[0]
dcv1 = cv1[1] - cv1[0]
energy = data[1]
if mode == '2D':
cv2 = data[1]
dcv2 = cv2[nbins_cv1+1] - cv2[0]
energy = data[2]
if mode == '2D':
x = np.where((cv1_min <= cv1) & (cv1 <= cv1_max) & (cv2 >= cv2_min) & (cv2 <= cv2_max))
else:
x = np.where((0.36 <= cv1_min) & (cv1 <= cv1_max))
fez = [energy[i] for i in x[0]]
fez = np.array(fez)
#For 2D you might want to adapt the code to allow "ref" to take a 2D interval instead
w = -beta*(fez - energy[np.where(abs(data[0]-ref) < dcv1/2)])
mid = [np.exp(w[0])]
for i in range(1, len(w)-1):
mid.append(2*np.exp(w[i]))
mid.append(np.exp(w[-1]))
mid = (dcv1/2)*np.sum(np.array(mid))
if mode == '2D':
mid *= dcv2/2
kb = np.pi*(0.1**2)*mid
G = -(1/beta)*np.log(0.6*kb)
Gt.append(G*0.239)
plt.plot(Gt, label='ΔG(t)')
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('Simulation time')
plt.legend()
plt.show()
In [17]:
# This cell computes a weighted average-on-the-fly of the ΔG(t) estimate obtained in the previous cell
# This weighted average gives higher weights to data points the closer they are to the end of the simulation
# The idea for this is that well-tempered metadynamics analyitically converges to the correct free energy surface as t-->inf
# Select from which data point to start to compute the weighted average
# ideally you want to select a data point after which your ΔG(t) estimate begins to plateau
init_time = 4
avgG = []
rej_time = init_time + 1
for i in range(rej_time, 12):
subGt = Gt[rej_time-1:i]
w = np.zeros_like(subGt)
for j in range(1, len(subGt) + 1):
w[j - 1] = j / len(subGt)
wsubGt = w * np.array(subGt)
avgG.append(np.sum(wsubGt) / np.sum(w))
plt.plot(np.arange(rej_time-1,11), avgG, label="Average G(t)")
plt.plot(Gt, label='G(t)')
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('Simulation time')
plt.legend()
plt.show()
In [26]:
# This cell performs a statistical analysis on the ΔG(t) estimate called Bootstrap
# The idea is that is will randomly sample points from exclusive sections on the ΔG(t) estimate
# These randomly sampled points, when done sufficiently for a large enough data set, should behave like a Normal distribution ~N(μ,σ)
# The best way to use this anlysis is to sample from sections of the ΔG(t) estimate in which it begins to plateau
# The normal distributions obtained should have similar μ and a vanishing σ the closer it is to the end of the simulation
# Important! In this example we perform the Bootstrap analysis for only 10 data points.
# Therfore, we do not have a large enough data set that represents a good bootstrap analysis.
# If you wish to use this method for a real case scenario, you should use many more data points (i.e, more fes.dat files)
# Number of gaussian distributions that you wish to obtain or equivalently: number of slices to analyze from the ΔG(t) estimate
slices = 5
#Initial data point from which to calculate the bootstrap
init_t = 0
# number of times the method will sample from the slice (with replacement). Generally 1000 is a good number
n_samples = 1000
data_dict = {}
Gt = Gt[init_t:]
bootstraps = np.zeros((slices, n_samples))
Gt = np.array(Gt)
for i in range(1, slices+1):
ranges = np.arange(int((len(Gt)/slices)*(i-1)), int((len(Gt)/slices)*i))
for j in range(n_samples):
indices = np.random.choice(ranges, size=1000, replace=True)
bootstraps[i - 1][j] = (np.average(Gt[indices]))
for i in range(slices):
print("Slice {0} has a distribution with mu and sigma: {1:.2f}, {2:.2f}".format(i, np.average(bootstraps[i]), np.std(bootstraps[i])))
data_dict.update({'Slice {0}'.format(i) : bootstraps[i]})
fig, axes = plt.subplots(len(data_dict), 1, sharex=True, sharey=False, figsize=(20,40))
for i, (name, data) in enumerate(data_dict.items()):
sns.kdeplot(data, ax=axes[i], fill=True, common_norm=False, legend=False, color=plt.cm.viridis(i / len(data_dict)))
axes[i].set_ylabel(name)
plt.xlabel("ΔG [kCal/mol")
plt.suptitle("Bootstrap Analysis Distributions")
plt.show()
In [ ]: