#Necessary libraries to import 
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import MDAnalysis as md
import seaborn as sns
import scipy as sci

#SUM_HILLS
# sum_hills can be used to check the progress of the free energy surface (fes) calculation as a function of 1 collective varible (CV)
# This cell plots the various fes obtained at different points of the simulation to understand its development 
# To obtain these files: launch sum_hills with plumed using as input your HILLS file.
# An example is found below. We used the following line to generate the files found in data/post-processing/fes
# 
# plumed sum_hills --hills HILLS_800ns --mintozero --stride 80000
# We generated 10 files that go from fes0.dat to fes9.dat... Therefore, the range in this loop goes from [0 to 10).
# Adjust the range as neccessary for your amount of fes files

# Select the range of the x and y axis:
x_min, x_max = 0, 3.5
y_min, y_max = 0, 70
stride = 80000

for i in range(10):
  data = np.loadtxt('./data/post-processing/fes/fes_{0}.dat'.format(i), comments='#', unpack=True)
  plt.plot(data[0], data[1], label="{0}ns".format(int((i+1)*stride/1000)))
plt.axis([x_min,x_max,y_min,y_max])
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('CV1')
plt.legend()
plt.show()

# PLOT A 1D or a 2D FES
############################################
# Select here plot type ('simple' for a 1D plot, 'surface' for a 3D surface, or 'contour' for a 2D contour-plot)
# Select 'simple' to plot the fes as a function of 1 collective variable (CV)
# Select 'surface' or 'contour' to plot the fes a a function of 2 cv
plot_type = 'contour'
############################################
# Minimum value of the free energy (if you used --mintozero option in the sum_hills command then set to 0)
min_fes = 0
# Maximum value of the free energy in the explored phase space 
# here it is recommended to select a value that is around 5 kJ/mol more from your highest energy region of interest
max_fes = 70
# Contour plot resolution, default kJ/mol
contour_stride = 5

#location of the data file
data = np.loadtxt("./data/post-processing/2D-distance.dat", comments="#", unpack=True)
nbins_cv1 = int(np.loadtxt("./data/post-processing/2D-distance.dat", unpack=True,
                       skiprows=3, max_rows=1, comments=None, dtype=str, usecols=(3)))
cv1 = data[0]

if plot_type == "surface" or plot_type == "contour":
    cv2 = data[1]
    energy = data[2]
    e_lims = np.reshape(energy, (-1, nbins_cv1))
    cv1_lims = np.linspace(np.min(cv1), np.max(cv1), e_lims.shape[1])
    cv2_lims = np.linspace(np.min(cv2), np.max(cv2), e_lims.shape[0])
else:
    energy = data[1]
    e_lims = energy

if plot_type == "surface":
    fig = go.Figure(data=[go.Surface(z=e_lims, x=cv1_lims, y=cv2_lims, cmin=min_fes, cmax=max_fes)])
    fig.update_layout(
    scene = dict(zaxis = dict(nticks=4, range=[min_fes,max_fes],),))
    fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True, size=0.05))
    fig.show()

elif plot_type == "contour":
    fig = go.Figure(data=go.Contour(z=e_lims, x=cv1_lims, y=cv2_lims,
                                    contours=dict(start=min_fes, end=max_fes, size=contour_stride),
                                    colorbar=dict(title='Energy [kJ/mol]')))
    fig.update_layout(xaxis_title='CV1', yaxis_title='CV2')
    fig.show()

else:
    plt.plot(cv1, energy)
    plt.show()

# This cell performs frame selections that are within a region of the 1D or 2D CV space from your simulation trajectory 
# Very Important! The trajectory and the COLVAR file MUST have the same amount of data points (i.e, same stride)
# Switch mode to '2D' if you want to select frames that lie in between 2D CV values
mode = '1D'

# Load the trajectory and topology
# In this case we upload an initial coordinate file .pdb, and a trajectoty file .trr
# These two files must have the same amount of atoms
# Please check MDAnalysis.Universe documentation to know which type of files can be used for input and output.
u = md.Universe("data/post-processing/start.pdb", "data/post-processing/trj.trr")

# Load the COLVAR file
colvar_data = np.loadtxt("data/post-processing/COLVAR", comments='#', unpack=True)

# Define the selection criteria
# Example: Extract frames where cv1 is between [0.36, 0.42] and cv2 is between [2.19, 2.95]
# If you select 1D computation ignore the cv2 ranges
cv1_min, cv1_max = 0.36, 0.42
cv2_min, cv2_max = 2.19,2.95

# Get the CV values from the COLVAR file
# The column index for your first and second (if used) CV as saved in the COLVAR file. 
# (remember that python index start from 0)
cv1_column = 1
cv2_column = 2

cv1 = colvar_data[:, cv1_column] 
if mode == '2D':
  cv2 = colvar_data[:, cv2_column]  
    
selection_indices = np.where((cv1 > cv1_min) & (cv1 < cv1_max))
if mode == '2D':
  selection_indices = np.where((cv1 > cv1_min) & (cv1 < cv1_max) & (cv2 > cv2_min) & (cv2 < cv2_max))

for i in selection_indices[0]:
  u.trajectory[i]
  with md.Writer("extracted_frames.xtc", u.atoms.n_atoms) as W:
    W.write(u)

/Users/omar/opt/anaconda3/lib/python3.9/site-packages/MDAnalysis/topology/PDBParser.py:317: UserWarning:

Element information is missing, elements attribute will not be populated. If needed these can be guessed using MDAnalysis.topology.guessers.

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[7], line 10
      4 mode = '1D'
      6 # Load the trajectory and topology
      7 # In this case we upload an initial coordinate file .pdb, and a trajectoty file .trr
      8 # These two files must have the same amount of atoms
      9 # Please check MDAnalysis.Universe documentation to know which type of files can be used for input and output.
---> 10 u = md.Universe("data/post-processing/start.pdb", "data/post-processing/trj.trr")
     12 # Load the COLVAR file
     13 colvar_data = np.loadtxt("data/post-processing/COLVAR", comments='#', unpack=True)

File ~/opt/anaconda3/lib/python3.9/site-packages/MDAnalysis/core/universe.py:365, in Universe.__init__(self, topology, all_coordinates, format, topology_format, transformations, guess_bonds, vdwradii, in_memory, in_memory_step, *coordinates, **kwargs)
    360 coordinates = _resolve_coordinates(self.filename, *coordinates,
    361                                    format=format,
    362                                    all_coordinates=all_coordinates)
    364 if coordinates:
--> 365     self.load_new(coordinates, format=format, in_memory=in_memory,
    366                 in_memory_step=in_memory_step, **kwargs)
    368 if transformations:
    369     if callable(transformations):

File ~/opt/anaconda3/lib/python3.9/site-packages/MDAnalysis/core/universe.py:565, in Universe.load_new(self, filename, format, in_memory, in_memory_step, **kwargs)
    562 # supply number of atoms for readers that cannot do it for themselves
    563 kwargs['n_atoms'] = self.atoms.n_atoms
--> 565 self.trajectory = reader(filename, format=format, **kwargs)
    566 if self.trajectory.n_atoms != len(self.atoms):
    567     raise ValueError("The topology and {form} trajectory files don't"
    568                      " have the same number of atoms!\n"
    569                      "Topology number of atoms {top_n_atoms}\n"
   (...)
    573                          fname=filename,
    574                          trj_n_atoms=self.trajectory.n_atoms))

File ~/opt/anaconda3/lib/python3.9/site-packages/MDAnalysis/coordinates/XDR.py:140, in XDRBaseReader.__init__(self, filename, convert_units, sub, refresh_offsets, **kwargs)
    120 """
    121 Parameters
    122 ----------
   (...)
    135 
    136 """
    137 super(XDRBaseReader, self).__init__(filename,
    138                                     convert_units=convert_units,
    139                                     **kwargs)
--> 140 self._xdr = self._file(self.filename)
    142 self._sub = sub
    143 if self._sub is not None:

File MDAnalysis/lib/formats/libmdaxdr.pyx:179, in MDAnalysis.lib.formats.libmdaxdr._XDRFile.__cinit__()

File MDAnalysis/lib/formats/libmdaxdr.pyx:236, in MDAnalysis.lib.formats.libmdaxdr._XDRFile.open()

OSError: File does not exist: b'data/post-processing/trj.trr'

# Here we plot the estimate of the difference in Gibbs free energy (ΔG) between the unbound an bound states.
# This is done by solving a discrete version of equation [15] in Limongelli et al. publication (10.1073/pnas.1303186110)
# In this code, the integral is solved using the trapezoidal approximation

# ref should be a point that represents the unbound state iso-energetic region. It is used as a reference value.
# In this case we give as input 2.75 nano-meters
ref = 2.75

# Select if you want to calculate the ΔG using '1D' or '2D' fes
mode == '1D'

# Define the selection criteria of where would you like to compute the delta in relation to the reference value
# Example: Compute the ΔG for the region of cv1 between [0.36, 1.5] and cv2 between [2.19, 2.95]
# If you select a '1D' computation ignore the cv2 ranges
cv1_min, cv1_max = 0.36, 0.42
cv2_min, cv2_max = 2.19, 2.95

Gt = []
beta = (8.314*10**(-3)*300)**-1
# Select your range of fes data files to process in this estimate
# In our case, we have 10 fes files there fore our range goes from [0 to 11)
for j in range(0,11):
    data = np.loadtxt("data/post-processing/fes/fes_{0}.dat".format(j), comments="#", unpack=True)
    cv1 = data[0]
    dcv1 = cv1[1] - cv1[0]
    energy = data[1]
    if mode == '2D':
      cv2 = data[1]
      dcv2 = cv2[nbins_cv1+1] - cv2[0]
      energy = data[2]

    if mode == '2D':
      x = np.where((cv1_min <= cv1) & (cv1 <= cv1_max) & (cv2 >= cv2_min) & (cv2 <= cv2_max))
    else:
      x = np.where((0.36 <= cv1_min) & (cv1 <= cv1_max))

    fez = [energy[i] for i in x[0]]

    fez = np.array(fez)
    #For 2D you might want to adapt the code to allow "ref" to take a 2D interval instead
    w = -beta*(fez - energy[np.where(abs(data[0]-ref) < dcv1/2)])
    mid = [np.exp(w[0])]
    for i in range(1, len(w)-1):
        mid.append(2*np.exp(w[i]))
    mid.append(np.exp(w[-1]))
    mid = (dcv1/2)*np.sum(np.array(mid))
    if mode == '2D':
      mid *= dcv2/2
    kb = np.pi*(0.1**2)*mid
    G = -(1/beta)*np.log(0.6*kb)
    Gt.append(G*0.239)
plt.plot(Gt, label='ΔG(t)')
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('Simulation time')
plt.legend()
plt.show()

# This cell computes a weighted average-on-the-fly of the ΔG(t) estimate obtained in the previous cell
# This weighted average gives higher weights to data points the closer they are to the end of the simulation
# The idea for this is that well-tempered metadynamics analyitically converges to the correct free energy surface as t-->inf


# Select from which data point to start to compute the weighted average
# ideally you want to select a data point after which your ΔG(t) estimate begins to plateau
init_time = 4


avgG = []
rej_time = init_time + 1
for i in range(rej_time, 12):
    subGt = Gt[rej_time-1:i]
    w = np.zeros_like(subGt)
    for j in range(1, len(subGt) + 1):
        w[j - 1] = j / len(subGt)
    wsubGt = w * np.array(subGt)
    avgG.append(np.sum(wsubGt) / np.sum(w))
plt.plot(np.arange(rej_time-1,11), avgG, label="Average G(t)")
plt.plot(Gt, label='G(t)')
plt.ylabel('Energy [kJ/mol]')
plt.xlabel('Simulation time')
plt.legend()
plt.show()

# This cell performs a statistical analysis on the ΔG(t) estimate called Bootstrap
# The idea is that is will randomly sample points from exclusive sections on the ΔG(t) estimate
# These randomly sampled points, when done sufficiently for a large enough data set, should behave like a Normal distribution ~N(μ,σ)
# The best way to use this anlysis is to sample from sections of the ΔG(t) estimate in which it begins to plateau
# The normal distributions obtained should have similar μ and a vanishing σ the closer it is to the end of the simulation 

# Important! In this example we perform the Bootstrap analysis for only 10 data points.
# Therfore, we do not have a large enough data set that represents a good bootstrap analysis.
# If you wish to use this method for a real case scenario, you should use many more data points (i.e, more fes.dat files)

# Number of gaussian distributions that you wish to obtain or equivalently: number of slices to analyze from the ΔG(t) estimate
slices = 5
#Initial data point from which to calculate the bootstrap
init_t = 0
# number of times the method will sample from the slice (with replacement). Generally 1000 is a good number
n_samples = 1000


data_dict = {}
Gt = Gt[init_t:]
bootstraps = np.zeros((slices, n_samples))
Gt = np.array(Gt)
for i in range(1, slices+1):
    ranges = np.arange(int((len(Gt)/slices)*(i-1)), int((len(Gt)/slices)*i))
    for j in range(n_samples):
        indices = np.random.choice(ranges, size=1000, replace=True)
        bootstraps[i - 1][j] = (np.average(Gt[indices]))
for i in range(slices):
  print("Slice {0} has a distribution with mu and sigma: {1:.2f}, {2:.2f}".format(i, np.average(bootstraps[i]), np.std(bootstraps[i])))
  data_dict.update({'Slice {0}'.format(i) : bootstraps[i]})

fig, axes = plt.subplots(len(data_dict), 1, sharex=True, sharey=False, figsize=(20,40))
for i, (name, data) in enumerate(data_dict.items()):
    sns.kdeplot(data, ax=axes[i], fill=True, common_norm=False, legend=False, color=plt.cm.viridis(i / len(data_dict)))
    axes[i].set_ylabel(name)
plt.xlabel("ΔG [kCal/mol")
plt.suptitle("Bootstrap Analysis Distributions")
plt.show()

Slice 0 has a distribution with mu and sigma: -9.35, 0.05
Slice 1 has a distribution with mu and sigma: -7.15, 0.02
Slice 2 has a distribution with mu and sigma: -8.54, 0.03
Slice 3 has a distribution with mu and sigma: -8.31, 0.01
Slice 4 has a distribution with mu and sigma: -8.38, 0.02