Skip to content

Commit

Permalink
restored previous trajectory data
Browse files Browse the repository at this point in the history
  • Loading branch information
Olivier committed Oct 19, 2023
2 parents cdbdee0 + 3d49215 commit bdad3e9
Show file tree
Hide file tree
Showing 11 changed files with 257 additions and 20 deletions.
237 changes: 237 additions & 0 deletions src/compare_essential_dynamics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
#!/usr/bin/env python3
'''
Combines several trajectories before performing PCA
Some PCA material is borrowed from 'http://mdtraj.org/latest/examples/pca.html'
'''
import sys
import argparse
import numpy as np
import mdtraj as md
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib
import seaborn as sns
matplotlib.use('Agg')
from matplotlib import pyplot as plt

__author__ = "Olivier Sheik Amamuddy"
__copyright__ = "Copyright 2019, Research Unit in Bioinformatics"
__license__ = "GNU GPL 3.0"
__version__ = "1.2"
__maintainer__ = "Olivier Sheik Amamuddy"
__email__ = "oliserand@gmail.com"
__date__ = "24th May 2021"

def parse_args():
"""
The main argument parser
"""
parser = argparse.ArgumentParser(description="Combines several trajectories\
and aligns them before performing a single PCA decomposition on the\
entire array. One important requirement is that the data used\
should be comparable.")
parser.add_argument('--topologies', '-s', type=str, nargs='+', required=True,
help="A list of topology file names")
parser.add_argument('--trajectories', '-f', type=str, nargs='+', required=True,
help="The corresponding trajectory file names, in the \
same order as the topologies")
parser.add_argument('--selection', type=str,
default="all",
help="Atom selection (Use MDTraj selection DSL) \
(default='all'). Note: By default, only CA \
atoms are used. This is usually most applicable when \
specifying residue(s) and/or chain ID(s). Selection is \
performed after having done a global fitting of the \
entire structure.")
parser.add_argument('--stride', type=int, default=1,
help="Step size through each trajectory (default=1)")
parser.add_argument('--suffix', type=str, default="pcafig",
help="File name suffix for the figures")
parser.add_argument('--n_clusters', type=int, default=3,
help="The expected number of protein clusters to\
extract.")
parser.add_argument('--ignn', type=int, default=0,
help="The number of N-terminus residues to ignore in \
PCA calculations (default=0)")
parser.add_argument('--ignc', type=int, default=3,
help="The number of C-terminus residues to ignore in \
PCA calculations (default=3)")
args = parser.parse_args()
return args

def check_compatibility(num_frames):
"""Check if trajectories are compatible given this list of frame numbers"""
assert isinstance(num_frames, list), "A list has to be provided"
if len(num_frames) > 1:
if len(set(num_frames)) > 1:
raise ValueError("Trajectory length mismatch. Ensure all trajectories "
"have the same number of frames.")
sys.exit(1)

def clean_protein_termini(traj, selection="name CA", ignn=0, ignc=0):
"""
Reduces protein to CA atoms and removes the C and N termini in each chain
Input
traj: A trajectory
selection: The selection to use. The default parameter will usually be the
best option for getting protein atoms.
ignn: Number of N-terminus residues to ignore
ignc: Number of C-terminus residues to ignore
Returns
cleaned trajectory
"""
atoms = traj.top.select(selection)
traj = traj.atom_slice(atoms)
chains = traj.top.chains
atom_indices = []
for chain in chains:
atoms = chain.atoms
index = [a.index for a in atoms]
index = index[ignn:(len(index)-ignc)]
atom_indices += index
traj.atom_slice(atom_indices, inplace=True)
return traj

def get_kmean_centroids(pcs, traj, outbasename, n_clusters=3,
n_init=50, max_iter=1000, marker_size=0.5,
marker="o"):
"""
Uses partitioning, before extracting a defined number of representative\
conformations for each partition
"""
coords = pcs[:,:2]
kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter)
kmeans.fit(coords)
centroids = kmeans.cluster_centers_
# Plot
plt.scatter(pcs[:, 0], pcs[:, 1], s=marker_size, c=traj.time, cmap="viridis")
cbar = plt.colorbar()
cbar.set_label('Time (ps)')
# Get closest points to each centroid
for centroid in centroids:
displacements = coords - centroid
position = np.linalg.norm(displacements, axis=1).argmin()
nearest_neighbor = coords[position]
conformation_time = traj.time[position]
plt.text(nearest_neighbor[0], nearest_neighbor[1],
s=int(conformation_time), c="red", fontsize=8,
bbox=dict(facecolor='white', pad=0.5, alpha=0.5, lw=0.2))
plt.scatter(nearest_neighbor[0], nearest_neighbor[1], marker=marker,
edgecolor="red", facecolors="none")
# Dump structure
currtraj = traj[position]
currtraj.save_pdb("{}_kmeans_centroid_{}ps.pdb".format(outbasename, int(currtraj.time[0])))
return centroids

def get_energy_basins(pcs, traj, outbasename, marker="o",
kde_levels=30, plt=plt):
"""
Searches the landscape, and plots the lowest energy basins
Input:
pcs: Array from PCA
timelabels: Time (array) obtained from the mdtraj object
marker: plot character for showing the location of the basin
kde_levels: Number of levels for 2D landscape
Returns:
The basin centroids
"""
subplot = sns.kdeplot(x=pcs[:, 0], y=pcs[:, 1], fill=True, alpha=0.2,
levels=kde_levels, cmap="jet_r")
basin_paths = subplot.collections[-1].get_paths()
basins = [basin.vertices for basin in basin_paths]
basin_centroids = [basin.mean(axis=0) for basin in basins]
# Search basin for nearest nearest neighbor
coords = pcs[:,:2]
for basin_centroid in basin_centroids:
displacements = coords - basin_centroid
position = np.linalg.norm(displacements, axis=1).argmin()
nearest_neighbor = coords[position]
conformation_time = traj.time[position]
plt.text(nearest_neighbor[0], nearest_neighbor[1],
s=int(conformation_time), c="blue", fontsize=8,
bbox=dict(facecolor='white', pad=0.5, alpha=0.5, lw=0.2))
plt.scatter(nearest_neighbor[0], nearest_neighbor[1], marker=marker,
edgecolor="black", facecolors="none")
currtraj = traj[position]
currtraj.save_pdb("{}_basin_{}ps.pdb".format(outbasename, int(currtraj.time[0])))
return basin_centroids

def plot_graphs(pcs, outbasename, traj, title="Essential dynamics plot",
xlim=None, ylim=None, percent_variance=None,
n_clusters=3):
"""
Generates scatter plots for PCA
"""
fig = plt.figure(dpi=500)
plt.xlim(xlim)
plt.ylim(ylim)
# Search basins and get KDE samples
get_energy_basins(pcs, traj=traj, outbasename=outbasename)
get_kmean_centroids(pcs, traj=traj, outbasename=outbasename,
n_clusters=n_clusters)
plt.title(title)
plt.xlabel("PC1 ({:3.2f}%)".format(percent_variance[0]))
plt.ylabel("PC2 ({:3.2f}%)".format(percent_variance[1]))
plt.tight_layout()
plt.savefig("{}.png".format(outbasename))

def main(args):
"""
Program main
"""
sns.set_style("whitegrid")
plt.rcParams["font.family"] = 'serif'
pca = PCA()
ntrajectories = len(args.trajectories)
num_frames = []
all_frames = []
all_times = []
topologies = args.topologies
trajectories = args.trajectories
n_clusters = args.n_clusters
# Set a reference frame
ref_frame = md.load(args.trajectories[0], top=args.topologies[0])
ref_frame = clean_protein_termini(ref_frame, ignn=args.ignn, ignc=args.ignc)
# Read and align trajectories
for i, topology in enumerate(topologies):
trajectory_filename = trajectories[i]
traj = md.load(trajectory_filename, top=topology)
traj = clean_protein_termini(traj, ignn=args.ignn, ignc=args.ignc)
traj = traj[::args.stride]
traj = traj.superpose(ref_frame, 0)
# Post alignment selection
if args.selection != "all":
traj = traj.atom_slice(traj.top.select(args.selection))
xyz_reshaped = traj.xyz.reshape(traj.n_frames, traj.n_atoms*3)
all_frames.append(xyz_reshaped)
all_times.append(traj.time)
num_frames.append(traj.n_frames)
check_compatibility(num_frames)
xyz_concat = np.concatenate(all_frames)
pca.fit(xyz_concat)
percent_variance = pca.explained_variance_ratio_[:2]*100
pcs = pca.transform(xyz_concat)
# Setting plot limits
xmin = np.min(pcs[:, 0])
xmax = np.max(pcs[:, 0])
ymin = np.min(pcs[:, 1])
ymax = np.max(pcs[:, 1])
# Iterate in chunks of trajectory length (time)
increment = int(pcs.shape[0]/ntrajectories)
for idx, start in enumerate(range(0, pcs.shape[0], increment)):
end = start+increment
# Slice PCs for current trajectory
pcs_currtraj = pcs[start:end, :]
# Get cluster representatives
outbasename = args.trajectories[idx][:-4]
# Reload full trajectory
traj = md.load(args.trajectories[idx],
top=args.topologies[idx])[::args.stride]
# Plotting
plot_graphs(pcs=pcs_currtraj, traj=traj, outbasename=outbasename,
xlim=(xmin, xmax), ylim=(ymin, ymax),
percent_variance=percent_variance, n_clusters=n_clusters)

if __name__ == "__main__":
ARGS = parse_args()
main(ARGS)
4 changes: 2 additions & 2 deletions tests/test_anm.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

cd ..
cd ../src

g++ -I cpp/src/ ANM.cpp -o ANM

./ANM --pdb output/ComplexCG4.pdb --cutoff 24 --atomType CB
./ANM --pdb ../output/ComplexCG4.pdb --cutoff 24 --atomType CB
4 changes: 2 additions & 2 deletions tests/test_anm_wrapper.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./ANM.py --pdb output/ComplexCG4.pdb --cutoff 24 --atomType CB
./ANM.py --pdb ../output/ComplexCG4.pdb --cutoff 24 --atomType CB
4 changes: 2 additions & 2 deletions tests/test_cg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ do
prot=${line}
done <${file}

cd ..
cd ../src

./coarseGrain.py --pdb tests/${prot}.pdb --cg 4 --atomType CB
./coarseGrain.py --pdb ../tests/${prot}.pdb --cg 4 --atomType CB
4 changes: 2 additions & 2 deletions tests/test_conformation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ do
prot=${line}
done <${file}

cd ..
cd ../src

./conformationMode.py --pdbConf output/ComplexCG4.pdb --pdbANM output/ComplexCG4.pdb --vtMatrix output/VT_values.txt --atomType CB
./conformationMode.py --pdbConf ../output/ComplexCG4.pdb --pdbANM ../output/ComplexCG4.pdb --vtMatrix ../output/VT_values.txt --atomType CB
4 changes: 2 additions & 2 deletions tests/test_internal_pca.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./internal_pca.py --trj tests/pca_test_trj.xtc --top tests/complex.pdb
./internal_pca.py --trj ../tests/pca_test_trj.xtc --top ../tests/complex.pdb
4 changes: 2 additions & 2 deletions tests/test_mds.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./mds.py --trj tests/pca_test_trj.xtc --top tests/complex.pdb
./mds.py --trj ../tests/pca_test_trj.xtc --top ../tests/complex.pdb
4 changes: 2 additions & 2 deletions tests/test_msf.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./meanSquareFluctuations.py --pdb output/ComplexCG4.pdb --wMatrix output/W_values.txt --vtMatrix output/VT_values.txt --atomType CB
./meanSquareFluctuations.py --pdb ../output/ComplexCG4.pdb --wMatrix ../output/W_values.txt --vtMatrix ../output/VT_values.txt --atomType CB
4 changes: 2 additions & 2 deletions tests/test_pca.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./pca.py --trj tests/pca_test_trj.xtc --top tests/complex.pdb
./pca.py --trj ../tests/pca_test_trj.xtc --top ../tests/complex.pdb
4 changes: 2 additions & 2 deletions tests/test_trajectory.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

cd ..
cd ../src

./visualiseVector.py --pdb output/ComplexCG4.pdb --atomType CB --vtMatrix output/VT_values.txt --mode 7
./visualiseVector.py --pdb ../output/ComplexCG4.pdb --atomType CB --vtMatrix ../output/VT_values.txt --mode 7

4 changes: 2 additions & 2 deletions tests/test_tsne.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

cd ..
cd ../src

./tsne.py -t tests/pca_test_trj.xtc -p tests/complex.pdb -ag CA -dt rmsd
./tsne.py -t ../tests/pca_test_trj.xtc -p ../tests/complex.pdb -ag CA -dt rmsd

0 comments on commit bdad3e9

Please sign in to comment.