new major revision of quickMD-nf workflow
JLittlef committed Nov 22, 2024
1 parent 022d936 commit be3eb10
Showing 38 changed files with 224,929 additions and 21 deletions.
15 changes: 14 additions & 1 deletion .gitignore
@@ -4,4 +4,17 @@ logs/
.DS_Store
*.pyc
.vscode
.nextflow*
results_2024-11-22/
results-4.0_2024-11-07/
results-4.0_2024-11-08_75K/
results-4.0_2024-11-08_old/
results-4.0_2024-11-09/
results-4.0_2024-11-13/
results-4.0_2024-11-14/
results-4.0_2024-11-15/
results-4.0pH_2024-08-23/
results-4.0pH_2024-10-28/
results-7.4pH_2024-09-02/
results-7.4pH_2024-10-25/
results-old/
35 changes: 35 additions & 0 deletions bin/esmfold_pdbgen.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""esmfold_pdbgen
Convert FASTA file sequences to PDB files using ESMFold
Usage:
    esmfold_pdbgen.py [--i=<fasta>]
Options:
    --i=<fasta>      Input FASTA file containing the protein sequence
"""
import logging
from docopt import docopt
import torch
from esm import pretrained


def main():
    arguments = docopt(__doc__, version='esmfold_pdbgen.py')
    # Read the sequence from the FASTA file, skipping header lines
    with open(arguments['--i']) as fasta:
        sequence = "".join(line.strip() for line in fasta if not line.startswith(">"))
    # Load the ESMFold model (esmfold_v0 returns the model directly)
    model = pretrained.esmfold_v0()
    model = model.eval().cuda() if torch.cuda.is_available() else model.eval()
    # infer_pdb takes a raw sequence string and returns a PDB-format string
    with torch.no_grad():
        pdb_string = model.infer_pdb(sequence)
    # Write the predicted structure to a file
    with open("output.pdb", "w") as pdbfile:
        pdbfile.write(pdb_string)
    logging.info("PDB structure saved to output.pdb")


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
110 changes: 110 additions & 0 deletions bin/interchain_pairs.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""interchain_pairs
Find the closest atom pairs at the interchain interface as evidence of protein secondary structure
Usage:
    interchain_pairs.py [--inpdb=<pdb>]
Options:
    --inpdb=<pdb>      Input PDB file of the protein, as obtained from the previous process
"""
import logging
from docopt import docopt
import MDAnalysis as mda
from MDAnalysis import transformations
import pandas as pd
from biopandas.pdb import PandasPdb
import numpy as np

def average_trajectory(pdb: str, pdbout: str):
    # Load the structure and trajectory
    u = mda.Universe(pdb)
    ag = u.atoms
    # Set box dimensions, unwrap molecules and centre the protein before averaging
    new_dimensions = [117.0, 117.0, 117.0, 90, 90, 90]
    set_dim = transformations.boxdimensions.set_dimensions(new_dimensions)
    transform = transformations.unwrap(ag)
    center = transformations.center_in_box(ag.select_atoms('protein'), wrap=True)
    u.trajectory.add_transformations(set_dim, transform, center)
    protein = u.select_atoms("protein")
    # Create a new Universe with the protein topology only
    avg_universe = mda.Merge(protein)
    avg_universe.add_TopologyAttr('tempfactors')
    avg_coordinates = np.zeros_like(avg_universe.atoms.positions)
    # Loop over frames, summing up coordinates
    for ts in u.trajectory:
        avg_coordinates += protein.positions
    # Compute the average over all frames
    avg_coordinates /= len(u.trajectory)
    logging.info("Averaged over %d frames", len(u.trajectory))
    # Assign average coordinates back to avg_universe
    avg_universe.atoms.positions = avg_coordinates
    # Write the average structure to a PDB file
    avg_universe.atoms.write(pdbout)


def get_contact_atoms(pdbout: str, threshold: float):
    # Read PDB data into a pandas dataframe (ATOM records only)
    pdb_data = PandasPdb().read_pdb(pdbout)
    pdb_df = pdb_data.df['ATOM']
    pdb_df = pdb_df.dropna(subset=['residue_number'])
    # Coordinate columns, chain IDs and atom name used to refine the dataframe
    coord_names = ['x_coord', 'y_coord', 'z_coord']
    chain1 = "A"
    chain2 = "B"
    calpha = "CA"
    # Separate the C-alpha atoms of each chain into their own dataframes
    df1 = pdb_df[(pdb_df['chain_id'] == chain1) & (pdb_df['atom_name'] == calpha)]
    df2 = pdb_df[(pdb_df['chain_id'] == chain2) & (pdb_df['atom_name'] == calpha)]
    # Extract coordinates to numpy arrays
    coords1 = df1[coord_names].to_numpy()
    coords2 = df2[coord_names].to_numpy()
    # Calculate all interchain distances via broadcasting: (n, 1, 3) - (m, 3) -> (n, m, 3)
    dist_matrix = np.sqrt(((coords1[:, None] - coords2) ** 2).sum(axis=2))
    # Index pairs of atoms whose distance is below the threshold
    pairs = np.argwhere(dist_matrix < threshold)
    logging.info("Pairs: %s", pairs.shape)
    # Identify chain and residue of atom pairs within the distance threshold
    atoms1, atoms2 = df1.iloc[pairs[:, 0]], df2.iloc[pairs[:, 1]]
    distances = dist_matrix[pairs[:, 0], pairs[:, 1]]
    # Build "chain:resname:resnum" identifiers for each atom in a pair
    atoms1_id = atoms1['chain_id'].map(str) + ":" + atoms1['residue_name'].map(str) + ":" + atoms1['residue_number'].map(str)
    atoms2_id = atoms2['chain_id'].map(str) + ":" + atoms2['residue_name'].map(str) + ":" + atoms2['residue_number'].map(str)
    node_pairs = np.vstack((atoms1_id.values, atoms2_id.values, distances)).T
    # Also return the unique contact atoms from each chain
    result = pd.concat([df1.iloc[np.unique(pairs[:, 0])], df2.iloc[np.unique(pairs[:, 1])]])
    return node_pairs, result

def main():
    arguments = docopt(__doc__, version='interchain_pairs.py')
    pdb = arguments['--inpdb']
    pdbstem = pdb.replace(".pdb", "")
    pdbout = pdbstem + "_average.pdb"
    csvout = pdbstem + "_interchain_pairs.csv"
    average_trajectory(pdb, pdbout)
    threshold = 15.0
    node_pairs, _contact_atoms = get_contact_atoms(pdbout, threshold)
    node_pairs_df = pd.DataFrame(node_pairs, columns=['Atom1', 'Atom2', 'Distance'])
    node_pairs_df.to_csv(csvout)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
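The distance calculation relies on NumPy broadcasting: coords1[:, None] has shape (n, 1, 3), so subtracting the (m, 3) array coords2 yields an (n, m, 3) difference tensor, and summing the squares over the last axis gives the full n-by-m distance matrix without an explicit loop. A self-contained toy example (coordinates invented for illustration):

import numpy as np

coords1 = np.array([[0.0, 0.0, 0.0], [3.0, 4.0, 0.0]])                    # 2 atoms in chain A
coords2 = np.array([[0.0, 0.0, 0.0], [6.0, 8.0, 0.0], [1.0, 0.0, 0.0]])   # 3 atoms in chain B
# (2, 1, 3) - (3, 3) broadcasts to (2, 3, 3); the result is a (2, 3) distance matrix
dist_matrix = np.sqrt(((coords1[:, None] - coords2) ** 2).sum(axis=2))
print(dist_matrix)                       # [[ 0. 10.  1.] [ 5.  5.  4.472...]]
print(np.argwhere(dist_matrix < 6.0))    # (row, col) index pairs closer than 6 Å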
111 changes: 111 additions & 0 deletions bin/mutant_creator.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""mutant_creator
Create variants of the wildtype PDB from a CSV file of named variants, each with its corresponding list of mutations
Usage:
    mutant_creator.py [--wtin=<wtin>] [--varlist=<varlist>] [--pH=<pH>]
Options:
    --wtin=<wtin>          Wildtype PDB file to be mutated
    --varlist=<varlist>    Input CSV file containing variants with mutation lists
    --pH=<pH>              pH at which to protonate the structures
"""
import logging
from docopt import docopt
from pdbfixer import PDBFixer
from openmm.app import PDBFile
import csv
from Bio.SeqUtils import seq3
from Bio import pairwise2


# find_mutations: for use when only sequence data is available
def find_mutations(ref_seq, mut_seq):
    alignments = pairwise2.align.globalxx(ref_seq, mut_seq)
    aligned_ref, aligned_mut = alignments[0][0], alignments[0][1]
    mutations = []
    # Positions are 1-based indices into the alignment
    for i, (r, m) in enumerate(zip(aligned_ref, aligned_mut)):
        if r != m:
            mutations.append(f"{r}{i+1}{m}")
    return mutations

# Convert a mutation string to a version readable by PDBFixer
def convert_mutation_to_pdbfixer(mutation):
    original_aa = mutation[0]
    position = mutation[1:-1]
    new_aa = mutation[-1]
    # Convert to three-letter codes
    original_aa_3letter = seq3(original_aa).upper()
    new_aa_3letter = seq3(new_aa).upper()
    # Format for PDBFixer, e.g. 'GLU-48-ASP'
    return f"{original_aa_3letter}-{position}-{new_aa_3letter}"

# Make sure the WT is in the correct format
def clean_wildtype(pdbname: str, pH: str, pdbout: str):
    pH_fl = float(pH)
    pdb = PDBFixer(pdbname)
    pdb.findMissingResidues()
    pdb.findNonstandardResidues()
    pdb.replaceNonstandardResidues()
    pdb.removeHeterogens(False)
    pdb.findMissingAtoms()
    pdb.addMissingAtoms()
    pdb.addMissingHydrogens(pH_fl)
    PDBFile.writeFile(pdb.topology, pdb.positions, open(pdbout, 'w'))
    return pdb

# Create variants, implementing the mutations across both chains
def create_mutants(pdbname: str, mutant: list, chain: list, pH: str, pdbout: str):
    pH_fl = float(pH)
    mutpdb = PDBFixer(pdbname)
    # Apply every mutation to every chain in turn
    for ch_list in chain:
        for mut_list in mutant:
            mutpdb.applyMutations([mut_list], ch_list)
    # Skip missing-residue detection; only repair atoms introduced by the mutations
    mutpdb.missingResidues = {}
    mutpdb.removeHeterogens(False)
    mutpdb.findMissingAtoms()
    mutpdb.addMissingAtoms()
    mutpdb.addMissingHydrogens(pH_fl)
    PDBFile.writeFile(mutpdb.topology, mutpdb.positions, open(pdbout, 'w'), keepIds=True)
    return mutpdb

def main():
    arguments = docopt(__doc__, version='mutant_creator.py')
    wt = "wildtype_centered.pdb"
    clean_wildtype(arguments['--wtin'], arguments['--pH'], wt)
    chain = ["A", "B"]
    # Uncomment the line below when only sequence data is available and mutations must be identified
    #wt_sequence = "wt sequence here"
    with open(arguments['--varlist'], 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # For sequence data, comment out the line below and uncomment the one after it
            stem = row['design_id']
            #stem = row['description']
            pdbout = stem + "_centered.pdb"
            # Uncomment the following two lines and comment out the two after them if sequence data only
            #mut_sequence = row['sequence']
            #filtered_list = find_mutations(wt_sequence, mut_sequence)
            mutation_list = row['mutations'].split(';')
            filtered_list = [mutation for mutation in mutation_list if mutation.lower() != 'c-terminal truncation']
            formatted_mutations = [convert_mutation_to_pdbfixer(mutation) for mutation in filtered_list]
            logging.info("%s %s", pdbout, ', '.join(formatted_mutations))
            create_mutants(wt, formatted_mutations, chain, arguments['--pH'], pdbout)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
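The mutation-string conversion is mechanical: a one-letter mutation such as E48D becomes PDBFixer's OLD-POSITION-NEW form. A quick illustration of the expected behaviour (the example mutations and CSV rows below are invented; the design_id and mutations column names match the code above):

from Bio.SeqUtils import seq3

def convert_mutation_to_pdbfixer(mutation):
    # "E48D" -> ("E", "48", "D") -> "GLU-48-ASP"
    return f"{seq3(mutation[0]).upper()}-{mutation[1:-1]}-{seq3(mutation[-1]).upper()}"

print(convert_mutation_to_pdbfixer("E48D"))   # GLU-48-ASP
print(convert_mutation_to_pdbfixer("K120A"))  # LYS-120-ALA

# Expected variant CSV layout (mutations separated by semicolons):
# design_id,mutations
# var1,E48D;K120A
# var2,Q15N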