-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
new major revision of quickMD-nf workflow
- Loading branch information
Showing
38 changed files
with
224,929 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python3 | ||
"""esmfold_pdbgen | ||
Converting FASTA file sequences to PDB files via use of ESMFold | ||
Usage: | ||
esmfold_pdbgen.py [--i=<fasta>] | ||
Options: | ||
--i=<fasta> Input fasta file containing protein sequence | ||
""" | ||
import logging | ||
from docopt import docopt | ||
import torch | ||
from esm import pretrained, FastaBatchedDataset | ||
|
||
# Sequence folded when no FASTA file is supplied on the command line.
DEFAULT_SEQUENCE = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRIA"
OUTPUT_PDB = "output.pdb"


def _read_first_fasta_sequence(fasta_path):
    """Return the residues of the first record in a FASTA file as one string.

    Header lines (starting with ``>``) are skipped, blank lines are ignored and
    the sequence lines of the first record are concatenated. Reading stops at
    the second header.

    Raises:
        ValueError: if the file contains no sequence lines.
    """
    residues = []
    seen_header = False
    with open(fasta_path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if seen_header or residues:
                    break  # start of the second record
                seen_header = True
                continue
            residues.append(line)
    if not residues:
        raise ValueError(f"No sequence found in FASTA file: {fasta_path}")
    return "".join(residues)


# Honour the --i option advertised in the module docstring; fall back to the
# built-in example sequence when it is not given.
arguments = docopt(__doc__, version='esmfold_pdbgen.py')
fasta_path = arguments['--i']
sequence = _read_first_fasta_sequence(fasta_path) if fasta_path else DEFAULT_SEQUENCE

# Load the ESMFold model. The loader returns a single model object, not a
# (model, alphabet) pair, so no batch converter is involved.
model = pretrained.esmfold_v0()
model = model.eval()
if torch.cuda.is_available():
    model = model.cuda()

# infer_pdb takes the raw amino-acid string and returns the PDB file contents.
with torch.no_grad():
    pdb_text = model.infer_pdb(sequence)

# Write the PDB structure to a file
with open(OUTPUT_PDB, "w") as pdbfile:
    pdbfile.write(pdb_text)

print("PDB structure saved to output.pdb")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/usr/bin/env python3 | ||
"""interchain_pairs | ||
Find closest atom pairs at interchain interface to evidence protein secondary structure | ||
Usage: | ||
interchain_pairs.py [--inpdb=<pdb>] | ||
Options: | ||
--inpdb=<pdb> Input PDB file of protein as obtained from previous process | ||
""" | ||
import logging | ||
from docopt import docopt | ||
import MDAnalysis as mda | ||
from MDAnalysis.analysis import align | ||
from MDAnalysis.analysis import rms | ||
from MDAnalysis import transformations | ||
import pandas as pd | ||
from biopandas.pdb import PandasPdb | ||
from typing import Optional, Tuple, List | ||
import numpy as np | ||
|
||
def average_trajectory(pdb: str, pdbout: str, box_dimensions: Optional[List[float]] = None):
    """Write the frame-averaged protein coordinates of a structure file to a PDB.

    The file is loaded as an MDAnalysis Universe. Every frame gets the given box
    dimensions, is unwrapped, and has its protein selection centred in the box.
    The protein positions are then averaged over all frames and written out.

    Args:
        pdb: Path of the input structure/trajectory file.
        pdbout: Path of the PDB file that receives the averaged protein atoms.
        box_dimensions: Box as ``[a, b, c, alpha, beta, gamma]``. Defaults to
            the previously hard-coded ``[117.0, 117.0, 117.0, 90, 90, 90]``.
    """
    if box_dimensions is None:
        box_dimensions = [117.0, 117.0, 117.0, 90, 90, 90]

    # Load the structure and trajectory
    u = mda.Universe(pdb)
    all_atoms = u.atoms

    # On-the-fly transformations, applied to every frame in this order.
    set_dim = transformations.boxdimensions.set_dimensions(box_dimensions)
    unwrap = transformations.unwrap(all_atoms)
    center = transformations.center_in_box(all_atoms.select_atoms('protein'), wrap=True)
    u.trajectory.add_transformations(set_dim, unwrap, center)

    protein = u.select_atoms("protein")
    # Stand-alone universe holding only the protein atoms; it receives the average.
    avg_universe = mda.Merge(protein)
    avg_universe.add_TopologyAttr('tempfactors')

    # Accumulate in float64 so summing many float32 frames does not lose precision.
    n_frames = len(u.trajectory)
    coord_sum = np.zeros(protein.positions.shape, dtype=np.float64)
    for _ in u.trajectory:
        coord_sum += protein.positions
    logging.info("Averaged %d frame(s) from %s", n_frames, pdb)

    # Assign average coordinates and write the average structure to a PDB file
    avg_universe.atoms.positions = coord_sum / n_frames
    avg_universe.atoms.write(pdbout)
|
||
|
||
def _residue_labels(frame):
    """Build ``chain:resname:resnum`` labels for every row of a PDB atom frame."""
    return (
        frame['chain_id'].map(str)
        + ":" + frame['residue_name'].map(str)
        + ":" + frame['residue_number'].map(str)
    )


def get_contact_atoms(pdbout: str, threshold: float, chain1: str = "A", chain2: str = "B"):
    """Find C-alpha pairs between two chains that lie closer than a threshold.

    Args:
        pdbout: Path of the PDB file to analyse.
        threshold: Distance cutoff; pairs with a distance strictly below it are kept.
        chain1: Chain identifier of the first chain (default ``"A"``).
        chain2: Chain identifier of the second chain (default ``"B"``).

    Returns:
        A tuple ``(node_pairs, result)``. ``node_pairs`` is an array with one row
        per pair holding the chain1 label, the chain2 label
        (``chain:resname:resnum``) and the distance. ``result`` is a DataFrame of
        the unique C-alpha rows from both chains that take part in any pair.
    """
    # Read the ATOM records into a pandas dataframe
    atom_df = PandasPdb().read_pdb(pdbout).df['ATOM']
    atom_df = atom_df.dropna(subset=['residue_number'])

    coord_names = ['x_coord', 'y_coord', 'z_coord']
    is_calpha = atom_df['atom_name'] == "CA"

    # One dataframe of C-alpha atoms per chain
    df1 = atom_df[(atom_df['chain_id'] == chain1) & is_calpha]
    df2 = atom_df[(atom_df['chain_id'] == chain2) & is_calpha]

    # Pairwise interchain distances via broadcasting: (n1, 1, 3) - (n2, 3)
    coords1 = df1[coord_names].to_numpy()
    coords2 = df2[coord_names].to_numpy()
    dist_matrix = np.sqrt(((coords1[:, None] - coords2) ** 2).sum(axis=2))

    # Index pairs (row in df1, row in df2) whose distance is below the threshold
    pairs = np.argwhere(dist_matrix < threshold)
    logging.debug("Found %d interchain pair(s) below %s", len(pairs), threshold)

    # Identify chain and residue of the atom pairs within the distance threshold
    atoms1, atoms2 = df1.iloc[pairs[:, 0]], df2.iloc[pairs[:, 1]]
    distances = dist_matrix[pairs[:, 0], pairs[:, 1]]
    atoms1_id = _residue_labels(atoms1)
    atoms2_id = _residue_labels(atoms2)
    node_pairs = np.vstack((atoms1_id.values, atoms2_id.values, distances)).T

    result = pd.concat([df1.iloc[np.unique(pairs[:, 0])], df2.iloc[np.unique(pairs[:, 1])]])
    return node_pairs, result
|
||
def main():
    """Average the input structure and write its interchain C-alpha pairs to CSV.

    Reads ``--inpdb``, writes ``<stem>_average.pdb`` with the averaged protein
    coordinates, then writes ``<stem>_interchain_pairs.csv`` with the pairs
    found below a cutoff of 15.0.
    """
    arguments = docopt(__doc__, version='interchain_pairs.py')
    pdb = arguments['--inpdb']
    if pdb is None:
        # The usage pattern marks --inpdb optional, but nothing can run without it.
        raise SystemExit("interchain_pairs.py: --inpdb=<pdb> is required")

    # Strip only a trailing ".pdb"; str.replace would also remove ".pdb"
    # occurring elsewhere in the path (e.g. in a directory name).
    suffix = ".pdb"
    pdbstem = pdb[:-len(suffix)] if pdb.endswith(suffix) else pdb
    pdbout = pdbstem + "_average.pdb"
    csvout = pdbstem + "_interchain_pairs.csv"

    average_trajectory(pdb, pdbout)
    threshold = 15.0
    node_pairs, _ = get_contact_atoms(pdbout, threshold)
    node_pairs_df = pd.DataFrame(node_pairs, columns=['Atom1', 'Atom2', 'Distance'])
    node_pairs_df.to_csv(csvout)
|
||
if __name__ == '__main__':
    # main() parses the command line itself, so no separate docopt call is made here.
    logging.getLogger().setLevel(logging.INFO)
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python3 | ||
"""mutant_creator | ||
Create variants to the wildtype PDB based on a CSV file containing a list of named variants with corresponding list of mutations for each one | ||
Usage: | ||
mutant_creator.py [--wtin=<wtin>] [--varlist=<varlist>] [--pH=<pH>] | ||
Options: | ||
--wtin=<wtin> Wildtype PDB file to be mutated | ||
--varlist=<varlist> Input CSV file containing variants with mutation lists | ||
--pH=<pH> Set pH to desired | ||
""" | ||
import logging | ||
from docopt import docopt | ||
from pdbfixer import PDBFixer | ||
from openmm.app import PDBFile | ||
import csv | ||
from Bio.SeqUtils import seq3 | ||
from Bio import pairwise2 | ||
from Bio.pairwise2 import format_alignment | ||
|
||
|
||
#function find_mutations for use when sequence data available only | ||
def find_mutations(ref_seq, mut_seq):
    """List point substitutions between a reference and a mutant sequence.

    Each substitution is returned as ``<ref residue><position><new residue>``
    (e.g. ``"E48D"``), where the position is the 1-based index in the
    *reference* sequence.

    Sequences of equal length are compared position by position. Sequences of
    different length are first globally aligned with gap penalties; columns
    containing a gap (insertions/deletions) cannot be expressed as point
    mutations and are skipped.

    Args:
        ref_seq: Reference (wildtype) one-letter amino-acid sequence.
        mut_seq: Mutant one-letter amino-acid sequence.

    Returns:
        List of mutation strings in reference order; empty if none are found.
    """
    if len(ref_seq) == len(mut_seq):
        aligned_ref, aligned_mut = ref_seq, mut_seq
    else:
        # Penalise mismatches and gaps (match=1, mismatch=-1, open=-5,
        # extend=-1). With the unpenalised globalxx scoring a substitution can
        # just as well be aligned as a pair of gaps.
        alignments = pairwise2.align.globalms(ref_seq, mut_seq, 1, -1, -5, -1)
        if not alignments:
            return []
        aligned_ref, aligned_mut = alignments[0][0], alignments[0][1]

    mutations = []
    skipped_indel_columns = 0
    ref_position = 0
    for ref_res, mut_res in zip(aligned_ref, aligned_mut):
        if ref_res != "-":
            # Only real reference residues advance the residue numbering.
            ref_position += 1
        if ref_res == "-" or mut_res == "-":
            skipped_indel_columns += 1
            continue
        if ref_res != mut_res:
            mutations.append(f"{ref_res}{ref_position}{mut_res}")

    if skipped_indel_columns:
        logging.warning(
            "Skipped %d insertion/deletion column(s) that are not point mutations",
            skipped_indel_columns,
        )
    return mutations
|
||
#convert mutations to a version readable by PDBFixer | ||
def convert_mutation_to_pdbfixer(mutation):
    """Convert a one-letter mutation such as ``E48D`` to PDBFixer's ``GLU-48-ASP``.

    The first character is taken as the original residue, the last character
    as the new residue, and everything in between as the residue position.
    Surrounding whitespace is ignored.

    Args:
        mutation: Mutation string in ``<aa><position><aa>`` form.

    Returns:
        The mutation as ``<AA3>-<position>-<AA3>`` with upper-case
        three-letter residue codes.

    Raises:
        ValueError: if the string is too short to hold two residues and a position.
    """
    # Tokens produced by splitting on ';' may carry surrounding spaces.
    mutation = mutation.strip()
    if len(mutation) < 3:
        raise ValueError(f"Malformed mutation {mutation!r}; expected e.g. 'E48D'")

    original_aa, position, new_aa = mutation[0], mutation[1:-1], mutation[-1]

    # Convert to three-letter codes
    original_aa_3letter = seq3(original_aa).upper()
    new_aa_3letter = seq3(new_aa).upper()

    # Format for PDBFixer, e.g., 'GLU-48-ASP'
    return f"{original_aa_3letter}-{position}-{new_aa_3letter}"
|
||
#Make sure WT is in the correct format | ||
def clean_wildtype(pdbname: str, pH: str, pdbout: str):
    """Repair the wildtype structure with PDBFixer and write it to ``pdbout``.

    Missing residues and non-standard residues are located, non-standard
    residues are replaced, heterogens (including water) are removed, missing
    atoms are added, and hydrogens are added for the given pH.

    Args:
        pdbname: Path of the wildtype PDB file to clean.
        pH: pH used when adding hydrogens; converted with ``float``.
        pdbout: Path of the cleaned PDB file to write.

    Returns:
        The ``PDBFixer`` instance holding the cleaned structure.
    """
    pH_fl = float(pH)
    pdb = PDBFixer(pdbname)
    pdb.findMissingResidues()
    pdb.findNonstandardResidues()
    pdb.replaceNonstandardResidues()
    pdb.removeHeterogens(False)  # False: do not keep water
    pdb.findMissingAtoms()
    pdb.addMissingAtoms()
    pdb.addMissingHydrogens(pH_fl)
    # Context manager so the output file is flushed and closed deterministically.
    with open(pdbout, 'w') as handle:
        PDBFile.writeFile(pdb.topology, pdb.positions, handle)
    return pdb
|
||
#create variants, implementing mutations across both chains | ||
def create_mutants(pdbname: str, mutant: list, chain: list, pH: str, pdbout: str):
    """Apply a list of mutations to every listed chain and write the variant PDB.

    Each mutation is applied individually to each chain. Afterwards heterogens
    (including water) are removed, missing atoms are added, and hydrogens are
    added for the given pH. No missing residues are rebuilt.

    Args:
        pdbname: Path of the PDB file to mutate.
        mutant: Mutations in PDBFixer format, e.g. ``['GLU-48-ASP']``.
        chain: Chain identifiers to which every mutation is applied.
        pH: pH used when adding hydrogens; converted with ``float``.
        pdbout: Path of the mutated PDB file to write.

    Returns:
        The ``PDBFixer`` instance holding the mutated structure.
    """
    pH_fl = float(pH)
    mutpdb = PDBFixer(pdbname)
    for chain_id in chain:
        for mutation in mutant:
            mutpdb.applyMutations([mutation], chain_id)
    # addMissingAtoms reads missingResidues; leave it empty so nothing is rebuilt.
    mutpdb.missingResidues = {}
    mutpdb.removeHeterogens(False)  # False: do not keep water
    mutpdb.findMissingAtoms()
    mutpdb.addMissingAtoms()
    mutpdb.addMissingHydrogens(pH_fl)
    # Context manager so the output file is flushed and closed deterministically.
    with open(pdbout, 'w') as handle:
        PDBFile.writeFile(mutpdb.topology, mutpdb.positions, handle, keepIds=True)
    return mutpdb
|
||
def main():
    """Clean the wildtype structure and build one mutated PDB per CSV row.

    The wildtype given by ``--wtin`` is cleaned into ``wildtype_centered.pdb``.
    For every row of the ``--varlist`` CSV, the ``;``-separated ``mutations``
    column is converted to PDBFixer format and applied to chains A and B of
    the cleaned wildtype, writing ``<design_id>_centered.pdb``.
    """
    arguments = docopt(__doc__, version='mutant_creator.py')
    wt = "wildtype_centered.pdb"
    clean_wildtype(arguments['--wtin'], arguments['--pH'], wt)
    chain = ["A", "B"]

    # NOTE: for sequence-only input, derive the mutations with
    # find_mutations(wt_sequence, row['sequence']) instead of reading the
    # 'mutations' column, and use row['description'] as the file stem.
    # newline='' is the csv-module recommended way to open CSV files.
    with open(arguments['--varlist'], 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            stem = row['design_id']
            pdbout = stem + "_centered.pdb"

            # Strip each entry so "A1G; C2D" parses, and drop empty entries
            # left by a trailing ';' or an empty cell.
            mutation_list = [entry.strip() for entry in row['mutations'].split(';')]
            filtered_list = [
                mutation for mutation in mutation_list
                if mutation and mutation.lower() != 'c-terminal truncation'
            ]
            formatted_mutations = [
                convert_mutation_to_pdbfixer(mutation) for mutation in filtered_list
            ]
            logging.info("%s %s", pdbout, ', '.join(formatted_mutations))
            create_mutants(wt, formatted_mutations, chain, arguments['--pH'], pdbout)
|
||
if __name__ == '__main__':
    # main() parses the command line itself, so no separate docopt call is made here.
    logging.getLogger().setLevel(logging.INFO)
    main()
Oops, something went wrong.