From 79328136fc983565ac212cd298c861253bd0b1a3 Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Mon, 18 Nov 2024 01:22:35 +0100 Subject: [PATCH] Fix homo-x-mer --- Boltz1.ipynb | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Boltz1.ipynb b/Boltz1.ipynb index 4ea1a898..a2df3385 100644 --- a/Boltz1.ipynb +++ b/Boltz1.ipynb @@ -6,7 +6,7 @@ "provenance": [], "machine_shape": "hm", "gpuType": "A100", - "authorship_tag": "ABX9TyP7Wu7UlNfSKmSCprzBFVVC", + "authorship_tag": "ABX9TyOQII3DEBf33DR8Jfw5yz60", "include_colab_link": true }, "kernelspec": { @@ -58,7 +58,7 @@ " return x + \"_\" + hashlib.sha1(y.encode()).hexdigest()[:5]\n", "\n", "# User inputs\n", - "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK' #@param {type:\"string\"}\n", + "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASKKK' #@param {type:\"string\"}\n", "#@markdown - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetro-oligomers). For example **PI...SK:PI...SK** for a homodimer\n", "ligand_input = 'N[C@@H](Cc1ccc(O)cc1)C(=O)O' #@param {type:\"string\"}\n", "#@markdown - Use `:` to specify multiple ligands as smile strings\n", @@ -104,17 +104,25 @@ "fasta_entries = []\n", "csv_entries = []\n", "chain_label_to_seq_id = {}\n", + "seq_to_seq_id = {}\n", + "seq_id_counter = 0 # Counter for unique sequences\n", "\n", "# Process protein sequences\n", - "for i, seq in enumerate(protein_sequences):\n", + "for seq in protein_sequences:\n", " seq = seq.strip()\n", " if not seq:\n", " continue # Skip empty sequences\n", " chain_label = next(chain_labels)\n", - " seq_id = f\"{jobname}_{i}\"\n", + " # Check if sequence has been seen before\n", + " if seq in seq_to_seq_id:\n", + " seq_id = seq_to_seq_id[seq]\n", + " else:\n", + " seq_id = f\"{jobname}_{seq_id_counter}\"\n", + " seq_to_seq_id[seq] = seq_id\n", + " seq_id_counter += 1\n", + " # For CSV file (for ColabFold), add only unique sequences\n", + " csv_entries.append((seq_id, seq))\n", " chain_label_to_seq_id[chain_label] = seq_id\n", - " # For CSV file (for ColabFold)\n", - " csv_entries.append((seq_id, seq))\n", " # For FASTA file\n", " msa_path = os.path.join(jobname, f\"{seq_id}.a3m\")\n", " header = f\">{chain_label}|protein|{msa_path}\"\n", @@ -279,4 +287,4 @@ "outputs": [] } ] -} +} \ No newline at end of file