From 79328136fc983565ac212cd298c861253bd0b1a3 Mon Sep 17 00:00:00 2001
From: Martin Steinegger <martin.steinegger@mpibpc.mpg.de>
Date: Mon, 18 Nov 2024 01:22:35 +0100
Subject: [PATCH] Fix homo-x-mer

---
 Boltz1.ipynb | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/Boltz1.ipynb b/Boltz1.ipynb
index 4ea1a898..a2df3385 100644
--- a/Boltz1.ipynb
+++ b/Boltz1.ipynb
@@ -6,7 +6,7 @@
       "provenance": [],
       "machine_shape": "hm",
       "gpuType": "A100",
-      "authorship_tag": "ABX9TyP7Wu7UlNfSKmSCprzBFVVC",
+      "authorship_tag": "ABX9TyOQII3DEBf33DR8Jfw5yz60",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -58,7 +58,7 @@
         "    return x + \"_\" + hashlib.sha1(y.encode()).hexdigest()[:5]\n",
         "\n",
         "# User inputs\n",
-        "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK'  #@param {type:\"string\"}\n",
+        "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASKKK'  #@param {type:\"string\"}\n",
         "#@markdown  - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetro-oligomers). For example **PI...SK:PI...SK** for a homodimer\n",
         "ligand_input = 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'  #@param {type:\"string\"}\n",
         "#@markdown  - Use `:` to specify multiple ligands as smile strings\n",
@@ -104,17 +104,25 @@
         "fasta_entries = []\n",
         "csv_entries = []\n",
         "chain_label_to_seq_id = {}\n",
+        "seq_to_seq_id = {}\n",
+        "seq_id_counter = 0  # Counter for unique sequences\n",
         "\n",
         "# Process protein sequences\n",
-        "for i, seq in enumerate(protein_sequences):\n",
+        "for seq in protein_sequences:\n",
         "    seq = seq.strip()\n",
         "    if not seq:\n",
         "        continue  # Skip empty sequences\n",
         "    chain_label = next(chain_labels)\n",
-        "    seq_id = f\"{jobname}_{i}\"\n",
+        "    # Check if sequence has been seen before\n",
+        "    if seq in seq_to_seq_id:\n",
+        "        seq_id = seq_to_seq_id[seq]\n",
+        "    else:\n",
+        "        seq_id = f\"{jobname}_{seq_id_counter}\"\n",
+        "        seq_to_seq_id[seq] = seq_id\n",
+        "        seq_id_counter += 1\n",
+        "        # For CSV file (for ColabFold), add only unique sequences\n",
+        "        csv_entries.append((seq_id, seq))\n",
         "    chain_label_to_seq_id[chain_label] = seq_id\n",
-        "    # For CSV file (for ColabFold)\n",
-        "    csv_entries.append((seq_id, seq))\n",
         "    # For FASTA file\n",
         "    msa_path = os.path.join(jobname, f\"{seq_id}.a3m\")\n",
         "    header = f\">{chain_label}|protein|{msa_path}\"\n",
@@ -279,4 +287,4 @@
       "outputs": []
     }
   ]
-}
+}
\ No newline at end of file