diff --git a/Boltz1.ipynb b/Boltz1.ipynb index 4ea1a898..a2df3385 100644 --- a/Boltz1.ipynb +++ b/Boltz1.ipynb @@ -6,7 +6,7 @@ "provenance": [], "machine_shape": "hm", "gpuType": "A100", - "authorship_tag": "ABX9TyP7Wu7UlNfSKmSCprzBFVVC", + "authorship_tag": "ABX9TyOQII3DEBf33DR8Jfw5yz60", "include_colab_link": true }, "kernelspec": { @@ -58,7 +58,7 @@ " return x + \"_\" + hashlib.sha1(y.encode()).hexdigest()[:5]\n", "\n", "# User inputs\n", - "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK' #@param {type:\"string\"}\n", + "query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASKKK' #@param {type:\"string\"}\n", "#@markdown - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetro-oligomers). For example **PI...SK:PI...SK** for a homodimer\n", "ligand_input = 'N[C@@H](Cc1ccc(O)cc1)C(=O)O' #@param {type:\"string\"}\n", "#@markdown - Use `:` to specify multiple ligands as smile strings\n", @@ -104,17 +104,25 @@ "fasta_entries = []\n", "csv_entries = []\n", "chain_label_to_seq_id = {}\n", + "seq_to_seq_id = {}\n", + "seq_id_counter = 0 # Counter for unique sequences\n", "\n", "# Process protein sequences\n", - "for i, seq in enumerate(protein_sequences):\n", + "for seq in protein_sequences:\n", " seq = seq.strip()\n", " if not seq:\n", " continue # Skip empty sequences\n", " chain_label = next(chain_labels)\n", - " seq_id = f\"{jobname}_{i}\"\n", + " # Check if sequence has been seen before\n", + " if seq in seq_to_seq_id:\n", + " seq_id = seq_to_seq_id[seq]\n", + " else:\n", + " seq_id = f\"{jobname}_{seq_id_counter}\"\n", + " seq_to_seq_id[seq] = seq_id\n", + " seq_id_counter += 1\n", + " # For CSV file (for ColabFold), add only unique sequences\n", + " csv_entries.append((seq_id, seq))\n", " chain_label_to_seq_id[chain_label] = seq_id\n", - " # For CSV file (for ColabFold)\n", - " csv_entries.append((seq_id, seq))\n", " # For FASTA file\n", " msa_path = os.path.join(jobname, f\"{seq_id}.a3m\")\n", " header = f\">{chain_label}|protein|{msa_path}\"\n", @@ -279,4 +287,4 @@ "outputs": [] } ] -} +} \ No newline at end of file