From 7bf1618f269a93dd36c5d5b74a0d7c11d6cafe51 Mon Sep 17 00:00:00 2001
From: Olga Tsiouri <olgatsiouri@outlook.com>
Date: Mon, 22 Nov 2021 22:34:47 +0200
Subject: [PATCH] fix index count & add subset_pdb_to_fasta_gui.py and
 executable

---
 README.md                                   |  4 +-
 fasta_manipulation/trim_multifasta_gui.py   | 39 ++++++++-----
 fasta_manipulation/trim_singlefastas_gui.py | 23 ++++++--
 pdb_corner/subset_pdb_to_fasta_gui.py       | 62 +++++++++++++++++++++
 4 files changed, 108 insertions(+), 20 deletions(-)
 create mode 100644 pdb_corner/subset_pdb_to_fasta_gui.py

diff --git a/README.md b/README.md
index 4266864..efd4887 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# bioinfo_gui_scripts [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5703709.svg)](https://doi.org/10.5281/zenodo.5703709)
+# bioinfo_gui_scripts 
 python scripts that can be easily transformed to gui programs for wet lab scientists to use(see the wiki page for documentation and depedences)
 ## GUI stadalone programs(.exe)
 1. DSSP statistics GUI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4838997.svg)](https://doi.org/10.5281/zenodo.4838997)
@@ -19,3 +19,5 @@ python scripts that can be easily transformed to gui programs for wet lab scient
 16. tabular file to single-fastas GUI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5652249.svg)](https://doi.org/10.5281/zenodo.5652249)
 17. fasta formatter GUI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5703665.svg)](https://doi.org/10.5281/zenodo.5703665)
 18. chain pdb to fasta GUI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5706468.svg)](https://doi.org/10.5281/zenodo.5706468)
+19. subset pdb to fasta GUI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5718967.svg)](https://doi.org/10.5281/zenodo.5718967)
+
diff --git a/fasta_manipulation/trim_multifasta_gui.py b/fasta_manipulation/trim_multifasta_gui.py
index d9cf286..8330425 100644
--- a/fasta_manipulation/trim_multifasta_gui.py
+++ b/fasta_manipulation/trim_multifasta_gui.py
@@ -2,21 +2,34 @@
 from gooey import *
 from Bio import SeqIO
 # input parameters
-@Gooey(required_cols=5, program_name='trim multifasta', header_bg_color= '#DCDCDC', terminal_font_color= '#DCDCDC', terminal_panel_color= '#DCDCDC')
+@Gooey(required_cols=3, program_name='trim multifasta', header_bg_color= '#DCDCDC', terminal_font_color= '#DCDCDC', terminal_panel_color= '#DCDCDC')
 def main():
-  ap = GooeyParser()
-  ap.add_argument("-in", "--input", required=True, widget='FileChooser', help="input fasta file")
-  ap.add_argument("-start", "--start", required=True, type=int, help="region to start writing the fasta file(starts from 0)")
-  ap.add_argument("-stop", "--stop", required=True, type=int, help="region to stop writing the fasta file(it can also be a negative number to remove nucleotides from the end of the sequence)")
-  ap.add_argument("-out", "--output", required=True, widget='FileSaver', help="output fasta file")
-  args = vars(ap.parse_args())
-# main
-  sequences = []  # setup an empty list
-  for record in SeqIO.parse(args['input'], "fasta"):
-        # add this record to the list
-      sequences.append(record[args['start']:args['stop']])
+    """Trim each record of a multi-fasta file and save the result.
+
+    -start and a positive -stop are 1-based, inclusive positions; a negative
+    -stop removes that many residues from the end of each sequence.
+    """
+    ap = GooeyParser()
+    ap.add_argument("-in", "--input", required=True, widget='FileChooser', help="input fasta file")
+    ap.add_argument("-start", "--start", required=False, default=1, type=int, help="region to start writing the fasta file")
+    ap.add_argument("-stop", "--stop", required=True, type=int, help="region to stop writing the fasta file(it can be both a positive and  a negative number)")
+    ap.add_argument("-out", "--output", required=True, widget='FileSaver', help="output fasta file")
+    args = vars(ap.parse_args())
+    # -start is 1-based, so only positive values are valid
+    if args['start'] <= 0:
+        print("-start parameter must be a positive integer")
+        exit(1)
+    # shift the 1-based start to a 0-based slice index; a positive stop needs
+    # no shift because an inclusive 1-based end equals an exclusive slice end
+    seq_start = args['start'] - 1
+    seq_end = args['stop']
+    sequences = [
+        record[seq_start:seq_end]
+        for record in SeqIO.parse(args['input'], "fasta")
+    ]
 
-  SeqIO.write(sequences, args['output'], "fasta")
+    # export to fasta
+    SeqIO.write(sequences, args['output'], "fasta")
 
 if __name__ == '__main__':
     main()
diff --git a/fasta_manipulation/trim_singlefastas_gui.py b/fasta_manipulation/trim_singlefastas_gui.py
index df66cd1..c3f3b40 100644
--- a/fasta_manipulation/trim_singlefastas_gui.py
+++ b/fasta_manipulation/trim_singlefastas_gui.py
@@ -3,20 +3,31 @@
 from gooey import *
 from Bio import SeqIO
 # input parameters
-@Gooey(required_cols=3, program_name='trim multiple single-fasta files', header_bg_color= '#DCDCDC', terminal_font_color= '#DCDCDC', terminal_panel_color= '#DCDCDC')
+@Gooey(required_cols=2, program_name='trim multiple single-fasta files', header_bg_color= '#DCDCDC', terminal_font_color= '#DCDCDC', terminal_panel_color= '#DCDCDC')
 def main():
+    """Trim every .fa/.fasta file of a directory and save each result.
+
+    Output files are named <name before the first dot>_trimmed.fasta.
+    """
     ap = GooeyParser()
-    ap.add_argument("-start", "--start_fasta", required=True, type=int, help="region to start writing the fasta file(min number 0)")
-    ap.add_argument("-stop", "--stop_fasta", required=True, type=int, help="region to stop writing the fasta file(negative number to remove nucleotides from the end of the sequence")
+    ap.add_argument("-start", "--start_fasta", required=False, default=1, type=int, help="region to start writing the fasta file")
+    ap.add_argument("-stop", "--stop", required=True, type=int, help="region to stop writing the fasta file(it can be both a positive and  a negative number)")
     ap.add_argument("-dir", "--directory", required=True, type=str, widget='DirChooser', help="directory to search for fasta files")
     args = vars(ap.parse_args())
-    # main
-    # import each fasta file from a working directory of choice
+    # argparse stores "--start_fasta" under the key 'start_fasta', not 'start'
+    if args['start_fasta'] <= 0:
+        print("-start parameter must be a positive integer")
+        exit(1)
+    # shift the 1-based start to a 0-based slice index
+    seq_start = args['start_fasta'] - 1
+    # a positive inclusive 1-based stop is already the exclusive slice end
+    seq_end = args['stop']
+    # import each fasta file from a working directory of choice
     for filename in sorted(os.listdir(os.chdir(args['directory']))):
         if filename.endswith(".fa") or filename.endswith(".fasta"):
             # read each file, trim and create SeqRecord to export
             record = SeqIO.read(filename, "fasta")
-            sequence = record[args['start_fasta']:args['stop_fasta']]
+            sequence = record[seq_start:seq_end]
             # export to fasta
             SeqIO.write(sequence, "".join([filename.split(".")[0],"_","trimmed",".fasta"]), "fasta")
 
diff --git a/pdb_corner/subset_pdb_to_fasta_gui.py b/pdb_corner/subset_pdb_to_fasta_gui.py
new file mode 100644
index 0000000..dac49a8
--- /dev/null
+++ b/pdb_corner/subset_pdb_to_fasta_gui.py
@@ -0,0 +1,62 @@
+# python3
+import os
+from gooey import *
+from Bio.PDB import *
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+# input parameters
+@Gooey(required_cols=2, program_name='subset pdb to fasta', header_bg_color= '#DCDCDC', terminal_font_color= '#DCDCDC', terminal_panel_color= '#DCDCDC')
+def main():
+    """Write a region of one chain of a pdb file to a fasta file.
+
+    -start and a positive -end are 1-based, inclusive positions in the chain
+    sequence; a negative -end removes that many residues from its end.
+    With -pro 2 the region always runs to the last amino acid of the chain.
+    The output is named <pdb id>_<chain>_<start>_<end>.fasta.
+    """
+    ap = GooeyParser()
+    ap.add_argument("-pdb", "--pdb", required=True, widget='FileChooser', help="input pdb file")
+    ap.add_argument("-model", "--model",required=False, default=0, help="model from pdb file to select(integer). Default is 0(1 model only)")
+    ap.add_argument("-chain", "--chain", required=True, help="chain from pdb file to select")
+    ap.add_argument("-start", "--start", required=False, default=1, type=int, help="amino acid in chain to start writing the fasta file")
+    ap.add_argument("-end", "--end", required=False, type=int, help="amino acid in chain to end writing the fasta file")
+    ap.add_argument("-pro", "--program", required=False,default=1, type=int, help="program to choose 1) add both start and end location 2) the end location with be that of the latest amino acid in the chain. Default is 1")
+    args = vars(ap.parse_args())
+    # main
+    # -start is 1-based, so only positive values are valid (both programs)
+    if args['start'] <= 0:
+        print("-start parameter must be a positive integer")
+        exit(1)
+    # program 1 cannot work without an explicit end location
+    if args['program'] == 1 and args['end'] is None:
+        print("-end parameter is required when -pro is 1")
+        exit(1)
+    # select chain
+    parser = PDBParser()
+    s = parser.get_structure("name", args['pdb'])
+    fill = s[int(args['model'])][args['chain']]
+    # retrieve the pdb id of the input file
+    filename = os.path.split(args['pdb'])[1]
+    pdb_id = filename.split(".")[0]
+    # retrieve chain amino acids; a chain with gaps is built as several
+    # peptides, so join them all instead of keeping only the last one
+    ppb = PPBuilder()
+    aa_chain = "".join(str(pp.get_sequence()) for pp in ppb.build_peptides(fill))
+    # program 2: the end location is the last amino acid in the chain
+    if args['program'] != 1:
+        args['end'] = len(aa_chain)
+    # convert the 1-based start position to a 0-based slice index
+    aa_start = args['start'] - 1
+    # a positive 1-based inclusive end equals Python's exclusive slice end,
+    # so it is used unchanged
+    aa_end = args['end']
+    # subset based on aa in chain
+    sub_seq = aa_chain[aa_start:aa_end]
+    # export to fasta
+    seq_id = "_".join([str(pdb_id), str(args['chain']), str(args['start']), str(args['end'])])
+    record = SeqRecord(Seq(sub_seq), id=seq_id, description="")
+    SeqIO.write(record, seq_id + ".fasta", "fasta")
+   
+if __name__ == '__main__':
+    main()
\ No newline at end of file