remove duplicate code in dragen_parse_csv_stats_hWGS.py & dragen_sample_parser.py and rename to dragen_csv_to_picard.py
mskcc · Oct 11, 2023 · ba6f235 · ba6f235
1 parent 6216cb6
commit ba6f235
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 168 deletions.
diff --git a/demux_run_dag.py b/demux_run_dag.py
@@ -318,7 +318,7 @@ def launch_wgs_stats(sample_sheet, sequencer_and_run):
         sequencer = sequencer_and_run.split("_")[0]
         stats_path_for_conversion = stats_path + "/"
         stats_done_dir = "/igo/stats/DONE/" + sequencer + "/"
-        cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_parse_csv_stats_hWGS.py {} {}".format(stats_path_for_conversion, stats_done_dir)
+        cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {}".format(stats_path_for_conversion, stats_done_dir)
         bsub_command_conversion = "bsub -J create_txt_{} -o {}create_txt.out -w \"done({}*)\" {}".format(sequencer_and_run, stats_path_for_conversion, sequencer_and_run, cmd_conversion)
         print(bsub_command_conversion)
         subprocess.run(bsub_command_conversion, shell=True)

diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py
@@ -139,7 +139,7 @@ def rna_alignment_and_metrics(sample, run, sample_parameters, rna_directory, wor
 		rna_dragen_parse_header = "{}___RNA_DRAGEN_PARSE___".format(run)
 		metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
 
-		dragen_parse_rna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
+		dragen_parse_rna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
 		bsub_dragen_parse_rna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(rna_dragen_parse_header, sample.sample_id, rna_dragen_job_name_header, rna_directory, dragen_parse_rna)
 		print(bsub_dragen_parse_rna)
 		call(bsub_dragen_parse_rna, shell = True)
@@ -171,7 +171,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas
 		call(bsub_launch_dragen, shell = True)
 
 		dragen_parse_header = "{}___DRAGEN_PARSE___".format(run)
-		dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
+		dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
 		bsub_dragen_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_parse_header, sample.sample_id, dragen_job_name_header, dragen_directory, dragen_parse_dna)
 		print(bsub_dragen_parse_dna)
 		call(bsub_dragen_parse_dna, shell = True)
@@ -215,7 +215,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di
 
 		# launch DRAGEN PARSER
 		dragen_methylation_parse_header = "{}___DRAGEN_METHYLATION_PARSE___".format(run)
-		dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
+		dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
 		bsub_dragen_methylation_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_methylation_parse_header, sample.sample_id, dragen_methylation_job_name_header, dragen_directory, dragen_methylation_parse_dna)
 		print(bsub_dragen_methylation_parse_dna)
 		call(bsub_dragen_methylation_parse_dna, shell = True)

diff --git a/scripts/dragen_sample_parser.py → scripts/dragen_csv_to_picard.py b/scripts/dragen_sample_parser.py → scripts/dragen_csv_to_picard.py
@@ -1,12 +1,10 @@
-# read in csv stats files generated by dragen alignment and create txt files that can be parsed into NGS database
+# read in the .csv stats files generated by DRAGEN and create txt files, in the Picard stats format, that can be parsed into the NGS database
 # for WGS now
 import pandas as pd
 import sys
 import os
 import glob
 
-# input will be the path to all the csv files and output path for txt files
-
 # read in all the names of csv files, get the name before first dot as sampleID, save in a list called sample_list
 def get_sample_list(folder_path):
     sample_list = []
@@ -29,18 +27,17 @@ class DragenStats:
     def __init__(self):
         self.sample_name = ""
 
-    def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory):
+    def read_info_from_csv(self, mapping_metrics_file, dragen_metrics_directory):
+        print("Reading (" + mapping_metrics_file + ").mapping_metrics.csv .wgs_coverage_metrics.csv")
         # open mapping_metrics csv and store info in dataframe
-        sample_id = metrics_file_prefix.split("___")[2]
-        mapping_metrics_file = "{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id)
         df_mapping_metrics = pd.read_csv(mapping_metrics_file, index_col = 2, nrows = 53, names = [0,1,2,3,4])
         self.READ_PAIRS_EXAMINED = int (df_mapping_metrics.loc["Properly paired reads"][3] / 2)
         self.UNMAPPED_READS = int (df_mapping_metrics.loc["Unmapped reads"][3])
         self.TOTAL_READS = int (df_mapping_metrics.loc["Total input reads"][3])
         self.READ_PAIR_DUPLICATES = int (df_mapping_metrics.loc["Number of duplicate marked reads"][3] / 2)
         self.PERCENT_DUPLICATION = df_mapping_metrics.loc["Number of duplicate marked reads"][4] / 100
         self.READS_ALIGNED_IN_PAIRS = int (df_mapping_metrics.loc["Properly paired reads"][3])
-        # open coverage_metrics*.csv and store info in dataframe
+        # open wgs_coverage_metrics.csv and store info in dataframe
         wgs_coverage_metrics_file = "{}/{}.wgs_coverage_metrics.csv".format(dragen_metrics_directory, sample_id)
         df_coverage_metrics = pd.read_csv(wgs_coverage_metrics_file, index_col = 2, names = [0,1,2,3,4])
         self.MEAN_TARGET_COVERAGE = df_coverage_metrics.loc["Average alignment coverage over genome"][3]
@@ -53,8 +50,8 @@ def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory):
         self.PF_READS_ALIGNED = int(df_coverage_metrics.loc["Aligned reads"][3])
 
 
-    # creat ___AM.txt picard stats format file
-    # requirments: 7 lines minimun, line with data need to start with Category PAIR, file name ends with ___AM.txt
+    # create ___AM.txt picard stats format file
+    # requirements: 7 lines minimum, the line with data needs to start with Category PAIR, file name ends with ___AM.txt
     def write_to_am_txt(self):
         data_list_to_write = [0] * 24
         data_list_to_write[0] = "PAIR"
@@ -67,6 +64,7 @@ def write_to_am_txt(self):
         header = "CATEGORY{0}TOTAL_READS{0}PF_READS{0}PCT_PF_READS{0}PF_NOISE_READS{0}PF_READS_ALIGNED{0}PCT_PF_READS_ALIGNED{0}PF_ALIGNED_BASES{0}PF_HQ_ALIGNED_READS{0}PF_HQ_ALIGNED_BASES{0}PF_HQ_ALIGNED_Q20_BASES{0}PF_HQ_MEDIAN_MISMATCHES{0}PF_MISMATCH_RATE{0}PF_HQ_ERROR_RATE{0}PF_INDEL_RATE{0}MEAN_READ_LENGTH{0}READS_ALIGNED_IN_PAIRS{0}PCT_READS_ALIGNED_IN_PAIRS{0}PF_READS_IMPROPER_PAIRS{0}PCT_PF_READS_IMPROPER_PAIRS{0}BAD_CYCLES{0}STRAND_BALANCE{0}PCT_CHIMERAS{0}PCT_ADAPTER{0}SAMPLE{0}LIBRARY{0}READ_GROUP".format(tab)
 
         write_to_file ="{}/{}___DRAGEN3_10_8___AM.txt".format(work_directory, metrics_file_prefix)
+        print("Writing: " + write_to_file)
         data_line = ""
         for i in data_list_to_write:
             data_line = "{}{}{}".format(data_line, str(i), tab)
@@ -92,6 +90,7 @@ def write_to_md_txt(self):
         header = "LIBRARY{0}UNPAIRED_READS_EXAMINED{0}READ_PAIRS_EXAMINED{0}SECONDARY_OR_SUPPLEMENTARY_RDS{0}UNMAPPED_READS{0}UNPAIRED_READ_DUPLICATES{0}READ_PAIR_DUPLICATES{0}READ_PAIR_OPTICAL_DUPLICATES{0}PERCENT_DUPLICATION{0}ESTIMATED_LIBRARY_SIZE".format(tab)
 
         write_to_file = "{}/{}___DRAGEN3_10_8___MD.txt".format(work_directory, metrics_file_prefix)
+        print("Writing: " + write_to_file)
         data_line = ""
         for i in data_list_to_write:
             data_line = "{}{}{}".format(data_line, str(i), tab)
@@ -116,6 +115,7 @@ def write_to_wgs_txt(self):
         header = "GENOME_TERRITORY{0}MEAN_COVERAGE{0}SD_COVERAGE{0}MEDIAN_COVERAGE{0}MAD_COVERAGE{0}PCT_EXC_ADAPTER{0}PCT_EXC_MAPQ{0}PCT_EXC_DUPE{0}PCT_EXC_UNPAIRED{0}PCT_EXC_BASEQ{0}PCT_EXC_OVERLAP{0}PCT_EXC_CAPPED{0}PCT_EXC_TOTAL{0}PCT_1X{0}PCT_5X{0}PCT_10X{0}PCT_15X{0}PCT_20X{0}PCT_25X{0}PCT_30X{0}PCT_40X{0}PCT_50X{0}PCT_60X{0}PCT_70X{0}PCT_80X{0}PCT_90X{0}PCT_100X{0}FOLD_80_BASE_PENALTY{0}FOLD_90_BASE_PENALTY{0}FOLD_95_BASE_PENALTY{0}HET_SNP_SENSITIVITY{0}HET_SNP_Q".format(tab)
 
         write_to_file = "{}/{}___DRAGEN3_10_8___WGS.txt".format(work_directory, metrics_file_prefix)
+        print("Writing: " + write_to_file)
         data_line = ""
         for i in data_list_to_write:
             data_line = "{}{}{}".format(data_line, str(i), tab)
@@ -126,9 +126,9 @@ def write_to_wgs_txt(self):
                 _file.write("#{}".format(newline))
             _file.write("{0}{1}{0}".format(newline, header))
             _file.write(data_line)
-           
-def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type):
-    
+
+
+def process_one_sample(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type):
     dragen_stats = DragenStats()
     dragen_stats.read_info_from_csv(metrics_file_prefix, dragen_metrics_directory)
     dragen_stats.write_to_am_txt()
@@ -138,24 +138,29 @@ def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_t
 
 
 if __name__ == "__main__":
-    # Usage: python dragenstats_csv_to_txt.py [dragen_stats_dir] [output_file_dir]
-    # example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragenstats_csv_to_txt.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/
-    dragen_metrics_directory = sys.argv[1]
-    work_directory = sys.argv[2]
-    metrics_file_prefix = sys.argv[3]
-    sample_type = sys.argv[4]
-    main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type)
-
-
-
-
-
-
+    # Usage: python dragen_csv_to_picard.py [dragen_stats_dir] [output_file_dir] [metrics_file_prefix] [TYPE]
+    # example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragen_csv_to_picard.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/
 
-
-
-
-
-
-
-
+    # if there are 4 arguments, process only the one sample named in metrics_file_prefix
+    if len(sys.argv) == 5:
+        dragen_metrics_directory = sys.argv[1]
+        work_directory = sys.argv[2]
+        metrics_file_prefix = sys.argv[3]
+        sample_type = sys.argv[4]  # WGS or RNA
+
+        sample_id = metrics_file_prefix.split("___")[2]
+        mapping_metrics_file = "{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id)
+
+        process_one_sample(dragen_metrics_directory, work_directory, mapping_metrics_file, sample_type)
+
+    # if there are 2 arguments, process all DRAGEN WGS stats in the entire directory
+    if len(sys.argv) == 3:
+        dragen_stats_folder = sys.argv[1]
+        output_folder_path = sys.argv[2]
+        sample_list = get_sample_list(dragen_stats_folder)
+        for i in sample_list:
+            dragen_stats = DragenStats(i)
+            dragen_stats.read_info_from_csv(i + ".mapping_metrics.csv", dragen_stats_folder)
+            dragen_stats.write_to_am_txt(output_folder_path)
+            dragen_stats.write_to_md_txt(output_folder_path)
+            dragen_stats.write_to_wgs_txt(output_folder_path)
diff --git a/scripts/dragen_parse_csv_stats_hWGS.py b/scripts/dragen_parse_csv_stats_hWGS.py