From ba6f235b63dd1bc76824b2fcb3b09b12f6aec76e Mon Sep 17 00:00:00 2001 From: David McManamon Date: Wed, 11 Oct 2023 11:53:19 -0400 Subject: [PATCH] remove duplicate code in dragen_parse_csv_stats_hWGS.py & dragen_sample_parser.py and rename to dragen_csv_to_picard.py --- demux_run_dag.py | 2 +- scripts/LaunchMetrics.py | 6 +- ...mple_parser.py => dragen_csv_to_picard.py} | 69 ++++----- scripts/dragen_parse_csv_stats_hWGS.py | 132 ------------------ 4 files changed, 41 insertions(+), 168 deletions(-) rename scripts/{dragen_sample_parser.py => dragen_csv_to_picard.py} (78%) delete mode 100644 scripts/dragen_parse_csv_stats_hWGS.py diff --git a/demux_run_dag.py b/demux_run_dag.py index dbf6c1b..4c1f226 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -318,7 +318,7 @@ def launch_wgs_stats(sample_sheet, sequencer_and_run): sequencer = sequencer_and_run.split("_")[0] stats_path_for_conversion = stats_path + "/" stats_done_dir = "/igo/stats/DONE/" + sequencer + "/" - cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_parse_csv_stats_hWGS.py {} {}".format(stats_path_for_conversion, stats_done_dir) + cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {}".format(stats_path_for_conversion, stats_done_dir) bsub_command_conversion = "bsub -J create_txt_{} -o {}create_txt.out -w \"done({}*)\" {}".format(sequencer_and_run, stats_path_for_conversion, sequencer_and_run, cmd_conversion) print(bsub_command_conversion) subprocess.run(bsub_command_conversion, shell=True) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index dc4c9e1..665cddb 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -139,7 +139,7 @@ def rna_alignment_and_metrics(sample, run, sample_parameters, rna_directory, wor rna_dragen_parse_header = "{}___RNA_DRAGEN_PARSE___".format(run) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) - dragen_parse_rna = 
"/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"]) + dragen_parse_rna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"]) bsub_dragen_parse_rna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(rna_dragen_parse_header, sample.sample_id, rna_dragen_job_name_header, rna_directory, dragen_parse_rna) print(bsub_dragen_parse_rna) call(bsub_dragen_parse_rna, shell = True) @@ -171,7 +171,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas call(bsub_launch_dragen, shell = True) dragen_parse_header = "{}___DRAGEN_PARSE___".format(run) - dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"]) + dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"]) bsub_dragen_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_parse_header, sample.sample_id, dragen_job_name_header, dragen_directory, dragen_parse_dna) print(bsub_dragen_parse_dna) call(bsub_dragen_parse_dna, shell = True) @@ -215,7 +215,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di # launch DRAGEN PARSER dragen_methylation_parse_header = "{}___DRAGEN_METHYLATION_PARSE___".format(run) - dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, 
metric_file_prefix, sample_parameters["TYPE"]) + dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"]) bsub_dragen_methylation_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_methylation_parse_header, sample.sample_id, dragen_methylation_job_name_header, dragen_directory, dragen_methylation_parse_dna) print(bsub_dragen_methylation_parse_dna) call(bsub_dragen_methylation_parse_dna, shell = True) diff --git a/scripts/dragen_sample_parser.py b/scripts/dragen_csv_to_picard.py similarity index 78% rename from scripts/dragen_sample_parser.py rename to scripts/dragen_csv_to_picard.py index 289cb59..2b8b7a5 100644 --- a/scripts/dragen_sample_parser.py +++ b/scripts/dragen_csv_to_picard.py @@ -1,12 +1,10 @@ -# read in csv stats files generated by dragen alignment and create txt files that can be parsed into NGS database +# read in DRAGEN .csv stats files generated and create txt files that can be parsed into NGS database based on the Picard stats format # for WGS now import pandas as pd import sys import os import glob -# input will be the path to all the csv files and output path for txt files - # read in all the names of csv files, get the name before first dot as sampleID, save in a list called sample_list def get_sample_list(folder_path): sample_list = [] @@ -29,10 +27,9 @@ class DragenStats: def __init__(self): self.sample_name = "" - def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory): + def read_info_from_csv(self, mapping_metrics_file, dragen_metrics_directory): + print("Reading (" + mapping_metrics_file + ").mapping_metrics.csv .wgs_coverage_metrics.csv") # open mapping_metrics csv and store info in dataframe - sample_id = metrics_file_prefix.split("___")[2] - mapping_metrics_file = 
"{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id) df_mapping_metrics = pd.read_csv(mapping_metrics_file, index_col = 2, nrows = 53, names = [0,1,2,3,4]) self.READ_PAIRS_EXAMINED = int (df_mapping_metrics.loc["Properly paired reads"][3] / 2) self.UNMAPPED_READS = int (df_mapping_metrics.loc["Unmapped reads"][3]) @@ -40,7 +37,7 @@ def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory): self.READ_PAIR_DUPLICATES = int (df_mapping_metrics.loc["Number of duplicate marked reads"][3] / 2) self.PERCENT_DUPLICATION = df_mapping_metrics.loc["Number of duplicate marked reads"][4] / 100 self.READS_ALIGNED_IN_PAIRS = int (df_mapping_metrics.loc["Properly paired reads"][3]) - # open coverage_metrics*.csv and store info in dataframe + # open wgs_coverage_metrics.csv and store info in dataframe wgs_coverage_metrics_file = "{}/{}.wgs_coverage_metrics.csv".format(dragen_metrics_directory, sample_id) df_coverage_metrics = pd.read_csv(wgs_coverage_metrics_file, index_col = 2, names = [0,1,2,3,4]) self.MEAN_TARGET_COVERAGE = df_coverage_metrics.loc["Average alignment coverage over genome"][3] @@ -53,8 +50,8 @@ def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory): self.PF_READS_ALIGNED = int(df_coverage_metrics.loc["Aligned reads"][3]) - # creat ___AM.txt picard stats format file - # requirments: 7 lines minimun, line with data need to start with Category PAIR, file name ends with ___AM.txt + # create ___AM.txt picard stats format file + # requirements: 7 lines minimum, line with data needs to start with Category PAIR, file name ends with ___AM.txt def write_to_am_txt(self): data_list_to_write = [0] * 24 data_list_to_write[0] = "PAIR" @@ -67,6 +64,7 @@ def write_to_am_txt(self): header = 
"CATEGORY{0}TOTAL_READS{0}PF_READS{0}PCT_PF_READS{0}PF_NOISE_READS{0}PF_READS_ALIGNED{0}PCT_PF_READS_ALIGNED{0}PF_ALIGNED_BASES{0}PF_HQ_ALIGNED_READS{0}PF_HQ_ALIGNED_BASES{0}PF_HQ_ALIGNED_Q20_BASES{0}PF_HQ_MEDIAN_MISMATCHES{0}PF_MISMATCH_RATE{0}PF_HQ_ERROR_RATE{0}PF_INDEL_RATE{0}MEAN_READ_LENGTH{0}READS_ALIGNED_IN_PAIRS{0}PCT_READS_ALIGNED_IN_PAIRS{0}PF_READS_IMPROPER_PAIRS{0}PCT_PF_READS_IMPROPER_PAIRS{0}BAD_CYCLES{0}STRAND_BALANCE{0}PCT_CHIMERAS{0}PCT_ADAPTER{0}SAMPLE{0}LIBRARY{0}READ_GROUP".format(tab) write_to_file ="{}/{}___DRAGEN3_10_8___AM.txt".format(work_directory, metrics_file_prefix) + print("Writing: " + write_to_file) data_line = "" for i in data_list_to_write: data_line = "{}{}{}".format(data_line, str(i), tab) @@ -92,6 +90,7 @@ def write_to_md_txt(self): header = "LIBRARY{0}UNPAIRED_READS_EXAMINED{0}READ_PAIRS_EXAMINED{0}SECONDARY_OR_SUPPLEMENTARY_RDS{0}UNMAPPED_READS{0}UNPAIRED_READ_DUPLICATES{0}READ_PAIR_DUPLICATES{0}READ_PAIR_OPTICAL_DUPLICATES{0}PERCENT_DUPLICATION{0}ESTIMATED_LIBRARY_SIZE".format(tab) write_to_file = "{}/{}___DRAGEN3_10_8___MD.txt".format(work_directory, metrics_file_prefix) + print("Writing: " + write_to_file) data_line = "" for i in data_list_to_write: data_line = "{}{}{}".format(data_line, str(i), tab) @@ -116,6 +115,7 @@ def write_to_wgs_txt(self): header = "GENOME_TERRITORY{0}MEAN_COVERAGE{0}SD_COVERAGE{0}MEDIAN_COVERAGE{0}MAD_COVERAGE{0}PCT_EXC_ADAPTER{0}PCT_EXC_MAPQ{0}PCT_EXC_DUPE{0}PCT_EXC_UNPAIRED{0}PCT_EXC_BASEQ{0}PCT_EXC_OVERLAP{0}PCT_EXC_CAPPED{0}PCT_EXC_TOTAL{0}PCT_1X{0}PCT_5X{0}PCT_10X{0}PCT_15X{0}PCT_20X{0}PCT_25X{0}PCT_30X{0}PCT_40X{0}PCT_50X{0}PCT_60X{0}PCT_70X{0}PCT_80X{0}PCT_90X{0}PCT_100X{0}FOLD_80_BASE_PENALTY{0}FOLD_90_BASE_PENALTY{0}FOLD_95_BASE_PENALTY{0}HET_SNP_SENSITIVITY{0}HET_SNP_Q".format(tab) write_to_file = "{}/{}___DRAGEN3_10_8___WGS.txt".format(work_directory, metrics_file_prefix) + print("Writing: " + write_to_file) data_line = "" for i in data_list_to_write: data_line = 
"{}{}{}".format(data_line, str(i), tab) @@ -126,9 +126,9 @@ def write_to_wgs_txt(self): _file.write("#{}".format(newline)) _file.write("{0}{1}{0}".format(newline, header)) _file.write(data_line) - -def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type): - + + +def process_one_sample(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type): dragen_stats = DragenStats() dragen_stats.read_info_from_csv(metrics_file_prefix, dragen_metrics_directory) dragen_stats.write_to_am_txt() @@ -138,24 +138,29 @@ def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_t if __name__ == "__main__": - # Usage: python dragenstats_csv_to_txt.py [dragen_stats_dir] [output_file_dir] - # example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragenstats_csv_to_txt.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/ - dragen_metrics_directory = sys.argv[1] - work_directory = sys.argv[2] - metrics_file_prefix = sys.argv[3] - sample_type = sys.argv[4] - main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type) - - - - - - + # Usage: python dragen_csv_to_picard.py [dragen_stats_dir] [output_file_dir] [metrics_file_prefix] [TYPE] + # example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragen_csv_to_picard.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/ - - - - - - - \ No newline at end of file + # if there are 4 arguments process the one sample in the metrics_file_prefix + if len(sys.argv) == 5: + dragen_metrics_directory = sys.argv[1] + work_directory = sys.argv[2] + metrics_file_prefix = sys.argv[3] + sample_type = sys.argv[4] # WGS or RNA + + sample_id = metrics_file_prefix.split("___")[2] + mapping_metrics_file = "{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id) + + process_one_sample(dragen_metrics_directory, work_directory, mapping_metrics_file, sample_type) + + # if 
there are 2 arguments to main() then Process all DRAGEN WGS stats in the entire directory + if len(sys.argv) == 3: + dragen_stats_folder = sys.argv[1] + output_folder_path = sys.argv[2] + sample_list = get_sample_list(dragen_stats_folder) + for i in sample_list: + dragen_stats = DragenStats(i) + dragen_stats.read_info_from_csv(i + ".mapping_metrics.csv", dragen_stats_folder) + dragen_stats.write_to_am_txt(output_folder_path) + dragen_stats.write_to_md_txt(output_folder_path) + dragen_stats.write_to_wgs_txt(output_folder_path) diff --git a/scripts/dragen_parse_csv_stats_hWGS.py b/scripts/dragen_parse_csv_stats_hWGS.py deleted file mode 100644 index 7b18201..0000000 --- a/scripts/dragen_parse_csv_stats_hWGS.py +++ /dev/null @@ -1,132 +0,0 @@ -# read in csv stats files generated by dragen alignment and create txt files that can be parsed into NGS database -# for WGS now -import pandas as pd -import sys -import os - -# input will be the path to all the csv files and output path for txt files - -# read in all the names of csv files, get the name before first dot as sampleID, save in a list called sample_list -def get_sample_list(folder_path): - sample_list = [] - # get all the file name in the stats folder - file_list = os.listdir(folder_path) - # if the format of the file name is ..csv, add uniqle sample name to sample_list - for i in file_list: - ls = i.split('.') - if len(ls) == 3 and (ls[-1] == "csv") and (ls[0] not in sample_list): - sample_list.append(ls[0]) - - return sample_list - - -# for each sample in the list, get info needed from csv files and generate txt files -# txt file needed: ___AM.txt, ___WGS.txt, ___MD.txt -class DragenStats: - - def __init__(self, sample_name): - self.sample_name = sample_name - - def read_info_from_csv(self, folder_path): - # open mapping_metrics csv and store info in dataframe - mapping_file_name = folder_path + self.sample_name + ".mapping_metrics.csv" - df_mapping_metrics = pd.read_csv(mapping_file_name, index_col=2, nrows=53, 
names=[0,1,2,3,4]) - self.READ_PAIRS_EXAMINED = int (df_mapping_metrics.loc["Properly paired reads"][3] / 2) - self.UNMAPPED_READS = int (df_mapping_metrics.loc["Unmapped reads"][3]) - self.TOTAL_READS = int (df_mapping_metrics.loc["Total input reads"][3]) - self.READ_PAIR_DUPLICATES = int (df_mapping_metrics.loc["Number of duplicate marked reads"][3] / 2) - self.PERCENT_DUPLICATION = df_mapping_metrics.loc["Number of duplicate marked reads"][4] / 100 - self.READS_ALIGNED_IN_PAIRS = int (df_mapping_metrics.loc["Properly paired reads"][3]) - # open coverage_metrics*.csv and store info in dataframe - coverage_file_name = folder_path + self.sample_name + ".wgs_coverage_metrics.csv" - df_coverage_metrics = pd.read_csv(coverage_file_name, index_col=2, names=[0,1,2,3,4]) - self.MEAN_TARGET_COVERAGE = df_coverage_metrics.loc["Average alignment coverage over genome"][3] - self.PF_READS_ALIGNED = int(df_coverage_metrics.loc["Aligned reads"][3]) - - - # creat ___AM.txt picard stats format file - # requirments: 7 lines minimun, line with data need to start with Category PAIR, file name ends with ___AM.txt - def write_to_am_txt(self, output_path): - data_list_to_write = [0] * 24 - data_list_to_write[0] = "PAIR" - data_list_to_write[1] = self.TOTAL_READS - data_list_to_write[5] = self.PF_READS_ALIGNED - data_list_to_write[16] = self.READS_ALIGNED_IN_PAIRS - - write_to_file = output_path + self.sample_name + "___GRCh38___DRAGEN3_10_8___AM.txt" - data_line = "" - for i in data_list_to_write: - data_line = data_line + str(i) + "\t" - - with open(write_to_file, 'w') as _file: - _file.write("#" + self.sample_name + "\n") - for i in range(6): - _file.write("#\n") - _file.write(data_line) - - # creat ___MD.txt picard stats format file - - def write_to_md_txt(self, output_path): - data_list_to_write = [0] * 10 - data_list_to_write[0] = self.sample_name - data_list_to_write[2] = self.READ_PAIRS_EXAMINED - data_list_to_write[4] = self.UNMAPPED_READS - data_list_to_write[6] = 
self.READ_PAIR_DUPLICATES - data_list_to_write[8] = self.PERCENT_DUPLICATION - - header = "LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE" - write_to_file = output_path + self.sample_name + "___GRCh38___DRAGEN3_10_8___MD.txt" - data_line = "" - for i in data_list_to_write: - data_line = data_line + str(i) + "\t" - - with open(write_to_file, 'w') as _file: - _file.write("#" + self.sample_name + "\n" + "#\n" + "\n") - _file.write(header + "\n") - _file.write(data_line) - - # creat ___WGS.txt picard stats format file - def write_to_wgs_txt(self, output_path): - data_list_to_write = [0] * 28 - data_list_to_write[1] = self.MEAN_TARGET_COVERAGE - - header = "GENOME_TERRITORY MEAN_COVERAGE SD_COVERAGE MEDIAN_COVERAGE MAD_COVERAGE PCT_EXC_ADAPTER PCT_EXC_MAPQ PCT_EXC_DUPE PCT_EXC_UNPAIRED PCT_EXC_BASEQ PCT_EXC_OVERLAP PCT_EXC_CAPPED PCT_EXC_TOTAL PCT_1X PCT_5X PCT_10X PCT_15X PCT_20X PCT_25X PCT_30X PCT_40X PCT_50X PCT_60X PCT_70X PCT_80X PCT_90X PCT_100X FOLD_80_BASE_PENALTY FOLD_90_BASE_PENALTY FOLD_95_BASE_PENALTY HET_SNP_SENSITIVITY HET_SNP_Q" - write_to_file = output_path + self.sample_name + "___GRCh38___DRAGEN3_10_8___WGS.txt" - data_line = "" - for i in data_list_to_write: - data_line = data_line + str(i) + "\t" - - with open(write_to_file, 'w') as _file: - _file.write("#" + self.sample_name + "\n") - for i in range(4): - _file.write("#\n") - _file.write("\n" + header + "\n") - _file.write(data_line) - -if __name__ == "__main__": - # Usage: python dragenstats_csv_to_txt.py [dragen_stats_dir] [output_file_dir] - # example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragenstats_csv_to_txt.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/ - dragen_stats_folder = sys.argv[1] - output_folder_path = sys.argv[2] - sample_list = 
get_sample_list(dragen_stats_folder) - for i in sample_list: - dragen_stats = DragenStats(i) - dragen_stats.read_info_from_csv(dragen_stats_folder) - dragen_stats.write_to_am_txt(output_folder_path) - dragen_stats.write_to_md_txt(output_folder_path) - dragen_stats.write_to_wgs_txt(output_folder_path) - - - - - - - - - - - - - - -