Skip to content

Commit

Permalink
remove duplicate code in dragen_parse_csv_stats_hWGS.py & dragen_samp…
Browse files Browse the repository at this point in the history
…le_parser.py

and rename to dragen_csv_to_picard.py
  • Loading branch information
dmcmanam committed Oct 11, 2023
1 parent 6216cb6 commit ba6f235
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 168 deletions.
2 changes: 1 addition & 1 deletion demux_run_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def launch_wgs_stats(sample_sheet, sequencer_and_run):
sequencer = sequencer_and_run.split("_")[0]
stats_path_for_conversion = stats_path + "/"
stats_done_dir = "/igo/stats/DONE/" + sequencer + "/"
cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_parse_csv_stats_hWGS.py {} {}".format(stats_path_for_conversion, stats_done_dir)
cmd_conversion = "python /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {}".format(stats_path_for_conversion, stats_done_dir)
bsub_command_conversion = "bsub -J create_txt_{} -o {}create_txt.out -w \"done({}*)\" {}".format(sequencer_and_run, stats_path_for_conversion, sequencer_and_run, cmd_conversion)
print(bsub_command_conversion)
subprocess.run(bsub_command_conversion, shell=True)
Expand Down
6 changes: 3 additions & 3 deletions scripts/LaunchMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def rna_alignment_and_metrics(sample, run, sample_parameters, rna_directory, wor
rna_dragen_parse_header = "{}___RNA_DRAGEN_PARSE___".format(run)
metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])

dragen_parse_rna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
dragen_parse_rna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(rna_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
bsub_dragen_parse_rna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(rna_dragen_parse_header, sample.sample_id, rna_dragen_job_name_header, rna_directory, dragen_parse_rna)
print(bsub_dragen_parse_rna)
call(bsub_dragen_parse_rna, shell = True)
Expand Down Expand Up @@ -171,7 +171,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas
call(bsub_launch_dragen, shell = True)

dragen_parse_header = "{}___DRAGEN_PARSE___".format(run)
dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
dragen_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
bsub_dragen_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_parse_header, sample.sample_id, dragen_job_name_header, dragen_directory, dragen_parse_dna)
print(bsub_dragen_parse_dna)
call(bsub_dragen_parse_dna, shell = True)
Expand Down Expand Up @@ -215,7 +215,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di

# launch DRAGEN PARSER
dragen_methylation_parse_header = "{}___DRAGEN_METHYLATION_PARSE___".format(run)
dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_sample_parser.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
dragen_methylation_parse_dna = "/home/igo/miniconda_airflow/bin/python3 /igo/work/igo/igo-demux/scripts/dragen_csv_to_picard.py {} {} {} {}".format(dragen_directory, work_directory, metric_file_prefix, sample_parameters["TYPE"])
bsub_dragen_methylation_parse_dna = "bsub -J {0}{1} -o {0}{1}.out -w \"done({2}{1})\" -cwd \"{3}\" -n 8 -M 8 {4}".format(dragen_methylation_parse_header, sample.sample_id, dragen_methylation_job_name_header, dragen_directory, dragen_methylation_parse_dna)
print(bsub_dragen_methylation_parse_dna)
call(bsub_dragen_methylation_parse_dna, shell = True)
Expand Down
69 changes: 37 additions & 32 deletions scripts/dragen_sample_parser.py → scripts/dragen_csv_to_picard.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
# read in csv stats files generated by dragen alignment and create txt files that can be parsed into NGS database
# read in the .csv stats files generated by DRAGEN and create txt files, based on the Picard stats format, that can be parsed into the NGS database
# for WGS now
import pandas as pd
import sys
import os
import glob

# input will be the path to all the csv files and output path for txt files

# read in all the names of csv files, get the name before first dot as sampleID, save in a list called sample_list
def get_sample_list(folder_path):
sample_list = []
Expand All @@ -29,18 +27,17 @@ class DragenStats:
def __init__(self):
self.sample_name = ""

def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory):
def read_info_from_csv(self, mapping_metrics_file, dragen_metrics_directory):
print("Reading (" + mapping_metrics_file + ").mapping_metrics.csv .wgs_coverage_metrics.csv")
# open mapping_metrics csv and store info in dataframe
sample_id = metrics_file_prefix.split("___")[2]
mapping_metrics_file = "{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id)
df_mapping_metrics = pd.read_csv(mapping_metrics_file, index_col = 2, nrows = 53, names = [0,1,2,3,4])
self.READ_PAIRS_EXAMINED = int (df_mapping_metrics.loc["Properly paired reads"][3] / 2)
self.UNMAPPED_READS = int (df_mapping_metrics.loc["Unmapped reads"][3])
self.TOTAL_READS = int (df_mapping_metrics.loc["Total input reads"][3])
self.READ_PAIR_DUPLICATES = int (df_mapping_metrics.loc["Number of duplicate marked reads"][3] / 2)
self.PERCENT_DUPLICATION = df_mapping_metrics.loc["Number of duplicate marked reads"][4] / 100
self.READS_ALIGNED_IN_PAIRS = int (df_mapping_metrics.loc["Properly paired reads"][3])
# open coverage_metrics*.csv and store info in dataframe
# open wgs_coverage_metrics.csv and store info in dataframe
wgs_coverage_metrics_file = "{}/{}.wgs_coverage_metrics.csv".format(dragen_metrics_directory, sample_id)
df_coverage_metrics = pd.read_csv(wgs_coverage_metrics_file, index_col = 2, names = [0,1,2,3,4])
self.MEAN_TARGET_COVERAGE = df_coverage_metrics.loc["Average alignment coverage over genome"][3]
Expand All @@ -53,8 +50,8 @@ def read_info_from_csv(self, metrics_file_prefix, dragen_metrics_directory):
self.PF_READS_ALIGNED = int(df_coverage_metrics.loc["Aligned reads"][3])


# creat ___AM.txt picard stats format file
# requirments: 7 lines minimun, line with data need to start with Category PAIR, file name ends with ___AM.txt
# create ___AM.txt picard stats format file
# requirements: 7 lines minimum, the line with data needs to start with Category PAIR, file name ends with ___AM.txt
def write_to_am_txt(self):
data_list_to_write = [0] * 24
data_list_to_write[0] = "PAIR"
Expand All @@ -67,6 +64,7 @@ def write_to_am_txt(self):
header = "CATEGORY{0}TOTAL_READS{0}PF_READS{0}PCT_PF_READS{0}PF_NOISE_READS{0}PF_READS_ALIGNED{0}PCT_PF_READS_ALIGNED{0}PF_ALIGNED_BASES{0}PF_HQ_ALIGNED_READS{0}PF_HQ_ALIGNED_BASES{0}PF_HQ_ALIGNED_Q20_BASES{0}PF_HQ_MEDIAN_MISMATCHES{0}PF_MISMATCH_RATE{0}PF_HQ_ERROR_RATE{0}PF_INDEL_RATE{0}MEAN_READ_LENGTH{0}READS_ALIGNED_IN_PAIRS{0}PCT_READS_ALIGNED_IN_PAIRS{0}PF_READS_IMPROPER_PAIRS{0}PCT_PF_READS_IMPROPER_PAIRS{0}BAD_CYCLES{0}STRAND_BALANCE{0}PCT_CHIMERAS{0}PCT_ADAPTER{0}SAMPLE{0}LIBRARY{0}READ_GROUP".format(tab)

write_to_file ="{}/{}___DRAGEN3_10_8___AM.txt".format(work_directory, metrics_file_prefix)
print("Writing: " + write_to_file)
data_line = ""
for i in data_list_to_write:
data_line = "{}{}{}".format(data_line, str(i), tab)
Expand All @@ -92,6 +90,7 @@ def write_to_md_txt(self):
header = "LIBRARY{0}UNPAIRED_READS_EXAMINED{0}READ_PAIRS_EXAMINED{0}SECONDARY_OR_SUPPLEMENTARY_RDS{0}UNMAPPED_READS{0}UNPAIRED_READ_DUPLICATES{0}READ_PAIR_DUPLICATES{0}READ_PAIR_OPTICAL_DUPLICATES{0}PERCENT_DUPLICATION{0}ESTIMATED_LIBRARY_SIZE".format(tab)

write_to_file = "{}/{}___DRAGEN3_10_8___MD.txt".format(work_directory, metrics_file_prefix)
print("Writing: " + write_to_file)
data_line = ""
for i in data_list_to_write:
data_line = "{}{}{}".format(data_line, str(i), tab)
Expand All @@ -116,6 +115,7 @@ def write_to_wgs_txt(self):
header = "GENOME_TERRITORY{0}MEAN_COVERAGE{0}SD_COVERAGE{0}MEDIAN_COVERAGE{0}MAD_COVERAGE{0}PCT_EXC_ADAPTER{0}PCT_EXC_MAPQ{0}PCT_EXC_DUPE{0}PCT_EXC_UNPAIRED{0}PCT_EXC_BASEQ{0}PCT_EXC_OVERLAP{0}PCT_EXC_CAPPED{0}PCT_EXC_TOTAL{0}PCT_1X{0}PCT_5X{0}PCT_10X{0}PCT_15X{0}PCT_20X{0}PCT_25X{0}PCT_30X{0}PCT_40X{0}PCT_50X{0}PCT_60X{0}PCT_70X{0}PCT_80X{0}PCT_90X{0}PCT_100X{0}FOLD_80_BASE_PENALTY{0}FOLD_90_BASE_PENALTY{0}FOLD_95_BASE_PENALTY{0}HET_SNP_SENSITIVITY{0}HET_SNP_Q".format(tab)

write_to_file = "{}/{}___DRAGEN3_10_8___WGS.txt".format(work_directory, metrics_file_prefix)
print("Writing: " + write_to_file)
data_line = ""
for i in data_list_to_write:
data_line = "{}{}{}".format(data_line, str(i), tab)
Expand All @@ -126,9 +126,9 @@ def write_to_wgs_txt(self):
_file.write("#{}".format(newline))
_file.write("{0}{1}{0}".format(newline, header))
_file.write(data_line)
def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type):


def process_one_sample(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type):
dragen_stats = DragenStats()
dragen_stats.read_info_from_csv(metrics_file_prefix, dragen_metrics_directory)
dragen_stats.write_to_am_txt()
Expand All @@ -138,24 +138,29 @@ def main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_t


if __name__ == "__main__":
# Usage: python dragenstats_csv_to_txt.py [dragen_stats_dir] [output_file_dir]
# example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragenstats_csv_to_txt.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/
dragen_metrics_directory = sys.argv[1]
work_directory = sys.argv[2]
metrics_file_prefix = sys.argv[3]
sample_type = sys.argv[4]
main(dragen_metrics_directory, work_directory, metrics_file_prefix, sample_type)






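# Usage: python dragen_csv_to_picard.py [dragen_stats_dir] [output_file_dir] [metrics_file_prefix] [TYPE]
# (pass only [dragen_stats_dir] [output_file_dir] to process every sample in the directory)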
# example: python3 /Users/luc/Documents/GitHub/igo-demux/scripts/dragen_csv_to_picard.py /Users/luc/Documents/GitHub/igo-demux/test/ /Users/luc/Documents/GitHub/igo-demux/test/result_test/








# if there are 4 arguments, process only the one sample named in metrics_file_prefix
if len(sys.argv) == 5:
dragen_metrics_directory = sys.argv[1]
work_directory = sys.argv[2]
metrics_file_prefix = sys.argv[3]
sample_type = sys.argv[4] # WGS or RNA

sample_id = metrics_file_prefix.split("___")[2]
mapping_metrics_file = "{}/{}.mapping_metrics.csv".format(dragen_metrics_directory, sample_id)

process_one_sample(dragen_metrics_directory, work_directory, mapping_metrics_file, sample_type)

# if there are 2 arguments, process all DRAGEN WGS stats in the entire directory
if len(sys.argv) == 3:
dragen_stats_folder = sys.argv[1]
output_folder_path = sys.argv[2]
sample_list = get_sample_list(dragen_stats_folder)
for i in sample_list:
dragen_stats = DragenStats(i)
dragen_stats.read_info_from_csv(i + ".mapping_metrics.csv", dragen_stats_folder)
dragen_stats.write_to_am_txt(output_folder_path)
dragen_stats.write_to_md_txt(output_folder_path)
dragen_stats.write_to_wgs_txt(output_folder_path)
132 changes: 0 additions & 132 deletions scripts/dragen_parse_csv_stats_hWGS.py

This file was deleted.

0 comments on commit ba6f235

Please sign in to comment.