Merge branch 'main' of https://github.com/mskcc/igo-demux

mskcc · Jul 3, 2024 · 92a4a6e · 92a4a6e
2 parents cc240c1 + e512fe7
commit 92a4a6e
Show file tree

Hide file tree

Showing 22 changed files with 1,539 additions and 549 deletions.
diff --git a/SampleSheet.py b/SampleSheet.py
@@ -99,7 +99,7 @@ def split_sample_sheet(self):
          if sample sheet recipes have mixed DLP and other all DLP need to go on a separate sample sheet named "_DLP"
         """
         # if 10x DRAGEN demux add to header CreateFastqForIndexReads,1,,,,,,,
-        if any("10X_" in s for s in self.recipe_set):
+        if any("SC_Chromium" in s for s in self.recipe_set):
             print("Adding CreateFastqForIndexReads,1 to sample sheet header since 10X samples are present")
             self.df_ss_header.loc[len(self.df_ss_header.index)-1] = ["CreateFastqForIndexReads",1,"","","","","","",""]
             self.df_ss_header.loc[len(self.df_ss_header.index)] = ["[Data]","","","","","","","",""]
@@ -111,12 +111,12 @@ def split_sample_sheet(self):
         split_ss_list = [ss_copy, self]  
 
         was_split = False
-        if "DLP" in self.recipe_set and len(self.recipe_set) > 1:
+        if "SC_DLP" in self.recipe_set and len(self.recipe_set) > 1:
             print("Copying all DLP samples to a new sample sheet")
             # copy all DLP rows to a new sample sheet
-            dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == True].copy()
+            dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == True].copy()
             # and remove DLP samples from the main sample sheet
-            self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == False].copy()
+            self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == False].copy()
             # rename DLP sample sheet w/"_DLP.csv"
             dlp_path = os.path.splitext(self.path)[0]+'_DLP.csv'
             header_copy = self.df_ss_header.copy(deep=True)

diff --git a/deliver_pipeline_dag.py b/deliver_pipeline_dag.py
@@ -25,6 +25,7 @@
     def deliver(ds, **kwargs):
         project = kwargs["params"]["project"]
         pi = kwargs["params"]["pi"]
+        # NOTE: "recipe" here actually holds the request name
         recipe = kwargs["params"]["recipe"]
         print("Delivering the pipeline output and/or .bams for {} {} {}".format(project, pi, recipe))
 

diff --git a/demux_run_dag.py b/demux_run_dag.py
@@ -3,7 +3,6 @@
 import subprocess
 from datetime import datetime, timedelta
 
-from numpy import equal
 import pandas
 from SampleSheet import SampleSheet
 import scripts.organise_fastq_split_by_lane
@@ -67,7 +66,7 @@ def demux(ds, **kwargs):
 
         # check if the sample sheet contains DLP project
         is_DLP = False
-        if "DLP" in sample_sheet.recipe_set:
+        if "SC_DLP" in sample_sheet.recipe_set:
             is_DLP = True
             dragen_demux = True
 
@@ -108,7 +107,7 @@ def demux(ds, **kwargs):
     def get_dlp_chip(samplesheet, project):
         samplesheet.df_ss_data.reset_index()
         for index, row in samplesheet.df_ss_data.iterrows():
-            if row['Sample_Well'] == 'DLP' and project == row['Sample_Project']:
+            if row['Sample_Well'] == 'SC_DLP' and project == row['Sample_Project']:
                 # return chip from 071PP_DLP_UNSORTED_128624A_13_12_IGO_09443_CU_1_1_121
                 sample = row['Sample_ID']
                 return get_dlp_chip_from_sample_name(sample)
@@ -136,7 +135,7 @@ def stats(ds, **kwargs):
         if "REFERENCE" in samplesheet_path:
             return "No stats for reference "  + samplesheet_path
 
-        if "DLP" in sample_sheet.recipe_set:
+        if "SC_DLP" in sample_sheet.recipe_set:
             scripts.get_total_reads_from_demux.run_DLP(sample_sheet, sequencer_and_run)
             scripts.upload_stats.upload_stats(sequencer_and_run)
 
@@ -170,15 +169,18 @@ def stats(ds, **kwargs):
 
             return "DLP stats posted and yaml file generated"
 
-        if any("10X_" in s for s in sample_sheet.recipe_set):
+        # check if the run is 10X by read length
+        atac, use_bases_mask = scripts.get_sequencing_read_data.main(sequencer_path)
+        print("read length: {}".format(use_bases_mask))
+        if use_bases_mask == [29, 89] or atac:
             # if is atac run, demux is using cellranger mkfastq
-            if scripts.get_sequencing_read_data.main(sequencer_path)[0]:
+            if atac:
                 scripts.get_total_reads_from_demux.by_json(sequencer_and_run)
                 scripts.upload_stats.upload_stats(sequencer_and_run)
 
                 # launch cell ranger based on recipe
                 sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
-                scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix)
+                scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix)
 
             else:
                 # step 1, generate txt files containing total reads and upload to qc website
@@ -188,10 +190,10 @@ def stats(ds, **kwargs):
                 # step 2, start cell ranger based on recipe/barcode, check whether multiple fastq files existing
                 # trim sequencer_and_run if postfix like _10X exsiting
                 sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
-                scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix)
+                scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix)
 
                 # add DONE file when all the 10X pipeline finished, -K to wait until finish
-                cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/stats/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix)
+                cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/staging/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix)
                 print(cmd)
                 subprocess.run(cmd, shell=True)
 
@@ -212,7 +214,7 @@ def stats(ds, **kwargs):
 
     def fingerprinting(ds, **kwargs):
         # read in sample sheet as arguments, filter out projects that need to run fingerprinting
-        recipe_list_for_fp = [".*IMPACT*", ".*Heme*", "IDT_Exome*", "WholeExomeSequencing", "Twist_Exome", "MSK-ACCESS*", "CMO-CH", "HumanWholeGenome"]
+        recipe_list_for_fp = ["PED-PEG", "WGS_Deep", "HC_IMPACT", "HC_IMPACT-Heme", "HC_ACCESS", "WES_Human", "HC_CMOCH"]
         # call fingerprinting_dag.py for each project
         samplesheet_path = kwargs["params"]["samplesheet"]
 
@@ -226,13 +228,9 @@ def fingerprinting(ds, **kwargs):
         project_list_to_run = []        
         for project, recipe in sample_sheet.project_dict.items():
             # fingerprinting only support human
-            if project_genome_dict[project] == "Human":
-                for recipe_list_item in recipe_list_for_fp:
-                    print(project, recipe)
-                    expr = re.compile(recipe_list_item)
-                    if expr.match(recipe):
-                        project_list_to_run.append(project)
-                        break
+            if project_genome_dict[project] == "Human" and recipe in recipe_list_for_fp:
+                project_list_to_run.append(project)
+
         print("Projects need to run fp: {}".format(project_list_to_run))
         if len(project_list_to_run) == 0:
             return "No project need to run fingerprinting"

diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py
@@ -10,12 +10,16 @@
 import shutil
 import pathlib
 import scripts.generate_run_params
+import scripts.get_total_reads_from_demux
 
 
 # Global Variable : we do not want to process these experiments in this script
-DO_NOT_PROCESS = ["10X_Genomics", "DLP"]
+DO_NOT_PROCESS = ["SC_DLP"]
 # These recipes will be evaluated using DRAGEN because of their larger size of fastqs
-RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"]
+RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "WGS_Deep", "ChIP", "CUT&RUN","Amplicon"]
+# these projects will only need demux stats
+DEMUX_ONLY = ["SMARTSeq", "Chromium", "10X_Genomics", "Visium"]
+
 # Organisms to have DRAGEN BAMS
 DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"]
 # this list contains the headers of the columns.  we will access the data using these listings
@@ -38,7 +42,8 @@ def launch_metrics(self, all_samples, run, project_directory):
 		work_directory = "{}/{}/".format(parent_directory, run)
 		rna_directory = "{}RNA/".format(work_directory)
 		dragen_directory = "{}DRAGEN/".format(work_directory)
-
+		stats_done_directory = "/igo/stats/DONE/{}/".format(run.split("_")[0])
+		print(stats_done_directory)
 		# create work directory	
 		pathlib.Path(work_directory).mkdir(parents = True, exist_ok = True)
 
@@ -59,6 +64,14 @@ def launch_metrics(self, all_samples, run, project_directory):
 			# test to see if there are some samples that this script will not process
 			if any(s in sample.recipe for s in DO_NOT_PROCESS):
 				continue
+
+			if any(s in sample.recipe for s in DEMUX_ONLY):
+				demux_report_file = "/igo/staging/FASTQ/{}/Reports/Demultiplex_Stats.csv".format(run)
+				demux_reads_per_sample = scripts.get_total_reads_from_demux.get_total_reads([sample.sample_id], demux_report_file)
+				print(demux_reads_per_sample)
+				scripts.get_total_reads_from_demux.write_to_am_txt(run, sample.sample_id, demux_reads_per_sample[sample.sample_id], stats_done_directory)
+				continue
+
 			# grab the sample parameters (bait set, type, gtag, etc)
 			sample_parameters = self.get_parameters(sample.genome, sample.recipe)
 			# process the RNA data seperately
@@ -72,7 +85,7 @@ def launch_metrics(self, all_samples, run, project_directory):
 				self.dragen(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list)
 				continue
 			# check for methylated samples
-			if ((sample.recipe == "MethylCaptureSeq") or (sample.recipe == "WholeGenomeBisulfiteSequencing")):
+			if ("Methyl" in sample.recipe):
 				pathlib.Path(dragen_directory).mkdir(parents = True, exist_ok = True)
 				self.dragen_methylation(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list)
 				continue
@@ -125,15 +138,15 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo
 
 		# get the correct path for the reference
 		if (sample_parameters["GTAG"] == "GRCh38"):
-			rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559"
+			rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1"
 		else:
-			rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"])
+			rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"])
 
 		rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run)
 
 
-		launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
-		bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
+		launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
+		bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
 		print(bsub_launch_dragen_rna)
 		call(bsub_launch_dragen_rna, shell = True)
 
@@ -162,13 +175,13 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas
 
 		# get the correct path for the reference
 		if (sample_parameters["GTAG"] == "GRCh38"):
-			dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559"
+			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1"
 		else:
-			dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"])
+			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"])
 
 		metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
-		launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
-		bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
+		launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
+		bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
 		print(bsub_launch_dragen)
 		call(bsub_launch_dragen, shell = True)
 
@@ -205,13 +218,13 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di
 
 		# get the correct path for the reference
 		if (sample_parameters["GTAG"] == "GRCh38"):
-			dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_methylated"
+			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38_methylated"
 		else:
-			dragen_path = "/igo/work/igo/dragen_hash_tables/grcm39_methylated"
+			dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/grcm39_methylated"
 
 		metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
-		launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
-		bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation)
+		launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
+		bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation)
 		print(bsub_launch_dragen)
 		call(bsub_launch_dragen, shell = True)
 
@@ -287,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory):
 
 
 
-
+