Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/mskcc/igo-demux
Browse files Browse the repository at this point in the history
  • Loading branch information
dmcmanam committed Jul 3, 2024
2 parents cc240c1 + e512fe7 commit 92a4a6e
Show file tree
Hide file tree
Showing 22 changed files with 1,539 additions and 549 deletions.
8 changes: 4 additions & 4 deletions SampleSheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def split_sample_sheet(self):
if sample sheet recipes have mixed DLP and other all DLP need to go on a separate sample sheet named "_DLP"
"""
# if 10x DRAGEN demux add to header CreateFastqForIndexReads,1,,,,,,,
if any("10X_" in s for s in self.recipe_set):
if any("SC_Chromium" in s for s in self.recipe_set):
print("Adding CreateFastqForIndexReads,1 to sample sheet header since 10X samples are present")
self.df_ss_header.loc[len(self.df_ss_header.index)-1] = ["CreateFastqForIndexReads",1,"","","","","","",""]
self.df_ss_header.loc[len(self.df_ss_header.index)] = ["[Data]","","","","","","","",""]
Expand All @@ -111,12 +111,12 @@ def split_sample_sheet(self):
split_ss_list = [ss_copy, self]

was_split = False
if "DLP" in self.recipe_set and len(self.recipe_set) > 1:
if "SC_DLP" in self.recipe_set and len(self.recipe_set) > 1:
print("Copying all DLP samples to a new sample sheet")
# copy all DLP rows to a new sample sheet
dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == True].copy()
dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == True].copy()
# and remove DLP samples from the main sample sheet
self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == False].copy()
self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == False].copy()
# rename DLP sample sheet w/"_DLP.csv"
dlp_path = os.path.splitext(self.path)[0]+'_DLP.csv'
header_copy = self.df_ss_header.copy(deep=True)
Expand Down
1 change: 1 addition & 0 deletions deliver_pipeline_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
def deliver(ds, **kwargs):
project = kwargs["params"]["project"]
pi = kwargs["params"]["pi"]
# recipe here is actually request name
recipe = kwargs["params"]["recipe"]
print("Delivering the pipeline output and/or .bams for {} {} {}".format(project, pi, recipe))

Expand Down
32 changes: 15 additions & 17 deletions demux_run_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import subprocess
from datetime import datetime, timedelta

from numpy import equal
import pandas
from SampleSheet import SampleSheet
import scripts.organise_fastq_split_by_lane
Expand Down Expand Up @@ -67,7 +66,7 @@ def demux(ds, **kwargs):

# check if the sample sheet contains DLP project
is_DLP = False
if "DLP" in sample_sheet.recipe_set:
if "SC_DLP" in sample_sheet.recipe_set:
is_DLP = True
dragen_demux = True

Expand Down Expand Up @@ -108,7 +107,7 @@ def demux(ds, **kwargs):
def get_dlp_chip(samplesheet, project):
samplesheet.df_ss_data.reset_index()
for index, row in samplesheet.df_ss_data.iterrows():
if row['Sample_Well'] == 'DLP' and project == row['Sample_Project']:
if row['Sample_Well'] == 'SC_DLP' and project == row['Sample_Project']:
# return chip from 071PP_DLP_UNSORTED_128624A_13_12_IGO_09443_CU_1_1_121
sample = row['Sample_ID']
return get_dlp_chip_from_sample_name(sample)
Expand Down Expand Up @@ -136,7 +135,7 @@ def stats(ds, **kwargs):
if "REFERENCE" in samplesheet_path:
return "No stats for reference " + samplesheet_path

if "DLP" in sample_sheet.recipe_set:
if "SC_DLP" in sample_sheet.recipe_set:
scripts.get_total_reads_from_demux.run_DLP(sample_sheet, sequencer_and_run)
scripts.upload_stats.upload_stats(sequencer_and_run)

Expand Down Expand Up @@ -170,15 +169,18 @@ def stats(ds, **kwargs):

return "DLP stats posted and yaml file generated"

if any("10X_" in s for s in sample_sheet.recipe_set):
# check if the run is 10X by read length
atac, use_bases_mask = scripts.get_sequencing_read_data.main(sequencer_path)
print("read length: {}".format(use_bases_mask))
if use_bases_mask == [29, 89] or atac:
# if this is an ATAC run, demux uses cellranger mkfastq
if scripts.get_sequencing_read_data.main(sequencer_path)[0]:
if atac:
scripts.get_total_reads_from_demux.by_json(sequencer_and_run)
scripts.upload_stats.upload_stats(sequencer_and_run)

# launch cell ranger based on recipe
sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix)
scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix)

else:
# step 1, generate txt files containing total reads and upload to qc website
Expand All @@ -188,10 +190,10 @@ def stats(ds, **kwargs):
# step 2, start cell ranger based on recipe/barcode, check whether multiple fastq files exist
# trim sequencer_and_run if a postfix like _10X exists
sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3])
scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix)
scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix)

# add a DONE file once all the 10X pipeline jobs have finished; -K makes bsub wait until the job finishes
cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/stats/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix)
cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/staging/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix)
print(cmd)
subprocess.run(cmd, shell=True)

Expand All @@ -212,7 +214,7 @@ def stats(ds, **kwargs):

def fingerprinting(ds, **kwargs):
# read in sample sheet as arguments, filter out projects that need to run fingerprinting
recipe_list_for_fp = [".*IMPACT*", ".*Heme*", "IDT_Exome*", "WholeExomeSequencing", "Twist_Exome", "MSK-ACCESS*", "CMO-CH", "HumanWholeGenome"]
recipe_list_for_fp = ["PED-PEG", "WGS_Deep", "HC_IMPACT", "HC_IMPACT-Heme", "HC_ACCESS", "WES_Human", "HC_CMOCH"]
# call fingerprinting_dag.py for each project
samplesheet_path = kwargs["params"]["samplesheet"]

Expand All @@ -226,13 +228,9 @@ def fingerprinting(ds, **kwargs):
project_list_to_run = []
for project, recipe in sample_sheet.project_dict.items():
# fingerprinting only supports human projects
if project_genome_dict[project] == "Human":
for recipe_list_item in recipe_list_for_fp:
print(project, recipe)
expr = re.compile(recipe_list_item)
if expr.match(recipe):
project_list_to_run.append(project)
break
if project_genome_dict[project] == "Human" and recipe in recipe_list_for_fp:
project_list_to_run.append(project)

print("Projects need to run fp: {}".format(project_list_to_run))
if len(project_list_to_run) == 0:
return "No project need to run fingerprinting"
Expand Down
47 changes: 30 additions & 17 deletions scripts/LaunchMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@
import shutil
import pathlib
import scripts.generate_run_params
import scripts.get_total_reads_from_demux


# Global Variable : we do not want to process these experiments in this script
DO_NOT_PROCESS = ["10X_Genomics", "DLP"]
DO_NOT_PROCESS = ["SC_DLP"]
# These recipes will be evaluated using DRAGEN because of their larger size of fastqs
RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"]
RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "WGS_Deep", "ChIP", "CUT&RUN","Amplicon"]
# these projects will only need demux stats
DEMUX_ONLY = ["SMARTSeq", "Chromium", "10X_Genomics", "Visium"]

# Organisms to have DRAGEN BAMS
DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"]
# this list contains the headers of the columns. we will access the data using these listings
Expand All @@ -38,7 +42,8 @@ def launch_metrics(self, all_samples, run, project_directory):
work_directory = "{}/{}/".format(parent_directory, run)
rna_directory = "{}RNA/".format(work_directory)
dragen_directory = "{}DRAGEN/".format(work_directory)

stats_done_directory = "/igo/stats/DONE/{}/".format(run.split("_")[0])
print(stats_done_directory)
# create work directory
pathlib.Path(work_directory).mkdir(parents = True, exist_ok = True)

Expand All @@ -59,6 +64,14 @@ def launch_metrics(self, all_samples, run, project_directory):
# test to see if there are some samples that this script will not process
if any(s in sample.recipe for s in DO_NOT_PROCESS):
continue

if any(s in sample.recipe for s in DEMUX_ONLY):
demux_report_file = "/igo/staging/FASTQ/{}/Reports/Demultiplex_Stats.csv".format(run)
demux_reads_per_sample = scripts.get_total_reads_from_demux.get_total_reads([sample.sample_id], demux_report_file)
print(demux_reads_per_sample)
scripts.get_total_reads_from_demux.write_to_am_txt(run, sample.sample_id, demux_reads_per_sample[sample.sample_id], stats_done_directory)
continue

# grab the sample parameters (bait set, type, gtag, etc)
sample_parameters = self.get_parameters(sample.genome, sample.recipe)
# process the RNA data separately
Expand All @@ -72,7 +85,7 @@ def launch_metrics(self, all_samples, run, project_directory):
self.dragen(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list)
continue
# check for methylated samples
if ((sample.recipe == "MethylCaptureSeq") or (sample.recipe == "WholeGenomeBisulfiteSequencing")):
if ("Methyl" in sample.recipe):
pathlib.Path(dragen_directory).mkdir(parents = True, exist_ok = True)
self.dragen_methylation(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list)
continue
Expand Down Expand Up @@ -125,15 +138,15 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo

# get the correct path for the reference
if (sample_parameters["GTAG"] == "GRCh38"):
rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559"
rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1"
else:
rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"])
rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"])

rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run)


launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory)
bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna)
print(bsub_launch_dragen_rna)
call(bsub_launch_dragen_rna, shell = True)

Expand Down Expand Up @@ -162,13 +175,13 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas

# get the correct path for the reference
if (sample_parameters["GTAG"] == "GRCh38"):
dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559"
dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1"
else:
dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"])
dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"])

metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen)
print(bsub_launch_dragen)
call(bsub_launch_dragen, shell = True)

Expand Down Expand Up @@ -205,13 +218,13 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di

# get the correct path for the reference
if (sample_parameters["GTAG"] == "GRCh38"):
dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_methylated"
dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38_methylated"
else:
dragen_path = "/igo/work/igo/dragen_hash_tables/grcm39_methylated"
dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/grcm39_methylated"

metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"])
launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation)
launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id)
bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation)
print(bsub_launch_dragen)
call(bsub_launch_dragen, shell = True)

Expand Down Expand Up @@ -287,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory):





Loading

0 comments on commit 92a4a6e

Please sign in to comment.