fix: fixed check for user bowtie index (#11)

DavideBrex · Jan 24, 2024 · 4609a1c · 4609a1c
1 parent e6af464
commit 4609a1c
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -66,7 +66,7 @@ To obtain the Snakemake workflow, you can:
     1. Create a new github repository using this workflow [as a template](https://help.github.com/en/articles/creating-a-repository-from-a-template).
     2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the newly created repository to your local system, in the folder where you want to perform the data analysis.
 
-- Download the source code as zip file from this page (code button)
+- Download the source code as a zip file from the latest [version](https://github.com/DavideBrex/SpikeFlow/releases).
 
 
 The usage of this workflow is also described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=DavideBrex%2FSpikeFlow).
@@ -221,7 +221,7 @@ First, the singularity container will be pulled from DockerHub and then the work
 
 To execute the pipeline on a HPC cluster, please follow [these guidelines](https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#cluster-execution).
 
-If you are using **Snakemake version $\ge$ 8**, the comman line arguments have [different names](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#containerization-of-conda-based-workflows). In this case, run the workflow with:
+If you are using **Snakemake version $\ge$ 8**, the command line arguments have [different names](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#containerization-of-conda-based-workflows). In this case, run the workflow with:
 
 ```bash
 snakemake --cores --software-deployment-method conda apptainer

diff --git a/workflow/rules/callPeaks.smk b/workflow/rules/callPeaks.smk
@@ -22,7 +22,7 @@ rule macs2_callNarrowPeak:
         + " --gsize "
         + str(config["params"]["deeptools"]["effective_genome_length"])
         + " --pvalue "
-        + config["params"]["peakCalling"]["macs2"]["pvalue"]
+        + str(config["params"]["peakCalling"]["macs2"]["pvalue"])
         + " --keep-dup all",
     benchmark:
         "{}results/.benchmarks/{{sample}}.macs2.benchmark.txt".format(outdir)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -124,6 +124,40 @@ wildcard_constraints:
 
 # -------------------- Sample sheet Sanity checks function ---------------#
 def perform_checks(input_df):
+    def check_index_files(folder_path, prefix):
+        """Validate that a user-supplied bowtie index is complete.
+
+        Args:
+            folder_path: directory expected to contain the index files. An
+                empty string (what os.path.dirname returns for a prefix
+                without a directory part) is treated as the current directory.
+            prefix: basename shared by all the index files.
+
+        Raises:
+            FileNotFoundError: if folder_path is not an existing directory,
+                or if any of the six expected .ebwt files is missing from it.
+        """
+        # File names that must all be present for the given prefix
+        expected_files = [
+            "{}.1.ebwt",
+            "{}.2.ebwt",
+            "{}.3.ebwt",
+            "{}.4.ebwt",
+            "{}.rev.1.ebwt",
+            "{}.rev.2.ebwt",
+        ]
+        # The callers pass os.path.dirname(index), which is "" when the index
+        # prefix has no directory component; look in the current directory
+        # instead of reporting a non-existent folder
+        folder_path = folder_path or "."
+        # isdir rather than exists: a path pointing to a regular file is
+        # reported here instead of making os.listdir fail further down
+        if not os.path.isdir(folder_path):
+            raise FileNotFoundError(
+                "The genome index folder {} does not exist. \nPlease check that the folder is present and contains the indexing files".format(
+                    folder_path
+                )
+            )
+        # Collect the expected index files that are absent from the folder
+        files_in_directory = set(os.listdir(folder_path))
+        missing_files = [
+            file_pattern.format(prefix)
+            for file_pattern in expected_files
+            if file_pattern.format(prefix) not in files_in_directory
+        ]
+        # Report the missing files by name so the user can see what to fix
+        if missing_files:
+            message = (
+                "It appears that the genome index folder you provided is missing one/more indexing files: {}."
+                "\nPlease check that the index prefix is correct and the index files are present in {}"
+            ).format(", ".join(missing_files), folder_path)
+            raise FileNotFoundError(message)
+
+    # config file header
     header = [
         "sample",
         "replicate",
@@ -203,18 +237,16 @@ def perform_checks(input_df):
 
     # 6. in case an index is provided for the ref genome (different than ""), check whether it actually exists
     if config["resources"]["ref"]["index"] != "":
-        if not os.path.exists(os.path.dirname(config["resources"]["ref"]["index"])):
-            raise FileNotFoundError(
-                "The provided path to the reference genome index does not exist. \nPlease check that the folder is present and contains the indexing files"
-            )
+        check_index_files(
+            os.path.dirname(config["resources"]["ref"]["index"]),
+            os.path.basename(config["resources"]["ref"]["index"]),
+        )
     # same for spike
     if config["resources"]["ref_spike"]["index_spike"] != "":
-        if not os.path.exists(
-            os.path.dirname(config["resources"]["ref_spike"]["index_spike"])
-        ):
-            raise FileNotFoundError(
-                "The provided path to the spike genome index does not exist. \nPlease check that the folder is present and contains the indexing files"
-            )
+        check_index_files(
+            os.path.dirname(config["resources"]["ref_spike"]["index_spike"]),
+            os.path.basename(config["resources"]["ref_spike"]["index_spike"]),
+        )
     # 7. check if the chromosome sizes file exists and if the blacklist file exists
     if not os.path.exists(config["params"]["peakCalling"]["chrom_sizes"]):
         raise FileNotFoundError(