Merge pull request #47 from WGLab/46-add-rrms-support

Add RRMS support
WGLab · Sep 22, 2023 · c197d94 · c197d94
2 parents 10833e6 + 907f9ec
commit c197d94
Show file tree

Hide file tree

Showing 14 changed files with 400 additions and 306 deletions.
diff --git a/.gitignore b/.gitignore
@@ -27,4 +27,4 @@ SampleData
 *.log
 
 # Testing scripts
-tests/SCRIPTS.txt
+scripts/
diff --git a/README.md b/README.md
@@ -7,14 +7,10 @@
 LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file formats for quick generation of QC data in HTML and text format.
 
 ## Software requirements
-Please refer to `environment.yml` for detail. For your quick reference, LongReadSum needs
-```
-  - python=3.9
-  - hdf5
-  - htslib
-  - swig
-  - matplotlib
-```
+Please refer to the conda
+[environment.yml](https://github.com/WGLab/LongReadSum/blob/main/environment.yml)
+file for all required packages.
+
 # Installation using Anaconda (Linux 64-bit)
 First, install [Anaconda](https://www.anaconda.com/).
 LongReadSum can be installed using the following command:
@@ -71,34 +67,20 @@ export HDF5_PLUGIN_PATH=/full/path/to/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/h
 
 
 ## Running
-Activate the conda environment:
-
-`conda activate lrst_py39`
-
-To test that you are using the correct Python interpreter, run:
-
-`which python`
-
-This should point to the environment's Python interpreter path:
-
-`~/miniconda3/envs/lrst_py39/bin/python`
-
-If the path is incorrect, export its location to `PATH`:
-
-`export PATH=~/miniconda3/envs/lrst_py39/bin:$PATH`
-
-Then you can run LongReadSum using the following command:
-
-`python /path/to/LongReadSum [arguments]`
+Activate the conda environment and then run with arguments:
+```
+conda activate longreadsum
+python longreadsum [arguments]
+```
 
 # General Usage
 
 Specifying input files:
 
 ```
-usage: LongReadSum [-h] {fa,fq,f5,seqtxt,bam} ...
+usage: longreadsum [-h] {fa,fq,f5,f5s,seqtxt,bam,rrms} ...
 
-QC tools for long-read sequencing data
+Fast and comprehensive QC for long read sequencing data.
 
 positional arguments:
   {fa,fq,f5,seqtxt,bam}
@@ -108,16 +90,23 @@ positional arguments:
     f5s                 FAST5 file input with signal statistics output    
     seqtxt              sequencing_summary.txt input
     bam                 BAM file input
+    rrms                RRMS BAM file input
 
 optional arguments:
   -h, --help            show this help message and exit
 
 Example with single inputs:
-	python LongReadSum bam -i path/to/input.bam -o /output_directory/
+	longreadsum bam -i input.bam -o output_directory -t 12
 
 Example with multiple inputs:
-	python LongReadSum bam -I "path/to/input1.bam, path/to/input2.bam" -o /output_directory/
-	python LongReadSum bam -P "path/to/*.bam" -o /output_directory/
+	longreadsum bam -I input1.bam, input2.bam -o output_directory
+	longreadsum bam -P *.bam -o output_directory
+
+RRMS example:
+  longreadsum rrms --csv rrms_results.csv --input input.bam --output output_directory --threads 12
+
+FAST5 signal mode example:
+  longreadsum f5s --input input.fast5 --output output_directory
 ```
 
 # Revision history

diff --git a/__main__.py b/__main__.py
@@ -1,23 +1,8 @@
-"""
-__main__.py:
-Call the command-line interface.
-"""
-
-import os
+# __main__.py: Call the command-line interface.
 
 from src import cli
 
 
-from os.path import dirname, abspath
-
-# Get the parent directory
-parent_dir = dirname(dirname(abspath(__file__)))
-
-# # Set the HDF5 plugin path
-# os.environ['HDF5_PLUGIN_PATH'] = os.path.join(parent_dir, "lib")
-print("HDF5_PLUGIN_PATH is " + os.environ.get('HDF5_PLUGIN_PATH', ''))
-
-
 def main():
     cli.main()
 

diff --git a/include/bam_module.h b/include/bam_module.h
@@ -16,8 +16,13 @@ class BAM_Module{
     std::map<std::string, bool> secondary_alignment;
     std::map<std::string, bool> supplementary_alignment;
 
+    int run(Input_Para& input_params, Output_BAM& final_output);
     int calculateStatistics(Input_Para& input_params, Output_BAM& final_output);
     static void batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex);
+
+    // RRMS
+    // Read the RRMS CSV file and store the read IDs (accepted or rejected)
+    std::unordered_set<std::string> readRRMSFile(std::string rrms_csv_file, bool accepted_reads);
 };
 
 #endif
diff --git a/include/hts_reader.h b/include/hts_reader.h
@@ -35,7 +35,7 @@ class HTSReader {
         int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution);
 
         // Read the next batch of records from the BAM file
-        int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex);
+        int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids);
 
         // Return if the reader has finished reading the BAM file
         bool hasNextRecord();

diff --git a/include/input_parameters.h b/include/input_parameters.h
@@ -8,6 +8,7 @@ Define the Python bindings from our C++ modules
 
 #include <vector>
 #include <string>
+#include <unordered_set>  // For RRMS read ID filtering
 #define MAX_INPUT_FILES 2048
 
 
@@ -28,6 +29,9 @@ class Input_Para{
     std::string output_folder;  // Output folder
     std::string input_files[MAX_INPUT_FILES];  // Input files
     std::string read_ids;  // Read IDs comma-separated (FAST5 signal module)
+    std::string rrms_csv;  // CSV file with accepted/rejected read IDs (RRMS module)
+    bool rrms_filter;  // Generate RRMS stats for accepted (true) or rejected (false) reads
+    std::unordered_set<std::string> rrms_read_ids;  // List of read IDs from RRMS CSV file (accepted or rejected)
 
     // Functions
     std::string add_input_file(const std::string& _ip_file);

diff --git a/include/output_data.h b/include/output_data.h
@@ -14,9 +14,8 @@ Define the output structures for each module.
 #include "input_parameters.h"
 
 #define MAX_READ_LENGTH 10485760
-#define MAX_MAP_QUALITY 256
-#define MAX_BASE_QUALITY 256
-#define MAX_READ_QUALITY 256
+#define MAX_BASE_QUALITY 100
+#define MAX_READ_QUALITY 100
 #define MAX_SIGNAL_VALUE 5000
 #define PERCENTAGE_ARRAY_SIZE 101
 #define ZeroDefault 0

diff --git a/src/bam_module.cpp b/src/bam_module.cpp
@@ -4,14 +4,50 @@ Class for generating BAM file statistics. Records are accessed using multi-threa
 */
 
 #include <iostream>
+#include <fstream>
+#include <string>
 #include <thread>
 #include <iostream>
 #include <cmath>
+#include <unordered_set>
 
 #include "bam_module.h"
 
+// Run the BAM module
+int BAM_Module::run(Input_Para &input_params, Output_BAM &final_output)
+{
+    int exit_code = 0;
+
+    // Determine if RRMS read ID filtering is required
+    if (input_params.rrms_csv != ""){
+        std::cout << "RRMS read ID filtering enabled" << std::endl;
+        std::cout << "RRMS CSV file: " << input_params.rrms_csv << std::endl;
+
+        // Determine if RRMS stats should be generated for accepted or rejected
+        // reads
+        if (input_params.rrms_filter){
+            std::cout << "RRMS stats will be generated for accepted reads" << std::endl;
+        } else {
+            std::cout << "RRMS stats will be generated for rejected reads" << std::endl;
+        }
+
+        // Read the RRMS CSV file and store the read IDs
+        std::cout << "Reading RRMS CSV file..." << std::endl;
+        std::unordered_set<std::string> rrms_read_ids = readRRMSFile(input_params.rrms_csv, input_params.rrms_filter);
+        std::cout << "Number of read IDs = " << rrms_read_ids.size() << std::endl;
+
+        // Store the read IDs in the input parameters
+        input_params.rrms_read_ids = rrms_read_ids;
+    }
 
-int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_output){
+    // Calculate statistics
+    exit_code = calculateStatistics(input_params, final_output);
+
+    return exit_code;
+}
+
+int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_output)
+{
     int exit_code = 0;
     auto relapse_start_time = std::chrono::high_resolution_clock::now();
 
@@ -108,7 +144,13 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
 
     // Save the summary statistics to a file
     std::cout << "Saving summary statistics to file..." << std::endl;
-    std::string summary_filepath = input_params.output_folder + "/bam_summary.txt";
+
+    // If in RRMS mode, append RRMS accepted/rejected to the output prefix
+    std::string output_prefix = "bam";
+    if (input_params.rrms_csv != ""){
+        output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected";
+    } 
+    std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt";
     final_output.save_summary(summary_filepath, input_params, final_output);
     std::cout << "Saved file: " << summary_filepath << std::endl;
 
@@ -120,13 +162,89 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
 
 void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex)
 {
-    Output_BAM record_output;  // Output for the current batch
-
     // Read the next N records
-    reader.readNextRecords(batch_size, record_output, bam_mutex);
+    Output_BAM record_output;
+    reader.readNextRecords(batch_size, record_output, bam_mutex, input_params.rrms_read_ids);
 
     // Update the final output
     output_mutex.lock();
     final_output.add(record_output);
     output_mutex.unlock();
 }
+
+std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads)
+{
+    // Create an unordered set to store the read IDs for fast lookup
+    std::unordered_set<std::string> rrms_read_ids;
+
+    // Open the file
+    std::ifstream rrms_file(rrms_csv_file);
+
+    // Read the header and find the 'read_id' and 'decision' columns
+    std::string header;
+    std::vector<std::string> header_fields;
+    std::getline(rrms_file, header);
+    std::stringstream ss(header);
+    std::string field;
+    // std::cout << "RRMS CSV header:" << std::endl;
+    while (std::getline(ss, field, ',')){
+        header_fields.push_back(field);
+        // std::cout << field << std::endl;
+    }
+
+    // Find the 'read_id' and 'decision' columns
+    int read_id_index = -1;
+    int decision_index = -1;
+    for (size_t i=0; i<header_fields.size(); i++){
+        if (header_fields[i] == "read_id"){
+            read_id_index = i;
+        } else if (header_fields[i] == "decision"){
+            decision_index = i;
+        }
+    }
+
+    // Exit if the read_id or decision columns are not found
+    if (read_id_index == -1){
+        std::cerr << "Error: 'read_id' column not found in RRMS CSV file" << std::endl;
+        exit(1);
+    }
+
+    if (decision_index == -1){
+        std::cerr << "Error: 'decision' column not found in RRMS CSV file" << std::endl;
+        exit(1);
+    }
+
+    // Read all rows in the file and store the read IDs if the decision is
+    // 'stop_receiving' for accepted, or 'unblock' for rejected reads.
+    std::string pattern = accepted_reads ? "stop_receiving" : "unblock";
+    std::string line;
+    while (std::getline(rrms_file, line)){
+        std::vector<std::string> fields;
+        std::string field;
+        std::stringstream ss(line);
+        while (std::getline(ss, field, ',')){
+            fields.push_back(field);
+        }
+
+        // Get the read ID and decision
+        std::string read_id = fields[read_id_index];
+        std::string decision = fields[decision_index];
+
+        // Store the read ID if the decision matches the pattern
+        if (decision == pattern){
+            rrms_read_ids.insert(read_id);
+        }
+    }
+
+
+    // Close the file
+    rrms_file.close();
+
+    // // Print the first 10 read IDs
+    // std::cout << "First 10 read IDs:" << std::endl;
+    // for (int i=0; i<10; i++){
+    //     std::cout << rrms_read_ids[i] << std::endl;
+    // }
+
+    return rrms_read_ids;
+}