Merge branch 'release/0.2.0'

msk-access · Apr 9, 2020 · d11a72b · d11a72b
2 parents 9f3b524 + 79e8301
commit d11a72b
Show file tree

Hide file tree

Showing 9 changed files with 261 additions and 14 deletions.
diff --git a/README.rst b/README.rst
@@ -35,6 +35,7 @@ Which have the following sub-commands:
 * **generate**: To run GetBaseCountMultiSample on given BAM files
 * **merge**: To merge MAF format files w.r.t counts generated from the `generate` command.
 * **all**: This will run both of the sub-commands above, `generate` and `merge`, together.
+* **multiple-patient**: This will run the `all` sub-command for each patient listed in the provided metadata file.
 
 **Please read the USAGE** (https://genotype-variants.readthedocs.io/en/latest/usage.html) **section of the documentation for more information**
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -11,6 +11,7 @@ Which have the following sub-commands:
 * `generate`_: To run GetBaseCountMultiSample on given BAM files
 * `merge`_: To merge MAF format files w.r.t counts generated from the `generate` command.
 * `all`_: This will run both of the sub-commands above, `generate` and `merge`, together.
+* `multiple-patient`_: This will run the `all` sub-command for each patient listed in the provided metadata file.
 
 generate
 --------
@@ -201,6 +202,59 @@ Expected Output
     Please refer to the `generate` and `merge` usage for the expected output.
 
 
+multiple-patient
+----------------
+
+To use `small_variants multiple-patient` via command line here are the options::
+
+    genotype_variants small_variants multiple-patient --help
+    Usage: genotype_variants small_variants multiple-patient [OPTIONS]
+
+    Command that helps to generate genotyped MAF and  merge the genotyped MAF
+    for multiple patients. the output file will be labelled with  patient
+    identifier as prefix
+
+    Expected header of metadata_file in any order: patient_id maf standard_bam
+    duplex_bam simplex_bam
+
+    For maf, standard_bam, duplex_bam and simplex_bam please include full path
+    to the file.
+
+    Options:
+    -i, --input-metadata PATH       Full path to metadata file in TSV/EXCEL
+                                    format, with following headers: patient_id,
+                                    maf, standard_bam, duplex_bam, simplex_bam.
+                                    Make sure to use full paths inside the
+                                    metadata file  [required]
+    -r, --reference-fasta PATH      Full path to reference file in FASTA format
+                                    [required]
+    -g, --gbcms-path PATH           Full path to GetBaseCountMultiSample
+                                    executable with fragment support  [required]
+    -fd, --filter-duplicate INTEGER
+                                    Filter duplicate parameter for
+                                    GetBaseCountMultiSample
+    -fc, --fragment-count INTEGER   Fragment Count parameter for
+                                    GetBaseCountMultiSample
+    -mapq, --mapping-quality INTEGER
+                                    Mapping quality for GetBaseCountMultiSample
+    -t, --threads INTEGER           Number of threads to use for
+                                    GetBaseCountMultiSample
+    -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or
+                                    DEBUG
+    --help                          Show this message and exit.
+
+.. code-block:: console 
+    
+    genotype_variants small_variants multiple-patient \
+    -i /path/to/input_metadata \
+    -r /path/to/reference_fasta \
+    -g /path/to/GetBaseCountsMultiSample
+
+Expected Output
+"""""""""""""""
+
+    Please refer to the `generate` and `merge` usage for the expected output.
+
 To use genotype_variants in a project::
 
     import genotype_variants
diff --git a/genotype_variants/__init__.py b/genotype_variants/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = """Ronak Shah"""
 __email__ = 'rons.shah@gmail.com'
-__version__ = '0.1.0'
+__version__ = '0.2.0'
diff --git a/genotype_variants/commands/__init__.py b/genotype_variants/commands/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = """Ronak Shah"""
 __email__ = 'rons.shah@gmail.com'
-__version__ = '0.1.0'
+__version__ = '0.2.0'
diff --git a/genotype_variants/commands/small_variants.py b/genotype_variants/commands/small_variants.py
@@ -156,7 +156,9 @@ def generate(
     """Command that helps to generate genotyped MAF,
     the output file will be labelled with
     patient identifier as prefix"""
-    logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
+    pid = os.getpid()
+    logger_file = "genotype_variants_" + str(pid) + ".log"
+    logger_output = pathlib.Path.cwd().joinpath(logger_file)
     fh = logging.FileHandler(logger_output)
     formatter = logging.Formatter(
         fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -375,7 +377,9 @@ def merge(
     the program will generate merged genotypes as well.
     The output file will be based on the give alphanumeric patient identifier as prefix.
     """
-    logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
+    pid = os.getpid()
+    logger_file = "genotype_variants_" + str(pid) + ".log"
+    logger_output = pathlib.Path.cwd().joinpath(logger_file)
     fh = logging.FileHandler(logger_output)
     formatter = logging.Formatter(
         fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -600,7 +604,9 @@ def all(
     the output file will be labelled with 
     patient identifier as prefix
     """
-    logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
+    pid = os.getpid()
+    logger_file = "genotype_variants_" + str(pid) + ".log"
+    logger_output = pathlib.Path.cwd().joinpath(logger_file)
     fh = logging.FileHandler(logger_output)
     formatter = logging.Formatter(
         fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -639,4 +645,190 @@ def all(
     logger.info("Elapsed time: %.1f [min]" % ((t1_stop - t1_start) / 60))
     logger.info("CPU process time: %.1f [min]" % ((t2_stop - t2_start) / 60))
     logger.info("--------------------------------------------------")
+    return final_file
+
+# Columns every metadata file must provide (in any order).
+_METADATA_COLUMNS = ("patient_id", "maf", "standard_bam", "duplex_bam", "simplex_bam")
+
+
+def _read_metadata(input_metadata):
+    """Load the metadata table, trying EXCEL first and falling back to TSV.
+
+    Returns the parsed ``pandas.DataFrame``. Logs an error and exits with
+    status 1 when the file can be read in neither format.
+    """
+    try:
+        # Return immediately on success so a valid EXCEL file is never
+        # re-parsed (and rejected) as TSV.
+        return pd.read_excel(input_metadata)
+    except Exception as excel_error:
+        logger.warning(
+            "genotype_variants:small_variants:multiple_patient:: could not read to EXCEL file, due to error: %s",
+            excel_error,
+        )
+        logger.warning(
+            "genotype_variants:small_variants:multiple_patient:: Assuming its as TSV file"
+        )
+    try:
+        return pd.read_csv(input_metadata, sep="\t", header="infer")
+    except Exception as tsv_error:
+        logger.error(
+            "genotype_variants:small_variants:multiple_patient:: could not read TSV file, due to error: %s. Please fix and rerun the script",
+            tsv_error,
+        )
+        sys.exit(1)
+
+
+def _optional_bam(value, column):
+    """Return ``value`` when it names an existing file, otherwise ``None``.
+
+    A null cell yields ``None`` silently; a non-null value that is not a
+    file yields ``None`` and logs a warning naming the metadata column.
+    """
+    if pd.isnull(value):
+        return None
+    if pathlib.Path(value).is_file():
+        return value
+    logger.warning(
+        "genotype_variants::small_variants::multiple_patient:: %s path is not a valid file and will be ignored: %s",
+        column,
+        value,
+    )
+    return None
+
+
+@cli.command()
+@click.option(
+    "-i",
+    "--input-metadata",
+    required=True,
+    type=click.Path(exists=True),
+    help="Full path to metadata file in TSV/EXCEL format, with following headers: patient_id, maf, standard_bam, duplex_bam, simplex_bam. Make sure to use full paths inside the metadata file",
+)
+@click.option(
+    "-r",
+    "--reference-fasta",
+    required=True,
+    type=click.Path(exists=True),
+    help="Full path to reference file in FASTA format",
+)
+@click.option(
+    "-g",
+    "--gbcms-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Full path to GetBaseCountMultiSample executable with fragment support",
+)
+@click.option(
+    "-fd",
+    "--filter-duplicate",
+    required=False,
+    default=0,
+    type=click.INT,
+    help="Filter duplicate parameter for GetBaseCountMultiSample",
+)
+@click.option(
+    "-fc",
+    "--fragment-count",
+    required=False,
+    default=1,
+    type=click.INT,
+    help="Fragment Count parameter for GetBaseCountMultiSample",
+)
+@click.option(
+    "-mapq",
+    "--mapping-quality",
+    required=False,
+    default=20,
+    type=click.INT,
+    help="Mapping quality for GetBaseCountMultiSample",
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=1,
+    type=click.INT,
+    help="Number of threads to use for GetBaseCountMultiSample",
+)
+@click_log.simple_verbosity_option(logger)
+def multiple_patient(
+    input_metadata,
+    reference_fasta,
+    gbcms_path,
+    filter_duplicate,
+    fragment_count,
+    mapping_quality,
+    threads,
+):
+    """
+    Command that helps to generate genotyped MAF and 
+    merge the genotyped MAF for multiple patients.
+    the output file will be labelled with 
+    patient identifier as prefix
+
+    Expected header of metadata_file in any order:
+    patient_id
+    maf
+    standard_bam
+    duplex_bam
+    simplex_bam
+    
+    For maf, standard_bam, duplex_bam and simplex_bam please include full path to the file.
+    """
+    # The docstring above is emitted verbatim as the command's --help text,
+    # so implementation notes are kept in comments instead.
+    #
+    # Flow: attach a per-process log file, load the metadata table, then run
+    # the `all` command once per row. Any invalid row logs an error and
+    # exits with status 1; rows processed before it are not rolled back.
+    # Returns None.
+    pid = os.getpid()
+    logger_file = "genotype_variants_" + str(pid) + ".log"
+    logger_output = pathlib.Path.cwd().joinpath(logger_file)
+    fh = logging.FileHandler(logger_output)
+    formatter = logging.Formatter(
+        fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+    )
+    fh.setFormatter(formatter)
+    logger.addHandler(fh)
+    logger.info(
+        "========================================================================================"
+    )
+    logger.info(
+        ">>> Running genotype_variants for small variants to generate genotypes and merge MAF <<<"
+    )
+    logger.info(
+        "========================================================================================="
+    )
+    t1_start = time.perf_counter()
+    t2_start = time.process_time()
+    metadata = _read_metadata(input_metadata)
+    # Fail with a readable message instead of a KeyError traceback when the
+    # header is incomplete.
+    missing_columns = [
+        column for column in _METADATA_COLUMNS if column not in metadata.columns
+    ]
+    if missing_columns:
+        logger.error(
+            "genotype_variants:small_variants:multiple_patient:: metadata file is missing required column(s): %s",
+            ", ".join(missing_columns),
+        )
+        sys.exit(1)
+    for _, row in metadata.iterrows():
+        # The MAF is mandatory and must point at an existing file.
+        if pd.isnull(row["maf"]):
+            logger.error("genotype_variants::small_variants::multiple_patient:: Maf file to genotype variants is not present and is required.")
+            sys.exit(1)
+        if not pathlib.Path(row["maf"]).is_file():
+            logger.error("genotype_variants::small_variants::multiple_patient:: Maf file to genotype variants is present but the path is invalid. Please provide a valid path")
+            sys.exit(1)
+        input_maf = row["maf"]
+        # The standard BAM is optional; duplex and simplex are both required.
+        standard_bam = _optional_bam(row["standard_bam"], "standard_bam")
+        if standard_bam is None:
+            logger.info("genotype_variants::small_variants::multiple_patient:: Standard BAM file to genotype variants is not present.")
+        duplex_bam = _optional_bam(row["duplex_bam"], "duplex_bam")
+        simplex_bam = _optional_bam(row["simplex_bam"], "simplex_bam")
+        if duplex_bam and simplex_bam:
+            logger.info("genotype_variants::small_variants::multiple_patient:: duplex_bam and simplex_bam are present for genotype variants.")
+        else:
+            logger.error("genotype_variants::small_variants::multiple_patient:: duplex_bam and simplex_bam are not present for genotype variants! Please provide both of them to run genotype_variants.")
+            sys.exit(1)
+        if pd.isnull(row["patient_id"]):
+            logger.error("genotype_variants:small_variants:multiple_patient:: Patient Id is not a string, please check input metadata file and try again.")
+            sys.exit(1)
+        patient_id = row["patient_id"]
+        logger.info("genotype_variants:small_variants::multiple_patient:: %s is being processed", patient_id)
+        # Invoke the undecorated `all` command function directly.
+        # NOTE(review): `all` appears to attach its own FileHandler on every
+        # call, so log lines may be duplicated as patients accumulate - confirm.
+        all.callback(
+            input_maf,
+            reference_fasta,
+            gbcms_path,
+            patient_id,
+            standard_bam,
+            duplex_bam,
+            simplex_bam,
+            filter_duplicate,
+            fragment_count,
+            mapping_quality,
+            threads,
+        )
+    t1_stop = time.perf_counter()
+    t2_stop = time.process_time()
+    logger.info("--------------------------------------------------")
+    logger.info("Elapsed time: %.1f [min]", (t1_stop - t1_start) / 60)
+    logger.info("CPU process time: %.1f [min]", (t2_stop - t2_start) / 60)
+    logger.info("--------------------------------------------------")
     return
diff --git a/genotype_variants/create_all_maf_dataframe.py b/genotype_variants/create_all_maf_dataframe.py
@@ -108,15 +108,15 @@ def create_all_maf_dataframe(
 
         try:
             df_s["t_total_count_reverse_standard"] = (
-                df_s["t_total_count_standard"] - df_ds["t_total_count_forward_standard"]
+                df_s["t_total_count_standard"] - df_s["t_total_count_forward_standard"]
             )
             df_s["t_ref_count_reverse_standard"] = (
                 df_s["t_total_count_reverse_standard"]
-                - df_ds["t_ref_count_forward_standard"]
+                - df_s["t_ref_count_forward_standard"]
             )
             df_s["t_alt_count_reverse_standard"] = (
                 df_s["t_total_count_reverse_standard"]
-                - df_ds["t_alt_count_forward_standard"]
+                - df_s["t_alt_count_forward_standard"]
             )
             logger.debug(
                 "genotype:variants:small_variants::create_all_maf_dataframe:: Successfully generated reverse count columns in standard data frame"
@@ -130,7 +130,7 @@ def create_all_maf_dataframe(
             exit(1)
 
         try:
-            df_d["Tumor_Sample_Barcode"] = df_d["Tumor_Sample_Barcode"].str.replace(
+            df_s["Tumor_Sample_Barcode"] = df_s["Tumor_Sample_Barcode"].str.replace(
                 "-STANDARD", ""
             )
             logger.debug(
@@ -144,7 +144,7 @@ def create_all_maf_dataframe(
             )
 
         try:
-            df_d.set_index(mutation_key, drop=False, inplace=True)
+            df_s.set_index(mutation_key, drop=False, inplace=True)
             logger.debug(
                 "genotype:variants:small_variants:create_all_maf_dataframe:: Successfully reset the index for standard data frame"
             )

diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -10,3 +10,4 @@ twine==3.1.1
 Click==7.0
 click-log==0.3.2
 pandas==1.0.0
+xlrd==1.2.0
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.0
+current_version = 0.2.0
 commit = True
 tag = True
 
@@ -18,5 +18,4 @@ universal = 1
 exclude = docs
 
 [aliases]
-# Define setup.py command aliases here
 
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 with open('HISTORY.rst') as history_file:
     history = history_file.read()
 
-requirements = ['Click>=7.0', 'click_log>=0.3.2', 'pandas>=1.0.0']
+requirements = ['Click>=7.0', 'click_log>=0.3.2', 'pandas>=1.0.0', 'xlrd>=1.2.0']
 
 setup_requirements = [ ]
 
@@ -48,6 +48,6 @@
     test_suite='tests',
     tests_require=test_requirements,
     url='https://github.com/rhshah/genotype_variants',
-    version='0.1.0',
+    version='0.2.0',
     zip_safe=False,
 )