Skip to content

Commit

Permalink
Merge branch 'release/0.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshah committed Apr 9, 2020
2 parents 9f3b524 + 79e8301 commit d11a72b
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 14 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Which have the following sub-commands:
* **generate**: To run GetBaseCountMultiSample on given BAM files
* **merge**: To merge MAF format files w.r.t counts generated from the `generate` command.
* **all**: This will run both of the sub-commands above, `generate` and `merge`, together.
* **multiple-patient**: This will run the `all` sub-command for each patient listed in the provided metadata file.

**Please read the USAGE** (https://genotype-variants.readthedocs.io/en/latest/usage.html) **section of the documentation for more information**

Expand Down
54 changes: 54 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Which have the following sub-commands:
* `generate`_: To run GetBaseCountMultiSample on given BAM files
* `merge`_: To merge MAF format files w.r.t counts generated from the `generate` command.
* `all`_: This will run both of the sub-commands above, `generate` and `merge`, together.
* `multiple-patient`_: This will run the `all` sub-command for each patient listed in the provided metadata file.

generate
--------
Expand Down Expand Up @@ -201,6 +202,59 @@ Expected Output
Please refer to the `generate` and `merge` usage for the expected output.


multiple-patient
----------------

To use `small_variants multiple-patient` via command line here are the options::

genotype_variants small_variants multiple-patient --help
Usage: genotype_variants small_variants multiple-patient [OPTIONS]

Command that helps to generate genotyped MAF and merge the genotyped MAF
for multiple patients. the output file will be labelled with patient
identifier as prefix

Expected header of metadata_file in any order: patient_id maf standard_bam
duplex_bam simplex_bam

For maf, standard_bam, duplex_bam and simplex_bam please include full path
to the file.

Options:
-i, --input-metadata PATH Full path to metadata file in TSV/EXCEL
format, with following headers: patient_id,
maf, standard_bam, duplex_bam, simplex_bam.
Make sure to use full paths inside the
metadata file [required]
-r, --reference-fasta PATH Full path to reference file in FASTA format
[required]
-g, --gbcms-path PATH Full path to GetBaseCountMultiSample
executable with fragment support [required]
-fd, --filter-duplicate INTEGER
Filter duplicate parameter for
GetBaseCountMultiSample
-fc, --fragment-count INTEGER Fragment Count parameter for
GetBaseCountMultiSample
-mapq, --mapping-quality INTEGER
Mapping quality for GetBaseCountMultiSample
-t, --threads INTEGER Number of threads to use for
GetBaseCountMultiSample
-v, --verbosity LVL Either CRITICAL, ERROR, WARNING, INFO or
DEBUG
--help Show this message and exit.

.. code-block:: console

    genotype_variants small_variants multiple-patient \
    -i /path/to/input_metadata \
    -r /path/to/reference_fasta \
    -g /path/to/GetBaseCountsMultiSample

Expected Output
"""""""""""""""

Please refer to the `generate` and `merge` usage for the expected output.

To use genotype_variants in a project::

import genotype_variants
2 changes: 1 addition & 1 deletion genotype_variants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Ronak Shah"""
__email__ = 'rons.shah@gmail.com'
__version__ = '0.1.0'
__version__ = '0.2.0'
2 changes: 1 addition & 1 deletion genotype_variants/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Ronak Shah"""
__email__ = 'rons.shah@gmail.com'
__version__ = '0.1.0'
__version__ = '0.2.0'
198 changes: 195 additions & 3 deletions genotype_variants/commands/small_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ def generate(
"""Command that helps to generate genotyped MAF,
the output file will be labelled with
patient identifier as prefix"""
logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
pid = os.getpid()
logger_file = "genotype_variants_" + str(pid) + ".log"
logger_output = pathlib.Path.cwd().joinpath(logger_file)
fh = logging.FileHandler(logger_output)
formatter = logging.Formatter(
fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
Expand Down Expand Up @@ -375,7 +377,9 @@ def merge(
the program will generate merged genotypes as well.
The output file will be based on the give alphanumeric patient identifier as prefix.
"""
logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
pid = os.getpid()
logger_file = "genotype_variants_" + str(pid) + ".log"
logger_output = pathlib.Path.cwd().joinpath(logger_file)
fh = logging.FileHandler(logger_output)
formatter = logging.Formatter(
fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
Expand Down Expand Up @@ -600,7 +604,9 @@ def all(
the output file will be labelled with
patient identifier as prefix
"""
logger_output = pathlib.Path.cwd().joinpath("genotype_variants.log")
pid = os.getpid()
logger_file = "genotype_variants_" + str(pid) + ".log"
logger_output = pathlib.Path.cwd().joinpath(logger_file)
fh = logging.FileHandler(logger_output)
formatter = logging.Formatter(
fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
Expand Down Expand Up @@ -639,4 +645,190 @@ def all(
logger.info("Elapsed time: %.1f [min]" % ((t1_stop - t1_start) / 60))
logger.info("CPU process time: %.1f [min]" % ((t2_stop - t2_start) / 60))
logger.info("--------------------------------------------------")
return final_file

def _read_patient_metadata(input_metadata):
    """Load the patient metadata table from an Excel or TSV file.

    Excel parsing is attempted first; the file is re-read as tab-separated
    text only when the Excel reader fails. The process exits with status 1
    when neither reader can parse the file.
    """
    try:
        return pd.read_excel(input_metadata)
    except Exception as excel_error:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are never mistaken for "this is not an Excel file".
        logger.warning(
            "genotype_variants:small_variants:multiple_patient:: could not read to EXCEL file, due to error: %s",
            excel_error,
        )
        logger.warning(
            "genotype_variants:small_variants:multiple_patient:: Assuming its as TSV file"
        )
    try:
        return pd.read_csv(input_metadata, sep="\t", header="infer")
    except Exception as tsv_error:
        logger.error(
            "genotype_variants:small_variants:multiple_patient:: could not read TSV file, due to error: %s. Please fix and rerun the script",
            tsv_error,
        )
        sys.exit(1)


def _optional_bam(metadata, row_label, column):
    """Return the BAM path stored in ``column`` for ``row_label``, or None.

    None is returned when the cell is empty or when the stored path does not
    name an existing file; the second case is logged so that a mistyped path
    is not dropped silently.
    """
    bam_path = metadata.at[row_label, column]
    if pd.isnull(bam_path):
        return None
    if pathlib.Path(bam_path).is_file():
        return bam_path
    logger.warning(
        "genotype_variants::small_variants::multiple_patient:: %s is present but the path is invalid: %s",
        column,
        bam_path,
    )
    return None


@cli.command()
@click.option(
    "-i",
    "--input-metadata",
    required=True,
    type=click.Path(exists=True),
    help="Full path to metadata file in TSV/EXCEL format, with following headers: patient_id, maf, standard_bam, duplex_bam, simplex_bam. Make sure to use full paths inside the metadata file",
)
@click.option(
    "-r",
    "--reference-fasta",
    required=True,
    type=click.Path(exists=True),
    help="Full path to reference file in FASTA format",
)
@click.option(
    "-g",
    "--gbcms-path",
    required=True,
    type=click.Path(exists=True),
    help="Full path to GetBaseCountMultiSample executable with fragment support",
)
@click.option(
    "-fd",
    "--filter-duplicate",
    required=False,
    default=0,
    type=click.INT,
    help="Filter duplicate parameter for GetBaseCountMultiSample",
)
@click.option(
    "-fc",
    "--fragment-count",
    required=False,
    default=1,
    type=click.INT,
    help="Fragment Count parameter for GetBaseCountMultiSample",
)
@click.option(
    "-mapq",
    "--mapping-quality",
    required=False,
    default=20,
    type=click.INT,
    help="Mapping quality for GetBaseCountMultiSample",
)
@click.option(
    "-t",
    "--threads",
    required=False,
    default=1,
    type=click.INT,
    help="Number of threads to use for GetBaseCountMultiSample",
)
@click_log.simple_verbosity_option(logger)
def multiple_patient(
    input_metadata,
    reference_fasta,
    gbcms_path,
    filter_duplicate,
    fragment_count,
    mapping_quality,
    threads,
):
    """
    Command that helps to generate genotyped MAF and
    merge the genotyped MAF for multiple patients.
    the output file will be labelled with
    patient identifier as prefix

    Expected header of metadata_file in any order:
    patient_id
    maf
    standard_bam
    duplex_bam
    simplex_bam

    For maf, standard_bam, duplex_bam and simplex_bam please include full path to the file.
    """
    # NOTE: the docstring above is rendered by Click as the --help text, so
    # developer-facing notes are kept in comments instead.
    #
    # Flow: attach a per-process log file, load the metadata table, then for
    # every row validate the inputs and delegate to the `all` sub-command.
    # Any invalid row aborts the whole run with exit status 1.
    pid = os.getpid()
    logger_file = "genotype_variants_" + str(pid) + ".log"
    logger_output = pathlib.Path.cwd().joinpath(logger_file)
    fh = logging.FileHandler(logger_output)
    formatter = logging.Formatter(
        fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info(
        "========================================================================================"
    )
    logger.info(
        ">>> Running genotype_variants for small variants to generate genotypes and merge MAF <<<"
    )
    logger.info(
        "========================================================================================="
    )
    t1_start = time.perf_counter()
    t2_start = time.process_time()

    metadata = _read_patient_metadata(input_metadata)

    # Fail with a readable message instead of a KeyError traceback when the
    # sheet does not carry every column the loop below reads.
    required_columns = ("patient_id", "maf", "standard_bam", "duplex_bam", "simplex_bam")
    missing_columns = [name for name in required_columns if name not in metadata.columns]
    if missing_columns:
        logger.error(
            "genotype_variants::small_variants::multiple_patient:: metadata file is missing required column(s): %s",
            ", ".join(missing_columns),
        )
        sys.exit(1)

    for row_label in metadata.index:
        # The MAF is mandatory and must point at an existing file.
        input_maf = metadata.at[row_label, "maf"]
        if pd.isnull(input_maf):
            logger.error("genotype_variants::small_variants::multiple_patient:: Maf file to genotype variants is not present and is required.")
            sys.exit(1)
        if not pathlib.Path(input_maf).is_file():
            logger.error("genotype_variants::small_variants::multiple_patient:: Maf file to genotype variants is present but the path is invalid. Please provide a valid path")
            sys.exit(1)

        # The standard BAM is optional; an empty cell is only reported.
        if pd.isnull(metadata.at[row_label, "standard_bam"]):
            logger.info("genotype_variants::small_variants::multiple_patient:: Standard BAM file to genotype variants is not present.")
        standard_bam = _optional_bam(metadata, row_label, "standard_bam")

        # Duplex and simplex BAMs are looked up the same way, but both must
        # resolve to real files for the row to be processed.
        duplex_bam = _optional_bam(metadata, row_label, "duplex_bam")
        simplex_bam = _optional_bam(metadata, row_label, "simplex_bam")
        if duplex_bam and simplex_bam:
            logger.info("genotype_variants::small_variants::multiple_patient:: duplex_bam and simplex_bam are present for genotype variants.")
        else:
            logger.error("genotype_variants::small_variants::multiple_patient:: duplex_bam and simplex_bam are not present for genotype variants! Please provide both of them to run genotype_variants.")
            sys.exit(1)

        patient_id = metadata.at[row_label, "patient_id"]
        if pd.isnull(patient_id):
            logger.error("genotype_variants:small_variants:multiple_patient:: Patient Id is not a string, please check input metadata file and try again.")
            sys.exit(1)

        logger.info("genotype_variants:small_variants::multiple_patient:: %s is being processed", patient_id)
        # `all` here is the sibling Click command (it shadows the builtin);
        # `.callback` is the undecorated function, called positionally in the
        # same order as before.
        # NOTE(review): `all` appears to attach its own FileHandler on every
        # call, which would duplicate log lines across patients - confirm.
        all.callback(
            input_maf,
            reference_fasta,
            gbcms_path,
            patient_id,
            standard_bam,
            duplex_bam,
            simplex_bam,
            filter_duplicate,
            fragment_count,
            mapping_quality,
            threads,
        )

    t1_stop = time.perf_counter()
    t2_stop = time.process_time()
    logger.info("--------------------------------------------------")
    logger.info("Elapsed time: %.1f [min]" % ((t1_stop - t1_start) / 60))
    logger.info("CPU process time: %.1f [min]" % ((t2_stop - t2_start) / 60))
    logger.info("--------------------------------------------------")
    return
10 changes: 5 additions & 5 deletions genotype_variants/create_all_maf_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ def create_all_maf_dataframe(

try:
df_s["t_total_count_reverse_standard"] = (
df_s["t_total_count_standard"] - df_ds["t_total_count_forward_standard"]
df_s["t_total_count_standard"] - df_s["t_total_count_forward_standard"]
)
df_s["t_ref_count_reverse_standard"] = (
df_s["t_total_count_reverse_standard"]
- df_ds["t_ref_count_forward_standard"]
- df_s["t_ref_count_forward_standard"]
)
df_s["t_alt_count_reverse_standard"] = (
df_s["t_total_count_reverse_standard"]
- df_ds["t_alt_count_forward_standard"]
- df_s["t_alt_count_forward_standard"]
)
logger.debug(
"genotype:variants:small_variants::create_all_maf_dataframe:: Successfully generated reverse count columns in standard data frame"
Expand All @@ -130,7 +130,7 @@ def create_all_maf_dataframe(
exit(1)

try:
df_d["Tumor_Sample_Barcode"] = df_d["Tumor_Sample_Barcode"].str.replace(
df_s["Tumor_Sample_Barcode"] = df_s["Tumor_Sample_Barcode"].str.replace(
"-STANDARD", ""
)
logger.debug(
Expand All @@ -144,7 +144,7 @@ def create_all_maf_dataframe(
)

try:
df_d.set_index(mutation_key, drop=False, inplace=True)
df_s.set_index(mutation_key, drop=False, inplace=True)
logger.debug(
"genotype:variants:small_variants:create_all_maf_dataframe:: Successfully reset the index for standard data frame"
)
Expand Down
1 change: 1 addition & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ twine==3.1.1
Click==7.0
click-log==0.3.2
pandas==1.0.0
xlrd==1.2.0
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.0
current_version = 0.2.0
commit = True
tag = True

Expand All @@ -18,5 +18,4 @@ universal = 1
exclude = docs

[aliases]
# Define setup.py command aliases here

4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
with open('HISTORY.rst') as history_file:
history = history_file.read()

requirements = ['Click>=7.0', 'click_log>=0.3.2', 'pandas>=1.0.0']
requirements = ['Click>=7.0', 'click_log>=0.3.2', 'pandas>=1.0.0', 'xlrd>=1.2.0']

setup_requirements = [ ]

Expand Down Expand Up @@ -48,6 +48,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/rhshah/genotype_variants',
version='0.1.0',
version='0.2.0',
zip_safe=False,
)

0 comments on commit d11a72b

Please sign in to comment.