From cc1d3d4c34108856e7c9d8c045f9e725e3dda85a Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 26 Jun 2024 04:05:02 +0800 Subject: [PATCH] add support for unmmapped bams (#98) * add support for unmmapped bams * fix _validate_bam_file for ubam * Add tests to make sure proper calls made with unmapped option --------- Co-authored-by: Yash Patel --- CHANGELOG.md | 4 +- pipeval/validate/__main__.py | 4 +- pipeval/validate/validate.py | 1 + pipeval/validate/validate_types.py | 2 +- pipeval/validate/validators/bam.py | 16 +++++-- test/unit/test_validate.py | 68 +++++++++++++++++++++++++++--- 6 files changed, 81 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6039e50..c80fb43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] ### Added - Add validation flowchart - +- Add support for unmapped BAM --- ## [5.0.0] - 2024-02-16 @@ -72,7 +72,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Fixed -### Removed +### Removed - Remove deprecated parameter options from README - Remove inaccessible design doc link from README - Remove directory checking diff --git a/pipeval/validate/__main__.py b/pipeval/validate/__main__.py index 11f67b8..24b6df7 100644 --- a/pipeval/validate/__main__.py +++ b/pipeval/validate/__main__.py @@ -16,7 +16,7 @@ def positive_integer(arg): def add_subparser_validate(subparsers:argparse._SubParsersAction): """ Parse arguments """ - parser = subparsers.add_parser( + parser:argparse.ArgumentParser = subparsers.add_parser( name = 'validate', help = 'Validate one or more file(s)', description = 'Validate one or more file(s)', @@ -26,6 +26,8 @@ def add_subparser_validate(subparsers:argparse._SubParsersAction): parser.add_argument('path', help='One or more paths of files to validate', type=str, nargs='+') parser.add_argument('-r', '--cram-reference', default=None, \ help='Path to reference file for CRAM') + parser.add_argument('-u', '--unmapped-bam', action='store_true', + help='Input is unmmapped BAM.') parser.add_argument('-p', '--processes', type=positive_integer, default=1, \ help='Number of processes to run in parallel when validating multiple files') parser.add_argument('-t', '--test-integrity', action='store_true', \ diff --git a/pipeval/validate/validate.py b/pipeval/validate/validate.py index a54c1ad..52ffbbe 100644 --- a/pipeval/validate/validate.py +++ b/pipeval/validate/validate.py @@ -47,6 +47,7 @@ def _validate_file( `args` must contain the following: `path` is a required argument with a value of list of files `cram_reference` is a required argument with either a string value or None + `unmapped_bam` is a required argument of boolean variable ''' _path_exists(path) diff --git a/pipeval/validate/validate_types.py b/pipeval/validate/validate_types.py index 1c91e1d..2773fea 100644 --- a/pipeval/validate/validate_types.py +++ b/pipeval/validate/validate_types.py @@ -3,5 +3,5 @@ ValidateArgs = namedtuple( 'args', - 'path, cram_reference, processes, test_integrity' + 'path, cram_reference, unmapped_bam, processes, test_integrity' ) diff --git a/pipeval/validate/validators/bam.py b/pipeval/validate/validators/bam.py index 52749aa..a14aca2 100644 --- a/pipeval/validate/validators/bam.py +++ b/pipeval/validate/validators/bam.py @@ -6,14 +6,22 @@ from pipeval.validate.validate_types import ValidateArgs -def _validate_bam_file(path:Path): +def _validate_bam_file(path:Path, unmapped_bam:bool): '''Validates bam file''' + args = [str(path)] + if unmapped_bam: + args.append('-u') try: - pysam.quickcheck(str(path)) + pysam.quickcheck(*args) except pysam.SamtoolsError as err: raise ValueError("samtools bam check failed. " + str(err)) from err - bam_head: pysam.IteratorRowHead = pysam.AlignmentFile(str(path)).head(1) + kwargs = { + 'filename': str(path) + } + if unmapped_bam: + kwargs['check_sq'] = False + bam_head: pysam.IteratorRowHead = pysam.AlignmentFile(**kwargs).head(1) if next(bam_head, None) is None: raise ValueError("pysam bam check failed. No reads in " + str(path)) @@ -35,5 +43,5 @@ def _check_bam(path:Path, args:Union[ValidateArgs,Dict[str, Union[str,list]]]): `args` must contains the following: `cram_reference` is a required key with either a string value or None ''' - _validate_bam_file(path) + _validate_bam_file(path, args.unmapped_bam) _check_bam_index(path) diff --git a/test/unit/test_validate.py b/test/unit/test_validate.py index 86a6128..818843c 100644 --- a/test/unit/test_validate.py +++ b/test/unit/test_validate.py @@ -2,7 +2,7 @@ # pylint: disable=C0114 from pathlib import Path from argparse import Namespace, ArgumentTypeError -from unittest.mock import Mock, mock_open +from unittest.mock import Mock, mock_open, MagicMock import warnings import zlib import gzip @@ -110,7 +110,13 @@ def test__path_exists__errors_for_non_existing_path(mock_path): @mock.patch('pipeval.validate.files.Path', autospec=True) def test__check_compressed__raises_warning_for_uncompressed_path(mock_path, mock_magic): mock_magic.return_value = 'text/plain' - test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False) + test_args = ValidateArgs( + path=[], + cram_reference=None, + unmapped_bam=False, + processes=1, + test_integrity=False + ) with pytest.warns(UserWarning): _check_compressed(mock_path, test_args) @@ -132,7 +138,13 @@ def test__check_compressed__passes_compression_check( compression_mime): mock_magic.return_value = compression_mime mock_integrity.return_value = None - test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False) + test_args = ValidateArgs( + path=[], + cram_reference=None, + unmapped_bam=False, + processes=1, + test_integrity=False + ) with warnings.catch_warnings(): warnings.filterwarnings("error") @@ -147,14 +159,46 @@ def test__validate_bam_file__empty_bam_file(mock_pysam): test_path = Path('empty/valid/bam') with pytest.raises(ValueError): - _validate_bam_file(test_path) + _validate_bam_file(test_path, unmapped_bam=False) + +@mock.patch('pipeval.validate.validators.bam.pysam') +def test__validate_bam_file__quickcheck_called_with_unmapped(mock_pysam): + mock_alignment_file = Mock() + mock_alignment_file.head.return_value = iter(['read1']) + mock_quickcheck = Mock() + + mock_pysam.quickcheck = mock_quickcheck + mock_pysam.AlignmentFile.return_value = mock_alignment_file + + test_path = 'empty/valid/bam' + test_unmapped_option = '-u' + + _validate_bam_file(test_path, unmapped_bam=True) + mock_quickcheck.assert_called_with(test_path, test_unmapped_option) + +@mock.patch('pipeval.validate.validators.bam.pysam') +def test__validate_bam_file__alignmentfile_called_with_unmapped(mock_pysam): + mock_alignment_file = MagicMock() + mock_alignment_file.__iter__.return_value = ['read1'] + mock_quickcheck = Mock() + + mock_pysam.quickcheck = mock_quickcheck + mock_pysam.AlignmentFile = mock_alignment_file + + test_path = 'empty/valid/bam' + test_unmapped_option = False + + _validate_bam_file(test_path, unmapped_bam=True) + mock_alignment_file.assert_called_with( + **{'filename': test_path, 'check_sq': test_unmapped_option} + ) @mock.patch('pipeval.validate.validators.bam.Path', autospec=True) def test__validate_bam_file__quickcheck_fails(mock_path): mock_path.exists.return_value = False with pytest.raises(ValueError): - _validate_bam_file(mock_path) + _validate_bam_file(mock_path, unmapped_bam=False) @mock.patch('pipeval.validate.validators.bam.pysam', autospec=True) def test__check_bam_index__no_index_file_error(mock_pysam): @@ -247,7 +291,13 @@ def test__validate_vcf_file__passes_vcf_validation(mock_call): _validate_vcf_file('some/file') def test__run_validate__passes_validation_no_files(): - test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False) + test_args = ValidateArgs( + path=[], + cram_reference=None, + unmapped_bam=False, + processes=1, + test_integrity=False + ) run_validate(test_args) @pytest.mark.parametrize( @@ -273,6 +323,7 @@ def test___validation_worker__fails_with_failing_checks( test_args = ValidateArgs( path=[test_path], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False) mock_path_resolve.return_value = test_path @@ -292,6 +343,7 @@ def test__run_validate__passes_on_all_valid_files( test_args = ValidateArgs( path=[test_path], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False) @@ -309,6 +361,7 @@ def test__run_validate__fails_with_failing_file( test_args = ValidateArgs( path=[test_path], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False) expected_code = 1 @@ -352,6 +405,7 @@ def test__validate_file__checks_compression( test_args = ValidateArgs( path=[], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False) @@ -369,6 +423,7 @@ def test__run_validate__fails_on_unresolvable_symlink(mock_path_resolve): test_args = ValidateArgs( path=[test_path], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False) @@ -394,6 +449,7 @@ def test___validation_worker__passes_proper_validation( test_args = ValidateArgs( path=[test_path], cram_reference=None, + unmapped_bam=False, processes=1, test_integrity=False)