diff --git a/hail/python/hail/methods/impex.py b/hail/python/hail/methods/impex.py index 9dc23f031e3b..d1ba2b134fea 100644 --- a/hail/python/hail/methods/impex.py +++ b/hail/python/hail/methods/impex.py @@ -1,3 +1,4 @@ +from typing import Dict, Callable, Optional import os import re from collections import defaultdict @@ -8,9 +9,9 @@ import hail as hl from hail import ir -from hail.expr import StructExpression, LocusExpression, \ - expr_array, expr_float64, expr_str, expr_numeric, expr_call, expr_bool, \ - expr_int32, to_expr, analyze +from hail.expr import (StructExpression, LocusExpression, expr_array, expr_float64, expr_str, + expr_numeric, expr_call, expr_bool, expr_int32, to_expr, analyze, Expression, + DictExpression, expr_any) from hail.expr.types import hail_type, tarray, tfloat64, tstr, tint32, tstruct, \ tcall, tbool, tint64, tfloat32 from hail.genetics.reference_genome import reference_genome_type @@ -18,8 +19,8 @@ from hail.matrixtable import MatrixTable from hail.methods.misc import require_biallelic, require_row_key_variant, require_col_key_str from hail.table import Table -from hail.typecheck import typecheck, nullable, oneof, dictof, anytype, \ - sequenceof, enumeration, sized_tupleof, numeric, table_key_type, char +from hail.typecheck import (typecheck, nullable, oneof, dictof, anytype, sequenceof, enumeration, + sized_tupleof, numeric, table_key_type, char, func_spec) from hail.utils import new_temp_file from hail.utils.deduplicate import deduplicate from hail.utils.java import Env, FatalError, jindexed_seq_args, warning @@ -2628,6 +2629,9 @@ def get_vcf_metadata(path): return Env.backend().parse_vcf_metadata(path) + +array_elements_required_nonce = object() + @typecheck(path=oneof(str, sequenceof(str)), force=bool, force_bgz=bool, @@ -2637,7 +2641,7 @@ def get_vcf_metadata(path): call_fields=oneof(str, sequenceof(str)), reference_genome=nullable(reference_genome_type), contig_recoding=nullable(dictof(str, str)), - array_elements_required=bool, + array_elements_required=oneof(bool, enumeration(array_elements_required_nonce)), skip_invalid_loci=bool, entry_float_type=enumeration(tfloat32, tfloat64), filter=nullable(str), @@ -2645,7 +2649,8 @@ def get_vcf_metadata(path): n_partitions=nullable(int), block_size=nullable(int), _create_row_uids=bool, - _create_col_uids=bool) + _create_col_uids=bool, + disambiguate_single_dot=nullable(dictof(str, func_spec(1, expr_any)))) def import_vcf(path, force=False, force_bgz=False, @@ -2655,14 +2660,17 @@ def import_vcf(path, call_fields=['PGT'], reference_genome='default', contig_recoding=None, - array_elements_required=True, + array_elements_required=array_elements_required_nonce, skip_invalid_loci=False, entry_float_type=tfloat64, filter=None, find_replace=None, n_partitions=None, block_size=None, - _create_row_uids=False, _create_col_uids=False, + *, + _create_row_uids=False, + _create_col_uids=False, + disambiguate_single_dot: Optional[Dict[str, Callable[[Expression], Expression]]] = None, ) -> MatrixTable: """Import VCF file(s) as a :class:`.MatrixTable`. @@ -2779,12 +2787,11 @@ def import_vcf(path, All contigs must be present in the `reference_genome`, so this is useful for mapping differently-formatted data onto known references. array_elements_required : :obj:`bool` - If ``True``, all elements in an array field must be present. Set this - parameter to ``False`` for Hail to allow array fields with missing - values such as ``1,.,5``. In this case, the second element will be - missing. However, in the case of a single missing element ``.``, the - entire field will be missing and **not** an array with one missing - element. + DEPRECATED: use `disambiguate_single_dot` instead. If ``True``, all elements in an array + field must be present. Set this parameter to ``False`` for Hail to allow array fields with + missing values such as ``1,.,5``. In this case, the second element will be missing. However, + in the case of a single missing element ``.``, the entire field will be missing and **not** + an array with one missing element. skip_invalid_loci : :obj:`bool` If ``True``, skip loci that are not consistent with `reference_genome`. entry_float_type: :class:`.HailType` @@ -2803,23 +2810,134 @@ def import_vcf(path, are specified, `n_partitions` will be used. block_size : :obj:`int`, optional Block size, in MB. Default: 128MB blocks. + disambiguate_single_dot : :obj:`dict` of (:class:`str`, :class:`Callable`[[:class:`Expression`], :class:`Expression]), optional + User-specified expressions to interpret the meaning of an INFO or FORMAT array-field whose + value is ".". If specified, a dictionary mapping from INFO and/or FORMAT fields to + functions. Whenever a "." is encountered, the function for that field is called with the + current row as an argument. The returned value is used as the value of the field. If there + is no value for that field, an error is raised. Returns ------- :class:`.MatrixTable` + """ if force: - hl.utils.warning( + warning( f'You are trying to read {path} with *ONE* core of parallelism. This ' 'will be very slow. If this file is block-gzipped (bgzip-ed), use ' 'force_bgz=True instead.' ) + if array_elements_required is array_elements_required_nonce: + array_elements_required = False + else: + if array_elements_required is False: + warning( + f'"array_elements_required=False is deprecated. Please use "disambiguate_single_dot" instead.' + ) + + if disambiguate_single_dot is not None: + raise ValueError( + 'Do not provide "array_elements_required" when providing ' + '"disambiguate_single_dot". Provide only "disambiguate_single_dot".' + ) + + disambiguate_single_dot = disambiguate_single_dot or {} + reader = ir.MatrixVCFReader(path, call_fields, entry_float_type, header_file, n_partitions, block_size, min_partitions, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, force_bgz, force, filter, find_replace) - return MatrixTable(ir.MatrixRead(reader, drop_cols=drop_samples, drop_row_uids=not _create_row_uids, drop_col_uids=not _create_col_uids)) + mt = MatrixTable( + ir.MatrixRead( + reader, + drop_cols=drop_samples, + drop_row_uids=not _create_row_uids, + drop_col_uids=not _create_col_uids + ) + ) + + if array_elements_required is True: + return mt.select_globals() + + def disambiguate_field(field: str, value: Expression, attributes: DictExpression) -> Expression: + disambiguator = disambiguate_single_dot.get(field) + + if field in {'AF', 'CIGAR', 'EC'}: + # VCFs may incorrectly encode these as ".". The VCF spec requires A. + field_number = 'A' + elif field in {'AD', 'ADF', 'ADR'}: + # VCFs may incorrectly encode these as ".". The VCF spec requires R. + field_number = 'R' + elif field in {'GL', 'GP', 'PL', 'PP'}: + # VCFs may incorrectly encode these as ".". The VCF spec requires G. + field_number = 'G' + else: + field_number = attributes[field]['Number'] + + if not isinstance(value.dtype, hl.tarray): + if disambiguator is not None: + raise ValueError( + f'Do not provide a disambiguator for the non-array INFO field {field}. It cannot be ambiguous.' + ) + return value + + isA = field_number == 'A' + isR = field_number == 'R' + isG = field_number == 'G' + isDot = field_number == '.' + n_alts = hl.len(mt.alleles) - 1 + is_ambiguous = hl.any( + hl.all(isA, n_alts == 1), + hl.all(isR, n_alts == 0), + hl.all(isG, n_alts == 0), + isDot + ) + + cb = ( + hl.case() + .when(hl.is_defined(value), value) + .when(~is_ambiguous, value) + ) + + if disambiguator is not None: + disambiguated_value = disambiguator(mt.row) + try: + return cb.default(disambiguated_value) + except TypeError as exc: + raise ValueError( + f'Disambiguator must have same type as field: field {field} had type {value.dtype} but disambiguator had type {disambiguated_value.dtype}.' + ) from exc + + return cb.or_error( + hl.format( + f'At row with key %s, INFO array field {field} had the value ".". ' + f'This value is ambiguous. It could either be a missing array value or ' + f'an array with one element which is a missing value. You must use the ' + f'"disambiguate_single_dot" parameter of "import_vcf" to specify how to ' + f'interpret "." for this field.', + mt.row_key + ) + ) + + mt = mt.annotate_rows(info=hl.Struct(**{ + row_field: disambiguate_field( + row_field, + mt.info[row_field], + mt.vcf_header.infoAttrs + ) + for row_field in mt.info + })) + mt = mt.annotate_entries(**{ + entry_field: disambiguate_field( + entry_field, + mt.entry[entry_field], + mt.vcf_header.formatAttrs + ) + for entry_field in mt.entry + }) + return mt.select_globals() @typecheck(path=expr_str, diff --git a/hail/python/test/hail/methods/test_impex.py b/hail/python/test/hail/methods/test_impex.py index 0ded82bc3668..114f3e1e5423 100644 --- a/hail/python/test/hail/methods/test_impex.py +++ b/hail/python/test/hail/methods/test_impex.py @@ -2176,3 +2176,122 @@ def test_import_export_same(i): assert mt._same(mt2) assert mt._same(mt3) + + +def test_import_vcf_13346(): + mt = hl.import_vcf(resource(f'issue_13346.vcf'), reference_genome='GRCh38') + actual = mt.rows().collect()[0] + expected = hl.Struct( + locus=hl.Locus('chr1', 10403, reference_genome='GRCh38'), + alleles=['ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC', 'A', 'ACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC'], + rsid=None, + qual=-10.0, + filters={'LowQual'} + , + info=hl.Struct( + AC=[1, 1], + AF=[0.250, 0.250], + AN=4, + AS_QUALapprox='0|23|45', + AS_VQSLOD=[None, None], + AS_YNG=[None, None], + QUALapprox=45, + ), + ) + assert actual == expected + + +def test_import_vcf_number0_field_disambiguator_is_error(): + with pytest.raises(ValueError, match='Do not provide a disambiguator for the non-array INFO field Number0. It cannot be ambiguous.'): + mt = hl.import_vcf( + resource(f'all_possible_ambiguous_info_array_fields.vcf'), + disambiguate_single_dot={'Number0': lambda x: 'hello'} + ) + + +def test_import_vcf_number1_field_disambiguator_is_error(): + with pytest.raises(ValueError, match='Do not provide a disambiguator for the non-array INFO field Number1. It cannot be ambiguous.'): + mt = hl.import_vcf( + resource(f'all_possible_ambiguous_info_array_fields.vcf'), + disambiguate_single_dot={'Number1': lambda x: 'hello'} + ) + + +def test_import_vcf_ambiguous_field_defaults_to_error(): + with pytest.raises(HailUserError, match='.*This value is ambiguous. It could either be a missing array value or.*'): + mt = hl.import_vcf(resource(f'all_possible_ambiguous_info_array_fields.vcf')) + print(mt.rows().collect()) + + +def test_import_vcf_disambiguator_changing_type_has_good_error_message(): + with pytest.raises(HailUserError, match='Disambiguator must have same type as field: field NumberA had type array but disambiguator had type int32.'): + mt = hl.import_vcf( + resource(f'all_possible_ambiguous_info_array_fields.vcf'), + disambiguate_single_dot={'NumberA': lambda x: 42} + ) + print(mt.rows().collect()) + + +def test_import_vcf_disambiguator_is_used_or_ambiguous_cases(): + mt = hl.import_vcf( + resource(f'all_possible_ambiguous_info_array_fields.vcf'), + disambiguate_single_dot=dict( + Number2=lambda _: ["Number2 is Ambiguous"], + NumberA=lambda _: ["NumberA is Ambiguous"], + NumberR=lambda _: ["NumberR is Ambiguous"], + NumberG=lambda _: ["NumberG is Ambiguous"], + NumberDot=lambda _: ["NumberDot is Ambiguous"], + ) + ) + one_alt, two_alts, no_ambiguities = mt.rows().collect() + + assert one_alt == hl.Struct( + locus=hl.Locus('1', 1), + alleles=['A', 'T'], + rsid=None, + qual=-10.0, + filters={'LowQual'}, + info=hl.Struct( + Number0=True, + Number1=None, + Number2=None, + NumberA=["NumberA is Ambiguous"], + NumberR=None, + NumberG=None, + NumberDot=["NumberDot is Ambiguous"], + ), + ) + + assert two_alts == hl.Struct( + locus=hl.Locus('1', 1), + alleles=['A', 'T', 'G'], + rsid=None, + qual=-10.0, + filters={'LowQual'}, + info=hl.Struct( + Number0=True, + Number1=None, + Number2=None, + NumberA=None, + NumberR=None, + NumberG=None, + NumberDot=["NumberDot is Ambiguous"], + ), + ) + + assert no_ambiguities == hl.Struct( + locus=hl.Locus('1', 2), + alleles=['A', 'T', 'G'], + rsid=None, + qual=-10.0, + filters={'LowQual'}, + info=hl.Struct( + Number0=True, + Number1='X', + Number2=['X', None], + NumberA=[None, 'X'], + NumberR=[None, 'X', None], + NumberG=[None, 'X', None, 'X', None, 'X'], + NumberDot=[None, 'X', 'X', 'X', None], + ), + ) diff --git a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala index cf5801a3bfb0..92111733b3d4 100644 --- a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala +++ b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala @@ -1809,8 +1809,12 @@ class MatrixVCFReader( IRParser.parseType(params.entryFloatTypeName), params.callFields) + val globalType = TStruct( + "vcf_header" -> VCFHeaderInfo.headerType.typeAfterSelectNames(Array("formatAttrs", "filterAttrs", "infoAttrs")) + ) + def fullMatrixTypeWithoutUIDs: MatrixType = MatrixType( - globalType = TStruct.empty, + globalType = globalType, colType = TStruct("s" -> TString), colKey = Array("s"), rowType = TStruct( @@ -1892,7 +1896,14 @@ class MatrixVCFReader( GenericLines.read(fs, fileStatuses, params.nPartitions, params.blockSizeInMB, params.minPartitions, params.gzAsBGZ, params.forceGZ) } - val globals = Row(sampleIDs.zipWithIndex.map { case (s, i) => Row(s, i.toLong) }.toFastIndexedSeq) + val globals = Row( + Row( + header.formatAttrs, + header.filtersAttrs, + header.infoAttrs + ), + sampleIDs.zipWithIndex.map { case (s, i) => Row(s, i.toLong) }.toFastIndexedSeq + ) val fullRowPType: PType = fullRVDType.rowType diff --git a/hail/src/test/resources/all_possible_ambiguous_info_array_fields.vcf b/hail/src/test/resources/all_possible_ambiguous_info_array_fields.vcf new file mode 100644 index 000000000000..195d14cd5c2a --- /dev/null +++ b/hail/src/test/resources/all_possible_ambiguous_info_array_fields.vcf @@ -0,0 +1,14 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +#Hail cannot parse VCFs with no alts, they are parsed as an empty string allele +#1 1 . A . LowQual Number0=.;Number1=.;Number2=.;NumberA=.;NumberR=.;NumberG=.;NumberDot=. +1 1 . A T . LowQual Number0=.;Number1=.;Number2=.;NumberA=.;NumberR=.;NumberG=.;NumberDot=. +1 1 . A T,G . LowQual Number0=.;Number1=.;Number2=.;NumberA=.;NumberR=.;NumberG=.;NumberDot=. +1 2 . A T,G . LowQual Number0;Number1=X;Number2=X,.;NumberA=.,X;NumberR=.,X,.;NumberG=.,X,.,X,.,X;NumberDot=.,X,X,X,. diff --git a/hail/src/test/resources/issue_13346.vcf b/hail/src/test/resources/issue_13346.vcf new file mode 100644 index 000000000000..3ed900045b19 --- /dev/null +++ b/hail/src/test/resources/issue_13346.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8 9 10 11 1 +chr1 10403 . ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC A,ACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC . LowQual AC=1,1;AF=0.250,0.250;AN=4;AS_QUALapprox=0|23|45;AS_VQSLOD=.,.;AS_YNG=.,.;QUALapprox=45 GT:AD:GQ:RGQ ./. 0/1:23,7,0:20:23 ./. ./. ./. 0/2:6,0,4:35:45 ./. ./. ./. ./. ./. ./. ./. ./. ./. ./. ./. ./.