diff --git a/hail/python/hail/docs/data/missing-values-in-array-fields.vcf b/hail/python/hail/docs/data/missing-values-in-array-fields.vcf new file mode 100644 index 00000000000..2a590a2b987 --- /dev/null +++ b/hail/python/hail/docs/data/missing-values-in-array-fields.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.1 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +1 123456 . A C . . A=1,.;B=.,2,.;C=. GT:X:Y:Z 0/0:1,.,1:. diff --git a/hail/python/hail/methods/impex.py b/hail/python/hail/methods/impex.py index 53ab8743362..7b8a66c52c6 100644 --- a/hail/python/hail/methods/impex.py +++ b/hail/python/hail/methods/impex.py @@ -2872,6 +2872,48 @@ def import_vcf( >>> ds = hl.import_vcf('data/sample.vcf.gz', force_bgz=True) + Import a VCF which has missing values (".") inside INFO or FORMAT array fields: + + >>> print(open('data/missing-values-in-array-fields.vcf').read()) + ##fileformat=VCFv4.1 + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##INFO= + ##INFO= + ##INFO= + ##INFO= + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 + 1 123456 . A C . . A=1,.;B=.,2,.;C=. GT:X:Y:Z 0/0:1,.,1:. 
+ + >>> ds = hl.import_vcf('data/missing-values-in-array-fields.vcf', array_elements_required=False) + >>> ds.show(n_rows=1, n_cols=1, include_row_fields=True) + +---------------+------------+------+-----------+----------+--------------+ + | locus | alleles | rsid | qual | filters | info.A | + +---------------+------------+------+-----------+----------+--------------+ + | locus<GRCh37> | array<str> | str | float64 | set<str> | array<int32> | + +---------------+------------+------+-----------+----------+--------------+ + | 1:123456 | ["A","C"] | NA | -1.00e+01 | NA | [1,NA] | + +---------------+------------+------+-----------+----------+--------------+ + + +------------------+----------------+----------------+--------------+ + | info.B | info.C | info.D | 'SAMPLE1'.GT | + +------------------+----------------+----------------+--------------+ + | array<float64> | array<float64> | array<float64> | call | + +------------------+----------------+----------------+--------------+ + | [NA,2.00e+00,NA] | NA | NA | 0/0 | + +------------------+----------------+----------------+--------------+ + + +--------------+--------------+--------------+ + | 'SAMPLE1'.X | 'SAMPLE1'.Y | 'SAMPLE1'.Z | + +--------------+--------------+--------------+ + | array<int32> | array<int32> | array<int32> | + +--------------+--------------+--------------+ + | [1,NA,1] | NA | NA | + +--------------+--------------+--------------+ + + Notes ----- diff --git a/hail/python/test/hail/methods/test_impex.py b/hail/python/test/hail/methods/test_impex.py index 4fb62dcb6f4..372ae2a86a2 100644 --- a/hail/python/test/hail/methods/test_impex.py +++ b/hail/python/test/hail/methods/test_impex.py @@ -212,17 +212,34 @@ def test_import_vcf_can_import_negative_numbers(self): ) ) - def test_import_vcf_missing_info_field_elements(self): + def test_import_vcf_has_good_error_message_when_info_fields_have_missing_elements(self): + mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37') + with pytest.raises( + FatalError, + match=".*Missing value in INFO array. 
Use 'hl.import_vcf[(][.][.][.], array_elements_required=False[)]'[.].*", + ): + mt._force_count_rows() + + def test_import_vcf_array_elements_required_is_false_parses_info_fields_with_missing_elements(self): mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37', array_elements_required=False) - mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR) + mt = mt.select_rows(**mt.info) expected = hl.Table.parallelize( [ - {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 'FOO': [1, None], 'BAR': [2, None, None]}, + { + 'locus': hl.Locus('X', 16050036), + 'alleles': ['A', 'C'], + 'FOO': [1, None], + 'BAR': [2, None, None], + 'JUST_A_DOT': None, + 'NOT_EVEN_PRESENT': None, + }, { 'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'], 'FOO': [None, 2, None], 'BAR': [None, 1.0, None], + 'JUST_A_DOT': None, + 'NOT_EVEN_PRESENT': None, }, ], hl.tstruct( @@ -230,10 +247,12 @@ def test_import_vcf_missing_info_field_elements(self): alleles=hl.tarray(hl.tstr), FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64), + JUST_A_DOT=hl.tarray(hl.tfloat64), + NOT_EVEN_PRESENT=hl.tarray(hl.tfloat64), ), key=['locus', 'alleles'], ) - self.assertTrue(mt.rows()._same(expected)) + assert mt.rows()._same(expected) def test_import_vcf_missing_format_field_elements(self): mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37', array_elements_required=False) diff --git a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala index 82dbd89d9c5..a606dd0a817 100644 --- a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala +++ b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala @@ -740,7 +740,7 @@ final class VCFLine( if (formatArrayElementMissing()) { if (arrayElementsRequired) parseError( - s"missing value in FORMAT array. Import with argument 'array_elements_required=False'" + "Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'." 
) ab.addMissing() pos += 1 @@ -749,11 +749,11 @@ final class VCFLine( } } - def parseIntArrayElement() { + def parseArrayIntElement() { if (formatArrayElementMissing()) { if (arrayElementsRequired) parseError( - s"missing value in FORMAT array. Import with argument 'array_elements_required=False'" + "Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'." ) abi.addMissing() pos += 1 @@ -766,7 +766,7 @@ final class VCFLine( if (formatArrayElementMissing()) { if (arrayElementsRequired) parseError( - s"missing value in FORMAT array. Import with argument 'array_elements_required=False'" + "Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'." ) abf.addMissing() pos += 1 @@ -775,11 +775,11 @@ final class VCFLine( } } - def parseDoubleArrayElement() { + def parseArrayDoubleElement() { if (formatArrayElementMissing()) { if (arrayElementsRequired) parseError( - s"missing value in FORMAT array. Import with argument 'array_elements_required=False'" + "Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'." ) abd.addMissing() pos += 1 @@ -788,11 +788,11 @@ final class VCFLine( } } - def parseStringArrayElement() { + def parseArrayStringElement() { if (formatArrayElementMissing()) { if (arrayElementsRequired) parseError( - s"missing value in FORMAT array. Import with argument 'array_elements_required=False'" + "Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'." 
) abs.addMissing() pos += 1 @@ -808,11 +808,11 @@ final class VCFLine( } else { assert(abi.length == 0) - parseIntArrayElement() + parseArrayIntElement() while (!endFormatField()) { pos += 1 // comma - parseIntArrayElement() + parseArrayIntElement() } rvb.startArray(abi.length) @@ -837,10 +837,10 @@ final class VCFLine( } else { assert(abs.length == 0) - parseStringArrayElement() + parseArrayStringElement() while (!endFormatField()) { pos += 1 // comma - parseStringArrayElement() + parseArrayStringElement() } rvb.startArray(abs.length) @@ -890,10 +890,10 @@ final class VCFLine( } else { assert(abd.length == 0) - parseDoubleArrayElement() + parseArrayDoubleElement() while (!endFormatField()) { pos += 1 // comma - parseDoubleArrayElement() + parseArrayDoubleElement() } rvb.startArray(abd.length) @@ -993,24 +993,36 @@ final class VCFLine( def parseDoubleInInfoArray(): Double = VCFUtils.parseVcfDouble(parseStringInInfoArray()) - def parseIntInfoArrayElement() { + def parseInfoArrayIntElement() { if (infoArrayElementMissing()) { + if (arrayElementsRequired) + parseError( + "Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'." + ) abi.addMissing() pos += 1 // dot } else abi += parseIntInInfoArray() } - def parseStringInfoArrayElement() { + def parseInfoArrayStringElement() { if (infoArrayElementMissing()) { + if (arrayElementsRequired) + parseError( + "Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'." + ) abs.addMissing() pos += 1 // dot } else abs += parseStringInInfoArray() } - def parseDoubleInfoArrayElement() { + def parseInfoArrayDoubleElement() { if (infoArrayElementMissing()) { + if (arrayElementsRequired) + parseError( + "Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'." 
+ ) abd.addMissing() pos += 1 } else { @@ -1022,10 +1034,10 @@ final class VCFLine( if (!infoFieldMissing()) { rvb.setPresent() assert(abi.length == 0) - parseIntInfoArrayElement() + parseInfoArrayIntElement() while (!endInfoField()) { pos += 1 // comma - parseIntInfoArrayElement() + parseInfoArrayIntElement() } rvb.startArray(abi.length) @@ -1046,10 +1058,10 @@ final class VCFLine( if (!infoFieldMissing()) { rvb.setPresent() assert(abs.length == 0) - parseStringInfoArrayElement() + parseInfoArrayStringElement() while (!endInfoField()) { pos += 1 // comma - parseStringInfoArrayElement() + parseInfoArrayStringElement() } rvb.startArray(abs.length) @@ -1070,10 +1082,10 @@ final class VCFLine( if (!infoFieldMissing()) { rvb.setPresent() assert(abd.length == 0) - parseDoubleInfoArrayElement() + parseInfoArrayDoubleElement() while (!endInfoField()) { pos += 1 // comma - parseDoubleInfoArrayElement() + parseInfoArrayDoubleElement() } rvb.startArray(abd.length) diff --git a/hail/src/test/resources/missingInfoArray.vcf b/hail/src/test/resources/missingInfoArray.vcf index 913ffb8d1a5..b65ac6fe572 100644 --- a/hail/src/test/resources/missingInfoArray.vcf +++ b/hail/src/test/resources/missingInfoArray.vcf @@ -9,6 +9,8 @@ ##FORMAT= ##INFO= ##INFO= +##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C1046::HG02024 C1046::HG02025 -X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0 -X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0 +X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0 +X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0