Skip to content

Commit

Permalink
[query] better error message when info array field has missing elements
Browse files Browse the repository at this point in the history
Fixes hail-is#13346.

Another user was confused by this: hail-is#14102. Unfortunately, the world
appears to have embraced missing values in VCF array fields even though the single element case is
ambiguous. In hail-is#13346, I proposed a scheme by which we can disambiguate many of the cases, but implementing
it ran into challenges because LoadVCF.scala does not expose whether or not an INFO field was a literal
"." or elided entirely from that line.

Anyway, this error message actually points users to the fix. I also changed some method names such that
every method is ArrayType and never TypeArray.
  • Loading branch information
Dan King committed Dec 18, 2023
1 parent f355886 commit 39f72d7
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 35 deletions.
34 changes: 34 additions & 0 deletions hail/python/hail/methods/impex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2694,6 +2694,40 @@ def import_vcf(path,
>>> ds = hl.import_vcf('data/samplepart*.vcf.gz', force_bgz=True)
Import a VCF which has missing values (".") inside INFO or FORMAT array fields:
>>> print(open('data/missing-values-in-array-fields.vcf').read())
##fileformat=VCFv4.1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=X,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Y,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Z,Number=.,Type=Integer,Description="">
##INFO=<ID=A,Number=A,Type=Integer,Description="">
##INFO=<ID=B,Number=R,Type=Float,Description="">
##INFO=<ID=C,Number=3,Type=Float,Description="">
##INFO=<ID=D,Number=.,Type=Float,Description="">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1
1 123456 . A C . . A=1,.;B=.,2,.;C=. GT:X:Y:Z 0/0:1,.,1:.:
>>> ds = hl.import_vcf('data/missing-values-in-array-fields.vcf', array_elements_required=False)
>>> ds.show(n_rows=1, n_cols=1, include_row_fields=True)
+---------------+------------+------+-----------+----------+--------------+------------------+----------------+
| locus | alleles | rsid | qual | filters | info.A | info.B | info.C |
+---------------+------------+------+-----------+----------+--------------+------------------+----------------+
| locus<GRCh37> | array<str> | str | float64 | set<str> | array<int32> | array<float64> | array<float64> |
+---------------+------------+------+-----------+----------+--------------+------------------+----------------+
| 1:123456 | ["A","C"] | NA | -1.00e+01 | NA | [1,NA] | [2.00e+00,NA,NA] | NA |
+---------------+------------+------+-----------+----------+--------------+------------------+----------------+
+----------------+--------------+--------------+--------------+--------------+
| info.D | 'SAMPLE1'.GT | 'SAMPLE1'.X | 'SAMPLE1'.Y | 'SAMPLE1'.Z |
+----------------+--------------+--------------+--------------+--------------+
| array<float64> | call | array<int32> | array<int32> | array<int32> |
+----------------+--------------+--------------+--------------+--------------+
| NA | 0/0 | [1,NA,1] | NA | NA |
+----------------+--------------+--------------+--------------+--------------+
Notes
-----
Expand Down
47 changes: 37 additions & 10 deletions hail/python/test/hail/methods/test_impex.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,17 +183,44 @@ def test_import_vcf_can_import_negative_numbers(self):
hl.agg.all(mt.negative_int_array == [-1, -2]) &
hl.agg.all(mt.negative_float_array == [-0.5, -1.5])))

def test_import_vcf_missing_info_field_elements(self):
def test_import_vcf_has_good_error_message_when_info_fields_have_missing_elements(self):
mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37')
with pytest.raises(FatalError, match=".*Missing value in INFO array. Use 'hl.import_vcf[(][.][.][.], array_elements_required=False[)]'[.].*"):
mt._force_count_rows()

def test_import_vcf_array_elements_required_is_false_parses_info_fields_with_missing_elements(self):
mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
'FOO': [1, None], 'BAR': [2, None, None]},
{'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
'FOO': [None, 2, None], 'BAR': [None, 1.0, None]}],
hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64)),
key=['locus', 'alleles'])
self.assertTrue(mt.rows()._same(expected))
mt = mt.select_rows(**mt.info)
expected = hl.Table.parallelize(
[
{
'locus': hl.Locus('X', 16050036),
'alleles': ['A', 'C'],
'FOO': [1, None],
'BAR': [2, None, None],
'JUST_A_DOT': None,
'NOT_EVEN_PRESENT': None,
},
{
'locus': hl.Locus('X', 16061250),
'alleles': ['T', 'A', 'C'],
'FOO': [None, 2, None],
'BAR': [None, 1.0, None],
'JUST_A_DOT': None,
'NOT_EVEN_PRESENT': None,
}
],
hl.tstruct(
locus=hl.tlocus('GRCh37'),
alleles=hl.tarray(hl.tstr),
FOO=hl.tarray(hl.tint),
BAR=hl.tarray(hl.tfloat64),
JUST_A_DOT=hl.tarray(hl.tfloat64),
NOT_EVEN_PRESENT=hl.tarray(hl.tfloat64),
),
key=['locus', 'alleles']
)
assert mt.rows()._same(expected)

def test_import_vcf_missing_format_field_elements(self):
mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
Expand Down
52 changes: 29 additions & 23 deletions hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala
Original file line number Diff line number Diff line change
Expand Up @@ -700,18 +700,18 @@ final class VCFLine(
def parseArrayElement[T](ab: MissingArrayBuilder[T], eltParser: () => T) {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(s"missing value in FORMAT array. Import with argument 'array_elements_required=False'")
parseError("Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
ab.addMissing()
pos += 1
} else {
ab += eltParser()
}
}

def parseIntArrayElement() {
def parseArrayIntElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(s"missing value in FORMAT array. Import with argument 'array_elements_required=False'")
parseError("Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abi.addMissing()
pos += 1
} else {
Expand All @@ -722,29 +722,29 @@ final class VCFLine(
def parseFloatArrayElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(s"missing value in FORMAT array. Import with argument 'array_elements_required=False'")
parseError("Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abf.addMissing()
pos += 1
} else {
abf += parseFloatInFormatArray()
}
}

def parseDoubleArrayElement() {
def parseArrayDoubleElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(s"missing value in FORMAT array. Import with argument 'array_elements_required=False'")
parseError("Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abd.addMissing()
pos += 1
} else {
abd += parseDoubleInFormatArray()
}
}

def parseStringArrayElement() {
def parseArrayStringElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(s"missing value in FORMAT array. Import with argument 'array_elements_required=False'")
parseError("Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abs.addMissing()
pos += 1
} else {
Expand All @@ -759,11 +759,11 @@ final class VCFLine(
} else {
assert(abi.length == 0)

parseIntArrayElement()
parseArrayIntElement()

while (!endFormatField()) {
pos += 1 // comma
parseIntArrayElement()
parseArrayIntElement()
}

rvb.startArray(abi.length)
Expand All @@ -788,10 +788,10 @@ final class VCFLine(
} else {
assert(abs.length == 0)

parseStringArrayElement()
parseArrayStringElement()
while (!endFormatField()) {
pos += 1 // comma
parseStringArrayElement()
parseArrayStringElement()
}

rvb.startArray(abs.length)
Expand Down Expand Up @@ -841,10 +841,10 @@ final class VCFLine(
} else {
assert(abd.length == 0)

parseDoubleArrayElement()
parseArrayDoubleElement()
while (!endFormatField()) {
pos += 1 // comma
parseDoubleArrayElement()
parseArrayDoubleElement()
}

rvb.startArray(abd.length)
Expand Down Expand Up @@ -944,24 +944,30 @@ final class VCFLine(

def parseDoubleInInfoArray(): Double = VCFUtils.parseVcfDouble(parseStringInInfoArray())

def parseIntInfoArrayElement() {
def parseInfoArrayIntElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError("Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abi.addMissing()
pos += 1 // dot
} else
abi += parseIntInInfoArray()
}

def parseStringInfoArrayElement() {
def parseInfoArrayStringElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError("Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abs.addMissing()
pos += 1 // dot
} else
abs += parseStringInInfoArray()
}

def parseDoubleInfoArrayElement() {
def parseInfoArrayDoubleElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError("Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'.")
abd.addMissing()
pos += 1
} else {
Expand All @@ -973,10 +979,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abi.length == 0)
parseIntInfoArrayElement()
parseInfoArrayIntElement()
while (!endInfoField()) {
pos += 1 // comma
parseIntInfoArrayElement()
parseInfoArrayIntElement()
}

rvb.startArray(abi.length)
Expand All @@ -997,10 +1003,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abs.length == 0)
parseStringInfoArrayElement()
parseInfoArrayStringElement()
while (!endInfoField()) {
pos += 1 // comma
parseStringInfoArrayElement()
parseInfoArrayStringElement()
}

rvb.startArray(abs.length)
Expand All @@ -1021,10 +1027,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abd.length == 0)
parseDoubleInfoArrayElement()
parseInfoArrayDoubleElement()
while (!endInfoField()) {
pos += 1 // comma
parseDoubleInfoArrayElement()
parseInfoArrayDoubleElement()
}

rvb.startArray(abd.length)
Expand Down
6 changes: 4 additions & 2 deletions hail/src/test/resources/missingInfoArray.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=FOO,Number=R,Type=Integer,Description="">
##INFO=<ID=BAR,Number=3,Type=Float,Description="">
##INFO=<ID=JUST_A_DOT,Number=3,Type=Float,Description="">
##INFO=<ID=NOT_EVEN_PRESENT,Number=3,Type=Float,Description="">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C1046::HG02024 C1046::HG02025
X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0
X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0
X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0
X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0

0 comments on commit 39f72d7

Please sign in to comment.