Skip to content

Commit

Permalink
Issue #334: Normalization (#337)
Browse files Browse the repository at this point in the history
* add normalization example to test notebook

* update notebook metadata

* a few simple behavior tests

* Add keys to ReferenceLengthExpression and LiteralSequenceExpression

* Make LiteralSequenceExpression not-identifiable

* remove unnecessary / unused code

* addresses #338 (comment)

* Fix ReferenceLengthExpression tests in test_allele_translator

* linewise diff for test_annotate_vcf_grch38_noattrs

* Update test_vcf_expected_output_no_vrs_attrs.vcf.gz ReferenceLengthExpression

* Fix test_annotate_vcf_grch38_attrs

* Fix test_annotate_vcf_grch38_attrs_altsonly

---------

Co-authored-by: Kyle Ferriter <kferrite@broadinstitute.org>
  • Loading branch information
ahwagner and theferrit32 authored Feb 6, 2024
1 parent ca3c638 commit 253b10c
Show file tree
Hide file tree
Showing 11 changed files with 359 additions and 83 deletions.
322 changes: 286 additions & 36 deletions notebooks/testingstuff.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/ga4gh/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
parse_ga4gh_identifier, GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when
)
from ._internal.pydantic import (
is_pydantic_instance, is_curie_type, is_identifiable, is_literal, pydantic_copy
is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy
)
from ._internal import models as core_models

Expand Down
6 changes: 3 additions & 3 deletions src/ga4gh/core/_internal/enderef.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
is_pydantic_instance,
is_list,
is_curie_type,
is_identifiable,
is_ga4gh_identifiable,
get_pydantic_root,
pydantic_copy)

Expand Down Expand Up @@ -60,7 +60,7 @@ def _enref(o):

if not is_pydantic_instance(o):
raise ValueError("Called ga4gh_enref() with non-pydantic instance")
if not is_identifiable(o):
if not is_ga4gh_identifiable(o):
raise ValueError("Called ga4gh_enref() with non-identifiable object")

# in-place replacement on object copy
Expand Down Expand Up @@ -101,7 +101,7 @@ def _deref(o):

if not is_pydantic_instance(o):
raise ValueError("Called ga4gh_deref() with non-non-pydantic instance")
if not is_identifiable(o):
if not is_ga4gh_identifiable(o):
raise ValueError("Called ga4gh_deref() with non-identifiable object")

# in-place replacement on object copy
Expand Down
46 changes: 23 additions & 23 deletions src/ga4gh/core/_internal/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .pydantic import (
is_pydantic_instance,
is_curie_type,
is_identifiable,
is_ga4gh_identifiable,
getattr_in,
get_pydantic_root,
is_pydantic_custom_type
Expand Down Expand Up @@ -143,7 +143,7 @@ def ga4gh_identify(vro):
'ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx'
"""
if is_identifiable(vro):
if is_ga4gh_identifiable(vro):
when_rule = ga4gh_compute_identifier_when.get(GA4GHComputeIdentifierWhen.ALWAYS)
do_compute = False
ir = None
Expand Down Expand Up @@ -281,7 +281,7 @@ def identify_all(

if is_pydantic_custom_type(input_obj):
val = export_pydantic_model(input_obj)
if isinstance(val, str) and is_curie_type(val) and is_ga4gh_identifier(val):
if isinstance(val, str) and is_ga4gh_identifier(val):
val = parse_ga4gh_identifier(val)["digest"]
output_obj = val
elif is_pydantic_instance(input_obj):
Expand All @@ -307,7 +307,7 @@ def identify_all(
# Assumes any obj with 'digest' should be collapsed.
collapsed_output_obj = collapse_identifiable_values(output_obj)
# Add a digest to the output if it is identifiable
if is_identifiable(input_obj):
if is_ga4gh_identifiable(input_obj):
# Compute digest for updated object, not re-running compaction
output_obj["digest"] = ga4gh_digest(collapsed_output_obj, do_compact=False)
else:
Expand All @@ -317,22 +317,22 @@ def identify_all(
return output_obj


def scrape_model_metadata(obj, meta={}) -> dict:
"""
For a Pydantic object obj, pull out .ga4gh.identifiable
and .ga4gh.keys and put them in meta keyed by the class name of obj
"""
assert isinstance(obj, BaseModel)
name = type(obj).__name__
if is_pydantic_custom_str_type(obj):
meta[name] = {"identifiable": False, "keys": None}
else:
meta[name] = {}
identifiable = getattr_in(obj, ["ga4gh", "identifiable"])
if identifiable:
meta[name]["identifiable"] = identifiable
keys = getattr_in(obj, ["ga4gh", "keys"])
if keys and len(keys) > 0:
meta[name]["keys"] = keys
# TODO recurse into fields
return meta
# def scrape_model_metadata(obj, meta={}) -> dict:
# """
# For a Pydantic object obj, pull out .ga4gh.identifiable
# and .ga4gh.keys and put them in meta keyed by the class name of obj
# """
# assert isinstance(obj, BaseModel)
# name = type(obj).__name__
# if is_pydantic_custom_str_type(obj):
# meta[name] = {"identifiable": False, "keys": None}
# else:
# meta[name] = {}
# identifiable = getattr_in(obj, ["ga4gh", "identifiable"])
# if identifiable:
# meta[name]["identifiable"] = identifiable
# keys = getattr_in(obj, ["ga4gh", "keys"])
# if keys and len(keys) > 0:
# meta[name]["keys"] = keys
# # TODO recurse into fields
# return meta
10 changes: 5 additions & 5 deletions src/ga4gh/core/_internal/pydantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ def getattr_in(obj, names) -> Any:
return v


def is_identifiable(o: Any) -> bool:
def is_ga4gh_identifiable(o: Any) -> bool:
"""
Determine if object is identifiable. An object is considered identifiable if
contains a `ga4gh_digest` attribute
Determine if object is GA4GH identifiable. An object is considered
GA4GH identifiable if it contains a `ga4gh_prefix` attribute
:param o: Object
:return: `True` if `o` has `ga4gh_digest` attribute. `False` otherwise.
:return: `True` if `o` has `ga4gh_prefix` attribute. `False` otherwise.
"""
return getattr_in(o, ['ga4gh', 'identifiable'])
return bool(getattr_in(o, ['ga4gh', 'prefix']))


def is_literal(o: Any) -> bool:
Expand Down
27 changes: 20 additions & 7 deletions src/ga4gh/vrs/_internal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from pydantic import BaseModel, ConfigDict, Field, RootModel, constr

from ga4gh.core._internal.pydantic import (
is_identifiable,
is_ga4gh_identifiable,
getattr_in
)
from ga4gh.core._internal.models import IRI, _Entity
Expand Down Expand Up @@ -91,7 +91,7 @@ def pydantic_class_refatt_map():
# Types directly reffable
reffable_classes = list(filter(
lambda c: ('id' in c.model_fields
and is_identifiable(c)),
and is_ga4gh_identifiable(c)),
model_classes
))
# Types reffable because they are a union of reffable types
Expand Down Expand Up @@ -172,16 +172,17 @@ class _ValueObject(_Entity):
description='A sha512t24u digest created using the VRS Computed Identifier algorithm.',
)

class ga4gh:
keys: List[str]


class _Ga4ghIdentifiableObject(_ValueObject):
"""A contextual value object for which a GA4GH computed identifier can be created."""

type: str

class ga4gh:
identifiable = True
class ga4gh(_ValueObject.ga4gh):
prefix: str
keys: List[str]


class Expression(BaseModel):
Expand Down Expand Up @@ -239,7 +240,7 @@ class SequenceReference(_ValueObject):
)
residueAlphabet: Optional[ResidueAlphabet] = None

class ga4gh:
class ga4gh(_ValueObject.ga4gh):
assigned: bool = Field(
True,
description='This special property indicates that the `digest` field follows an alternate convention and is expected to have the value assigned following that convention. For SequenceReference, it is expected the digest will be the refget accession value without the `SQ.` prefix.'
Expand All @@ -262,6 +263,13 @@ class ReferenceLengthExpression(_ValueObject):
None, description='The number of residues in the repeat subunit.'
)

class ga4gh(_ValueObject.ga4gh):
keys = [
'length',
'repeatSubunitLength',
'type'
]


class LiteralSequenceExpression(_ValueObject):
"""An explicit expression of a Sequence."""
Expand All @@ -271,6 +279,12 @@ class LiteralSequenceExpression(_ValueObject):
)
sequence: SequenceString = Field(..., description='the literal sequence')

class ga4gh(_ValueObject.ga4gh):
keys = [
'sequence',
'type'
]


class SequenceLocation(_Ga4ghIdentifiableObject):
"""A `Location` defined by an interval on a referenced `Sequence`."""
Expand Down Expand Up @@ -407,7 +421,6 @@ class GenotypeMember(_ValueObject):
)

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
identifiable = False
keys = [
'type',
'count',
Expand Down
Binary file modified tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz
Binary file not shown.
Binary file modified tests/extras/data/test_vcf_expected_output.vcf.gz
Binary file not shown.
Binary file modified tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz
Binary file not shown.
11 changes: 6 additions & 5 deletions tests/extras/test_allele_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def tlr(rest_dataproxy):
insertion_output = {
"location": {
"end": 20003010,
"start":20003010,
"start": 20003010,
"sequenceReference": {
"refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"type": "SequenceReference"
Expand Down Expand Up @@ -265,6 +265,7 @@ def test_to_spdi(tlr):
assert 1 == len(to_spdi)
assert spdiexpr == to_spdi[0]


hgvs_tests = (
("NC_000013.11:g.32936732=", {
"id": "ga4gh:VA.GuDPEe-WojSx4b4DxupN3si1poaR61qL",
Expand Down Expand Up @@ -303,7 +304,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000007.14:g.55181220del", {
"id": "ga4gh:VA.BNV6SfAuqDYKTTRknLcS-QuTryF5rSBi",
"id": "ga4gh:VA.wlYnlMsWc0ZTPZb-nQv2dXHbFcXa6J9u",
"location": {
"id": "ga4gh:SL.hnIOG_kul0Lf3mO1ddTRFb0GbQhtQ19t",
"end": 55181220,
Expand Down Expand Up @@ -341,7 +342,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000013.11:g.32331093_32331094dup", {
"id": "ga4gh:VA.g-q4OzcyYFC5eVQFSrbXwgJScSREvrY-",
"id": "ga4gh:VA.x5iNzjjXbb1-wWTBLMBcicYlCMwYoedq",
"location": {
"id": "ga4gh:SL.PJ8lHWhAMNRSrxHvkarfDjRWxF-GwaJ_",
"end": 32331094,
Expand All @@ -361,7 +362,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000013.11:g.32316467dup", {
"id": "ga4gh:VA._KlbF6GZCbuLxbL9z4hZE3oZSLzBHstS",
"id": "ga4gh:VA.ZAyA7Mmd7ERWN6CEd6muxn2mk_gTvEvF",
"location": {
"id": "ga4gh:SL.LURTeRdwh5bQf_QqPBoaA--MECYmrY5U",
"end": 32316467,
Expand Down Expand Up @@ -434,7 +435,7 @@ def test_hgvs(tlr, hgvsexpr, expected):

def test_to_hgvs_invalid(tlr):
# IRI is passed
iri_vo = models.Allele(
iri_vo = models.Allele(
**{
"location": {
"end": 1263,
Expand Down
18 changes: 15 additions & 3 deletions tests/extras/test_vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@

TEST_DATA_DIR = "tests/extras/data"


@pytest.fixture
def vcf_annotator():
return VCFAnnotator("rest")


@pytest.mark.vcr
def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -26,12 +28,14 @@ def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_vcf_no_vrs_attrs, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -46,12 +50,14 @@ def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_vcf, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -66,12 +72,14 @@ def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_altsonly_vcf, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -92,6 +100,7 @@ def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette):
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -106,6 +115,7 @@ def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette):
assert vcr_cassette.all_played
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -125,13 +135,15 @@ def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette):
assert not os.path.exists(output_vrs_pkl)
os.remove(output_vcf)


def test_annotate_vcf_input_validation(vcf_annotator):
input_vcf = f"{TEST_DATA_DIR}/test_vcf_input.vcf"

with pytest.raises(VCFAnnotatorException) as e:
vcf_annotator.annotate(input_vcf)
assert str(e.value) == "Must provide one of: `vcf_out` or `vrs_pickle_out`"


@pytest.mark.vcr
def test_get_vrs_object_invalid_input(vcf_annotator, caplog):
"""Test that _get_vrs_object method works as expected with invalid input"""
Expand Down

0 comments on commit 253b10c

Please sign in to comment.