Skip to content

Commit

Permalink
bug-1898345: fix metrics key prefix
Browse files Browse the repository at this point in the history
In bec86e3, I removed setting the statsd_prefix because I thought we
were adding "socorro.collector" in telegraf--not in DatadogMetrics
markus backend. That was wrong and I broke stats in Socorro stage AWS.

This fixes that by setting "socorro" in the MetricsInterface per our
convention and then fixing the keys to include "collector".

Then I discovered one of the tests was busted, so I fixed that. I also
added metrics assertions to test_submit_crash_report.

The host value can change, so I wrote an AnyTagValue class that'll let
us assert a tag was emitted with whatever value it has.
  • Loading branch information
willkg committed May 24, 2024
1 parent 3a8595a commit 739b4cc
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 36 deletions.
2 changes: 1 addition & 1 deletion antenna/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@


def count_sentry_scrub_error(msg):
METRICS.incr("app.sentry_scrub_error", value=1)
METRICS.incr("collector.sentry_scrub_error", value=1)


def configure_sentry(app_config):
Expand Down
26 changes: 15 additions & 11 deletions antenna/breakpad_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def extract_payload(self, req):

# Decompress payload if it's compressed
if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
METRICS.incr("breakpad_resource.gzipped_crash")
METRICS.incr("collector.breakpad_resource.gzipped_crash")
crash_report.payload_compressed = "1"

# If the content is gzipped, we pull it out and decompress it. We
Expand All @@ -154,13 +154,13 @@ def extract_payload(self, req):
try:
data = zlib.decompress(req.stream.read(content_length), gzip_header)
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
"collector.breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:success"],
)
except zlib.error as exc:
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
"collector.breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:fail"],
)
Expand All @@ -178,15 +178,15 @@ def extract_payload(self, req):

data = io.BytesIO(data)
METRICS.histogram(
"breakpad_resource.crash_size",
"collector.breakpad_resource.crash_size",
value=content_length,
tags=["payload:compressed"],
)

else:
data = req.bounded_stream
METRICS.histogram(
"breakpad_resource.crash_size",
"collector.breakpad_resource.crash_size",
value=content_length,
tags=["payload:uncompressed"],
)
Expand Down Expand Up @@ -261,7 +261,7 @@ def extract_payload(self, req):

if has_json and has_kvpairs:
# If the crash payload has both kvpairs and a JSON blob, then it's malformed
# so we add a note and log it.
# so we add a note and log it, but we don't reject it
msg = "includes annotations in both json-encoded extra and formdata parts"
LOGGER.info(msg)
crash_report.notes.append(msg)
Expand Down Expand Up @@ -302,7 +302,7 @@ def cleanup_crash_report(self, raw_crash):
del raw_crash[bad_field]
notes.append("Removed %s from raw crash." % bad_field)

@METRICS.timer_decorator("breakpad_resource.on_post.time")
@METRICS.timer_decorator("collector.breakpad_resource.on_post.time")
def on_post(self, req, resp):
"""Handle incoming HTTP POSTs.
Expand All @@ -324,12 +324,14 @@ def on_post(self, req, resp):
except MalformedCrashReport as exc:
# If this is malformed, then reject it with malformed error code.
msg = str(exc)
METRICS.incr("breakpad_resource.malformed", tags=["reason:%s" % msg])
METRICS.incr(
"collector.breakpad_resource.malformed", tags=["reason:%s" % msg]
)
resp.status = falcon.HTTP_400
resp.text = "Discarded=malformed_%s" % msg
return

METRICS.incr("breakpad_resource.incoming_crash")
METRICS.incr("collector.breakpad_resource.incoming_crash")

raw_crash = crash_report.annotations

Expand Down Expand Up @@ -380,9 +382,11 @@ def on_post(self, req, resp):
rule_name,
RESULT_TO_TEXT[throttle_result],
)
METRICS.incr("breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name])
METRICS.incr(
"breakpad_resource.throttle",
"collector.breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name]
)
METRICS.incr(
"collector.breakpad_resource.throttle",
tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()],
)
raw_crash["metadata"]["throttle_rule"] = rule_name
Expand Down
14 changes: 7 additions & 7 deletions antenna/crashmover.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def _incr_wait_generator(counter, attempts, sleep_seconds):
def _generator_generator():
for _ in range(attempts - 1):
METRICS.incr(f"crashmover.{counter}")
METRICS.incr(f"collector.crashmover.{counter}")
yield sleep_seconds

return _generator_generator
Expand Down Expand Up @@ -124,7 +124,7 @@ def check_health(self, state):
if hasattr(self.crashpublish, "check_health"):
self.crashpublish.check_health(state)

@METRICS.timer("crashmover.crash_handling.time")
@METRICS.timer("collector.crashmover.crash_handling.time")
def handle_crashreport(self, raw_crash, dumps, crash_id):
"""Handle a new crash report synchronously and return whether that succeeded.
Expand All @@ -143,27 +143,27 @@ def handle_crashreport(self, raw_crash, dumps, crash_id):
except MaxAttemptsError:
# After max attempts, we give up on this crash
LOGGER.error("%s: too many errors trying to save; dropped", crash_id)
METRICS.incr("crashmover.save_crash_dropped.count")
METRICS.incr("collector.crashmover.save_crash_dropped.count")
return False

try:
self.crashmover_publish_with_retry(crash_report)
METRICS.incr("crashmover.save_crash.count")
METRICS.incr("collector.crashmover.save_crash.count")
except MaxAttemptsError:
LOGGER.error("%s: too many errors trying to publish; dropped", crash_id)
METRICS.incr("crashmover.publish_crash_dropped.count")
METRICS.incr("collector.crashmover.publish_crash_dropped.count")
# return True even when publish fails because it will be automatically
# published later via self-healing mechanisms

return True

@METRICS.timer("crashmover.crash_save.time")
@METRICS.timer("collector.crashmover.crash_save.time")
def crashmover_save(self, crash_report):
"""Save crash report to storage."""
self.crashstorage.save_crash(crash_report)
LOGGER.info("%s saved", crash_report.crash_id)

@METRICS.timer("crashmover.crash_publish.time")
@METRICS.timer("collector.crashmover.crash_publish.time")
def crashmover_publish(self, crash_report):
"""Publish crash_id in publish queue."""
self.crashpublish.publish_crash(crash_report)
Expand Down
10 changes: 5 additions & 5 deletions antenna/health_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BrokenResource:

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.broken.count")
METRICS.incr("collector.health.broken.count")
# This is intentional breakage
raise Exception("intentional exception")

Expand All @@ -30,7 +30,7 @@ def __init__(self, basedir):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.version.count")
METRICS.incr("collector.health.version.count")
version_info = get_version_info(self.basedir)
# FIXME(willkg): there's no cloud provider environment variable to use, so
# we'll cheat and look at whether there's a "gcs" in
Expand All @@ -52,7 +52,7 @@ class LBHeartbeatResource:

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.lbheartbeat.count")
METRICS.incr("collector.health.lbheartbeat.count")
resp.content_type = "application/json; charset=utf-8"
resp.status = falcon.HTTP_200

Expand Down Expand Up @@ -94,7 +94,7 @@ def __init__(self, app):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.heartbeat.count")
METRICS.incr("collector.health.heartbeat.count")
state = HealthState()

# So we're going to think of Antenna like a big object graph and
Expand All @@ -107,7 +107,7 @@ def on_get(self, req, resp):

# Go through and call gauge for each statsd item.
for key, value in state.statsd.items():
METRICS.gauge(f"health.{key}", value=value)
METRICS.gauge(f"collector.health.{key}", value=value)

if state.is_healthy():
resp.status = falcon.HTTP_200
Expand Down
2 changes: 1 addition & 1 deletion antenna/libmarkus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
_IS_MARKUS_SETUP = False

LOGGER = logging.getLogger(__name__)
METRICS = markus.get_metrics()
METRICS = markus.get_metrics("socorro")


def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
Expand Down
77 changes: 67 additions & 10 deletions tests/unittest/test_breakpad_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,34 @@
from testlib.mini_poster import compress, multipart_encode


class AnyTagValue:
"""Matches a markus metrics tag with any value"""

def __init__(self, key):
self.key = key

def __repr__(self):
return f"<AnyTagValue {self.key}>"

def get_other_key(self, other):
# This is comparing against a tag string
if ":" in other:
other_key, _ = other.split(":")
else:
other_key = other
return other_key

def __eq__(self, other):
if isinstance(other, AnyTagValue):
return self.key == other.key
return self.key == self.get_other_key(other)

def __lt__(self, other):
if isinstance(other, AnyTagValue):
return self.key < other.key
return self.key < self.get_other_key(other)


class FakeCrashMover:
"""Fake crash mover that raises an error when used"""

Expand Down Expand Up @@ -346,13 +374,14 @@ def test_extract_payload_invalid_json_not_dict(self, request_generator):
with pytest.raises(MalformedCrashReport, match="invalid_json_value"):
bsp.extract_payload(req)

def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
# If there's a JSON blob and also kv pairs, then that's a malformed
# crash
def test_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
# If there's a JSON blob and also kv pairs, use the annotations from "extra" and
# log a note
data, headers = multipart_encode(
{
"extra": '{"ProductName":"Firefox","Version":"1.0"}',
"BadKey": "BadValue",
# This annotation is dropped because it's not in "extra"
"IgnoredAnnotation": "someval",
"upload_file_minidump": ("fakecrash.dump", io.BytesIO(b"abcd1234")),
}
)
Expand All @@ -363,10 +392,20 @@ def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
bsp = BreakpadSubmitterResource(
config=EMPTY_CONFIG, crashmover=FakeCrashMover()
)
with metricsmock as metrics:
result = bsp.extract_payload(req)
assert result == ({}, {})
assert metrics.has_record(stat="malformed", tags=["reason:has_json_and_kv"])
crash_report = CrashReport(
annotations={
"ProductName": "Firefox",
"Version": "1.0",
},
dumps={"upload_file_minidump": b"abcd1234"},
notes=[
"includes annotations in both json-encoded extra and formdata parts"
],
payload="json",
payload_compressed="0",
payload_size=542,
)
assert bsp.extract_payload(req) == crash_report


@pytest.mark.parametrize(
Expand Down Expand Up @@ -405,7 +444,7 @@ def test_get_throttle_result(client):


class TestBreakpadSubmitterResourceIntegration:
def test_submit_crash_report(self, client):
def test_submit_crash_report(self, client, metricsmock):
data, headers = multipart_encode(
{
"ProductName": "Firefox",
Expand All @@ -415,7 +454,9 @@ def test_submit_crash_report(self, client):
}
)

result = client.simulate_post("/submit", headers=headers, body=data)
with metricsmock as mm:
result = client.simulate_post("/submit", headers=headers, body=data)

assert result.status_code == 200
assert result.headers["Content-Type"].startswith("text/plain")
assert result.content.startswith(b"CrashID=bp")
Expand Down Expand Up @@ -446,6 +487,22 @@ def test_submit_crash_report(self, client):
"version": 2,
}

mm.assert_histogram("socorro.collector.breakpad_resource.crash_size", value=632)
mm.assert_incr("socorro.collector.breakpad_resource.incoming_crash")
mm.assert_incr(
"socorro.collector.breakpad_resource.throttle_rule",
tags=["rule:is_nightly", AnyTagValue("host")],
)
mm.assert_incr(
"socorro.collector.breakpad_resource.throttle",
tags=["result:accept", AnyTagValue("host")],
)
mm.assert_timing("socorro.collector.crashmover.crash_save.time")
mm.assert_timing("socorro.collector.crashmover.crash_publish.time")
mm.assert_incr("socorro.collector.crashmover.save_crash.count")
mm.assert_timing("socorro.collector.crashmover.crash_handling.time")
mm.assert_timing("socorro.collector.breakpad_resource.on_post.time")

def test_existing_uuid(self, client):
"""Verify if the crash report has a uuid already, it's reused."""
crash_id = "de1bb258-cbbf-4589-a673-34f800160918"
Expand Down
2 changes: 1 addition & 1 deletion tests/unittest/test_sentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,4 @@ def test_count_sentry_scrub_error():
with MetricsMock() as metricsmock:
metricsmock.clear_records()
count_sentry_scrub_error("foo")
metricsmock.assert_incr("app.sentry_scrub_error", value=1)
metricsmock.assert_incr("socorro.collector.sentry_scrub_error", value=1)

0 comments on commit 739b4cc

Please sign in to comment.