diff --git a/antenna/app.py b/antenna/app.py
index 2513ad2d..f861083e 100644
--- a/antenna/app.py
+++ b/antenna/app.py
@@ -59,7 +59,7 @@ def count_sentry_scrub_error(msg):
-    METRICS.incr("app.sentry_scrub_error", value=1)
+    METRICS.incr("collector.sentry_scrub_error", value=1)
 
 
 def configure_sentry(app_config):
diff --git a/antenna/breakpad_resource.py b/antenna/breakpad_resource.py
index e56a747f..90805f6d 100644
--- a/antenna/breakpad_resource.py
+++ b/antenna/breakpad_resource.py
@@ -143,7 +143,7 @@ def extract_payload(self, req):
 
         # Decompress payload if it's compressed
         if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
-            METRICS.incr("breakpad_resource.gzipped_crash")
+            METRICS.incr("collector.breakpad_resource.gzipped_crash")
             crash_report.payload_compressed = "1"
 
             # If the content is gzipped, we pull it out and decompress it. We
@@ -154,13 +154,13 @@ def extract_payload(self, req):
             try:
                 data = zlib.decompress(req.stream.read(content_length), gzip_header)
                 METRICS.histogram(
-                    "breakpad_resource.gzipped_crash_decompress",
+                    "collector.breakpad_resource.gzipped_crash_decompress",
                     value=(time.perf_counter() - start_time) * 1000.0,
                     tags=["result:success"],
                 )
             except zlib.error as exc:
                 METRICS.histogram(
-                    "breakpad_resource.gzipped_crash_decompress",
+                    "collector.breakpad_resource.gzipped_crash_decompress",
                     value=(time.perf_counter() - start_time) * 1000.0,
                     tags=["result:fail"],
                 )
@@ -178,7 +178,7 @@ def extract_payload(self, req):
             data = io.BytesIO(data)
             METRICS.histogram(
-                "breakpad_resource.crash_size",
+                "collector.breakpad_resource.crash_size",
                 value=content_length,
                 tags=["payload:compressed"],
             )
@@ -186,7 +186,7 @@ def extract_payload(self, req):
         else:
             data = req.bounded_stream
             METRICS.histogram(
-                "breakpad_resource.crash_size",
+                "collector.breakpad_resource.crash_size",
                 value=content_length,
                 tags=["payload:uncompressed"],
             )
@@ -261,7 +261,7 @@ def extract_payload(self, req):
 
         if has_json and has_kvpairs:
             # If the crash payload has both kvpairs and a JSON blob, then it's malformed
-            # so we add a note and log it.
+            # so we add a note and log it, but we don't reject it
             msg = "includes annotations in both json-encoded extra and formdata parts"
             LOGGER.info(msg)
             crash_report.notes.append(msg)
@@ -302,7 +302,7 @@ def cleanup_crash_report(self, raw_crash):
                 del raw_crash[bad_field]
                 notes.append("Removed %s from raw crash." % bad_field)
 
-    @METRICS.timer_decorator("breakpad_resource.on_post.time")
+    @METRICS.timer_decorator("collector.breakpad_resource.on_post.time")
     def on_post(self, req, resp):
         """Handle incoming HTTP POSTs.
 
@@ -324,12 +324,14 @@ def on_post(self, req, resp):
         except MalformedCrashReport as exc:
             # If this is malformed, then reject it with malformed error code.
             msg = str(exc)
-            METRICS.incr("breakpad_resource.malformed", tags=["reason:%s" % msg])
+            METRICS.incr(
+                "collector.breakpad_resource.malformed", tags=["reason:%s" % msg]
+            )
             resp.status = falcon.HTTP_400
             resp.text = "Discarded=malformed_%s" % msg
             return
 
-        METRICS.incr("breakpad_resource.incoming_crash")
+        METRICS.incr("collector.breakpad_resource.incoming_crash")
 
         raw_crash = crash_report.annotations
@@ -380,9 +382,11 @@ def on_post(self, req, resp):
             rule_name,
             RESULT_TO_TEXT[throttle_result],
         )
-        METRICS.incr("breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name])
         METRICS.incr(
-            "breakpad_resource.throttle",
+            "collector.breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name]
+        )
+        METRICS.incr(
+            "collector.breakpad_resource.throttle",
             tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()],
         )
         raw_crash["metadata"]["throttle_rule"] = rule_name
diff --git a/antenna/crashmover.py b/antenna/crashmover.py
index 265d2c2d..2e57409c 100644
--- a/antenna/crashmover.py
+++ b/antenna/crashmover.py
@@ -17,7 +17,7 @@ def _incr_wait_generator(counter, attempts, sleep_seconds):
     def _generator_generator():
         for _ in range(attempts - 1):
-            METRICS.incr(f"crashmover.{counter}")
+            METRICS.incr(f"collector.crashmover.{counter}")
             yield sleep_seconds
 
     return _generator_generator
@@ -124,7 +124,7 @@ def check_health(self, state):
         if hasattr(self.crashpublish, "check_health"):
             self.crashpublish.check_health(state)
 
-    @METRICS.timer("crashmover.crash_handling.time")
+    @METRICS.timer("collector.crashmover.crash_handling.time")
     def handle_crashreport(self, raw_crash, dumps, crash_id):
         """Handle a new crash report synchronously and return whether that succeeded.
 
@@ -143,27 +143,27 @@ def handle_crashreport(self, raw_crash, dumps, crash_id):
         except MaxAttemptsError:
             # After max attempts, we give up on this crash
             LOGGER.error("%s: too many errors trying to save; dropped", crash_id)
-            METRICS.incr("crashmover.save_crash_dropped.count")
+            METRICS.incr("collector.crashmover.save_crash_dropped.count")
             return False
 
         try:
             self.crashmover_publish_with_retry(crash_report)
-            METRICS.incr("crashmover.save_crash.count")
+            METRICS.incr("collector.crashmover.save_crash.count")
         except MaxAttemptsError:
             LOGGER.error("%s: too many errors trying to publish; dropped", crash_id)
-            METRICS.incr("crashmover.publish_crash_dropped.count")
+            METRICS.incr("collector.crashmover.publish_crash_dropped.count")
             # return True even when publish fails because it will be automatically
             # published later via self-healing mechanisms
 
         return True
 
-    @METRICS.timer("crashmover.crash_save.time")
+    @METRICS.timer("collector.crashmover.crash_save.time")
     def crashmover_save(self, crash_report):
         """Save crash report to storage."""
         self.crashstorage.save_crash(crash_report)
         LOGGER.info("%s saved", crash_report.crash_id)
 
-    @METRICS.timer("crashmover.crash_publish.time")
+    @METRICS.timer("collector.crashmover.crash_publish.time")
     def crashmover_publish(self, crash_report):
         """Publish crash_id in publish queue."""
         self.crashpublish.publish_crash(crash_report)
diff --git a/antenna/health_resource.py b/antenna/health_resource.py
index 248d4b59..02938858 100644
--- a/antenna/health_resource.py
+++ b/antenna/health_resource.py
@@ -17,7 +17,7 @@ class BrokenResource:
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.broken.count")
+        METRICS.incr("collector.health.broken.count")
         # This is intentional breakage
         raise Exception("intentional exception")
@@ -30,7 +30,7 @@ def __init__(self, basedir):
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.version.count")
+        METRICS.incr("collector.health.version.count")
         version_info = get_version_info(self.basedir)
         # FIXME(willkg): there's no cloud provider environment variable to use, so
         # we'll cheat and look at whether there's a "gcs" in
@@ -52,7 +52,7 @@ class LBHeartbeatResource:
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.lbheartbeat.count")
+        METRICS.incr("collector.health.lbheartbeat.count")
         resp.content_type = "application/json; charset=utf-8"
         resp.status = falcon.HTTP_200
@@ -94,7 +94,7 @@ def __init__(self, app):
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.heartbeat.count")
+        METRICS.incr("collector.health.heartbeat.count")
         state = HealthState()
 
         # So we're going to think of Antenna like a big object graph and
@@ -107,7 +107,7 @@ def on_get(self, req, resp):
 
         # Go through and call gauge for each statsd item.
         for key, value in state.statsd.items():
-            METRICS.gauge(f"health.{key}", value=value)
+            METRICS.gauge(f"collector.health.{key}", value=value)
 
         if state.is_healthy():
             resp.status = falcon.HTTP_200
diff --git a/antenna/libmarkus.py b/antenna/libmarkus.py
index 37a43281..dff4d684 100644
--- a/antenna/libmarkus.py
+++ b/antenna/libmarkus.py
@@ -13,7 +13,7 @@
 _IS_MARKUS_SETUP = False
 LOGGER = logging.getLogger(__name__)
 
-METRICS = markus.get_metrics()
+METRICS = markus.get_metrics("socorro")
 
 
 def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
diff --git a/tests/unittest/test_breakpad_resource.py b/tests/unittest/test_breakpad_resource.py
index ac7da45d..77c54ee5 100644
--- a/tests/unittest/test_breakpad_resource.py
+++ b/tests/unittest/test_breakpad_resource.py
@@ -19,6 +19,34 @@
 from testlib.mini_poster import compress, multipart_encode
 
 
+class AnyTagValue:
+    """Matches a markus metrics tag with any value"""
+
+    def __init__(self, key):
+        self.key = key
+
+    def __repr__(self):
+        return f"<AnyTagValue {self.key}>"
+
+    def get_other_key(self, other):
+        # This is comparing against a tag string
+        if ":" in other:
+            other_key, _ = other.split(":")
+        else:
+            other_key = other
+        return other_key
+
+    def __eq__(self, other):
+        if isinstance(other, AnyTagValue):
+            return self.key == other.key
+        return self.key == self.get_other_key(other)
+
+    def __lt__(self, other):
+        if isinstance(other, AnyTagValue):
+            return self.key < other.key
+        return self.key < self.get_other_key(other)
+
+
 class FakeCrashMover:
     """Fake crash mover that raises an error when used"""
 
@@ -346,13 +374,14 @@ def test_extract_payload_invalid_json_not_dict(self, request_generator):
         with pytest.raises(MalformedCrashReport, match="invalid_json_value"):
             bsp.extract_payload(req)
 
-    def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
-        # If there's a JSON blob and also kv pairs, then that's a malformed
-        # crash
+    def test_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
+        # If there's a JSON blob and also kv pairs, use the annotations from "extra" and
+        # log a note
         data, headers = multipart_encode(
             {
                 "extra": '{"ProductName":"Firefox","Version":"1.0"}',
-                "BadKey": "BadValue",
+                # This annotation is dropped because it's not in "extra"
+                "IgnoredAnnotation": "someval",
                 "upload_file_minidump": ("fakecrash.dump", io.BytesIO(b"abcd1234")),
             }
         )
@@ -363,10 +392,20 @@ def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
         bsp = BreakpadSubmitterResource(
             config=EMPTY_CONFIG, crashmover=FakeCrashMover()
         )
-        with metricsmock as metrics:
-            result = bsp.extract_payload(req)
-        assert result == ({}, {})
-        assert metrics.has_record(stat="malformed", tags=["reason:has_json_and_kv"])
+        crash_report = CrashReport(
+            annotations={
+                "ProductName": "Firefox",
+                "Version": "1.0",
+            },
+            dumps={"upload_file_minidump": b"abcd1234"},
+            notes=[
+                "includes annotations in both json-encoded extra and formdata parts"
+            ],
+            payload="json",
+            payload_compressed="0",
+            payload_size=542,
+        )
+        assert bsp.extract_payload(req) == crash_report
 
     @pytest.mark.parametrize(
@@ -405,7 +444,7 @@ def test_get_throttle_result(client):
 
 
 class TestBreakpadSubmitterResourceIntegration:
-    def test_submit_crash_report(self, client):
+    def test_submit_crash_report(self, client, metricsmock):
         data, headers = multipart_encode(
             {
                 "ProductName": "Firefox",
@@ -415,7 +454,9 @@ def test_submit_crash_report(self, client):
             }
         )
 
-        result = client.simulate_post("/submit", headers=headers, body=data)
+        with metricsmock as mm:
+            result = client.simulate_post("/submit", headers=headers, body=data)
+
         assert result.status_code == 200
         assert result.headers["Content-Type"].startswith("text/plain")
         assert result.content.startswith(b"CrashID=bp")
@@ -446,6 +487,22 @@ def test_submit_crash_report(self, client):
             "version": 2,
         }
 
+        mm.assert_histogram("socorro.collector.breakpad_resource.crash_size", value=632)
+        mm.assert_incr("socorro.collector.breakpad_resource.incoming_crash")
+        mm.assert_incr(
+            "socorro.collector.breakpad_resource.throttle_rule",
+            tags=["rule:is_nightly", AnyTagValue("host")],
+        )
+        mm.assert_incr(
+            "socorro.collector.breakpad_resource.throttle",
+            tags=["result:accept", AnyTagValue("host")],
+        )
+        mm.assert_timing("socorro.collector.crashmover.crash_save.time")
+        mm.assert_timing("socorro.collector.crashmover.crash_publish.time")
+        mm.assert_incr("socorro.collector.crashmover.save_crash.count")
+        mm.assert_timing("socorro.collector.crashmover.crash_handling.time")
+        mm.assert_timing("socorro.collector.breakpad_resource.on_post.time")
+
     def test_existing_uuid(self, client):
         """Verify if the crash report has a uuid already, it's reused."""
         crash_id = "de1bb258-cbbf-4589-a673-34f800160918"
diff --git a/tests/unittest/test_sentry.py b/tests/unittest/test_sentry.py
index b0f18e2e..5e0cde4d 100644
--- a/tests/unittest/test_sentry.py
+++ b/tests/unittest/test_sentry.py
@@ -146,4 +146,4 @@ def test_count_sentry_scrub_error():
     with MetricsMock() as metricsmock:
         metricsmock.clear_records()
         count_sentry_scrub_error("foo")
-        metricsmock.assert_incr("app.sentry_scrub_error", value=1)
+        metricsmock.assert_incr("socorro.collector.sentry_scrub_error", value=1)