Tweak statsd and arroyo metrics config (#297)

NicoHinderling · web-flow · commit dd1042086a35 · 2025-08-15T14:01:04.000-07:00
* Tweak statsd and arroyo metrics config

* add a constant `consumer_group` tag to the Arroyo metrics backend

* from the env var

* drop the hard-coded `launchpad.` prefix from metric names (the statsd client namespace now supplies it)

* update unit and integration tests for the renamed metrics
diff --git a/src/launchpad/kafka.py b/src/launchpad/kafka.py
@@ -47,8 +47,8 @@ def create_kafka_consumer(
         os.environ["KAFKA_HEALTHCHECK_FILE"] = healthcheck_path
         logger.info(f"Using healthcheck file: {healthcheck_path}")
 
-    configure_metrics(DatadogMetricsBackend())
     config = get_kafka_config()
+    configure_metrics(DatadogMetricsBackend(config.group_id))
 
     environment = os.getenv("LAUNCHPAD_ENV")
     if not environment:
diff --git a/src/launchpad/service.py b/src/launchpad/service.py
@@ -136,14 +136,14 @@ def handle_kafka_message(self, payload: PreprodArtifactEvents) -> None:
             logger.info(f"Processing artifact: {artifact_id} (project: {project_id}, org: {organization_id})")
 
             if self._statsd:
-                self._statsd.increment("launchpad.artifact.processing.started")
+                self._statsd.increment("artifact.processing.started")
 
             self.process_artifact(artifact_id, project_id, organization_id)
 
             logger.info(f"Analysis completed for artifact {artifact_id}")
 
             if self._statsd:
-                self._statsd.increment("launchpad.artifact.processing.completed")
+                self._statsd.increment("artifact.processing.completed")
 
         except Exception as e:
             # Log the full error for debugging
@@ -153,7 +153,7 @@ def handle_kafka_message(self, payload: PreprodArtifactEvents) -> None:
             )
 
             if self._statsd:
-                self._statsd.increment("launchpad.artifact.processing.failed")
+                self._statsd.increment("artifact.processing.failed")
 
     def process_artifact(self, artifact_id: str, project_id: str, organization_id: str) -> None:
         """
@@ -346,7 +346,7 @@ def _update_artifact_error(
             # Log error to datadog with tags for better monitoring
             if self._statsd:
                 self._statsd.increment(
-                    "launchpad.artifact.processing.error",
+                    "artifact.processing.error",
                     tags=[
                         f"error_code:{error_code.value}",
                         f"error_type:{error_message.name}",
diff --git a/src/launchpad/utils/arroyo_metrics.py b/src/launchpad/utils/arroyo_metrics.py
@@ -5,7 +5,6 @@
 from typing import Optional, Union
 
 from arroyo.utils.metrics import MetricName, Metrics, Tags
-from datadog.dogstatsd.base import DogStatsd
 
 from launchpad.utils.statsd import get_statsd
 
@@ -17,8 +16,9 @@ class DatadogMetricsBackend(Metrics):
     This bridges Arroyo's metrics interface with DataDog StatsD.
     """
 
-    def __init__(self, statsd: Optional[DogStatsd] = None) -> None:
-        self._statsd = statsd or get_statsd()
+    def __init__(self, group_id: str) -> None:
+        self._statsd = get_statsd("consumer")
+        self._constant_tags = {"consumer_group": group_id}
 
     def increment(
         self,
@@ -46,7 +46,11 @@ def timing(self, name: MetricName, value: Union[int, float], tags: Optional[Tags
         self._statsd.timing(name, timing_value, tags=self._format_tags(tags))
 
     def _format_tags(self, tags: Optional[Tags]) -> Optional[list[str]]:
-        """Convert Arroyo tags format to DataDog tags format."""
-        if not tags:
+        """Convert Arroyo tags format to DataDog tags format, merging with constant tags."""
+        merged_tags = self._constant_tags.copy()
+        if tags:
+            merged_tags.update(tags)
+
+        if not merged_tags:
             return None
-        return [f"{key}:{value}" for key, value in tags.items()]
+        return [f"{key}:{value}" for key, value in merged_tags.items()]
diff --git a/src/launchpad/utils/statsd.py b/src/launchpad/utils/statsd.py
@@ -1,5 +1,7 @@
 import os
 
+from typing import Literal
+
 from datadog.dogstatsd.base import DogStatsd
 
 # There are a few weird issues with DataDog documented in other Sentry repos.
@@ -12,29 +14,34 @@
 # - not using the global initialize() and statsd instances.
 
 
-_statsd: DogStatsd | None = None
+_statsd_instances: dict[str, DogStatsd] = {}
+
+
+def get_statsd(environment: Literal["default", "consumer"] = "default") -> DogStatsd:
+    global _statsd_instances
 
+    if environment in _statsd_instances:
+        return _statsd_instances[environment]
 
-def get_statsd() -> DogStatsd:
-    global _statsd
+    disable_telemetry = True
+    origin_detection_enabled = False
 
-    if s := _statsd:
-        # Type checker does not seem to be able to work out _statsd
-        # must be set here hence the :=.
-        return s
-    else:
-        disable_telemetry = True
-        origin_detection_enabled = False
+    host = os.getenv("STATSD_HOST", "127.0.0.1")
+    port_str = os.getenv("STATSD_PORT", "8125")
 
-        host = os.getenv("STATSD_HOST", "127.0.0.1")
-        port_str = os.getenv("STATSD_PORT", "8125")
+    try:
+        port = int(port_str)
+    except ValueError:
+        raise ValueError(f"STATSD_PORT must be a valid integer, got: {port_str}")
 
-        try:
-            port = int(port_str)
-        except ValueError:
-            raise ValueError(f"STATSD_PORT must be a valid integer, got: {port_str}")
+    # Namespace is prepended to every metric name; the consumer client gets its own prefix.
+    namespace = "launchpad" if environment == "default" else "launchpad_consumer"
 
-        _statsd = DogStatsd(
-            host=host, port=port, disable_telemetry=disable_telemetry, origin_detection_enabled=origin_detection_enabled
-        )
-        return _statsd
+    _statsd_instances[environment] = DogStatsd(
+        host=host,
+        port=port,
+        namespace=namespace,
+        disable_telemetry=disable_telemetry,
+        origin_detection_enabled=origin_detection_enabled,
+    )
+    return _statsd_instances[environment]
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
@@ -48,8 +48,8 @@ async def test_kafka_message_processing(self):
             mock_process.assert_called_once_with("ios-test-123", "test-project-ios", "test-org-123")
 
             # Verify statsd metrics were sent
-            service._statsd.increment.assert_any_call("launchpad.artifact.processing.started")
-            service._statsd.increment.assert_any_call("launchpad.artifact.processing.completed")
+            service._statsd.increment.assert_any_call("artifact.processing.started")
+            service._statsd.increment.assert_any_call("artifact.processing.completed")
 
             # Reset mocks for next test
             mock_process.reset_mock()
diff --git a/tests/integration/test_service.py b/tests/integration/test_service.py
@@ -72,8 +72,8 @@ def test_handle_kafka_message_ios(self, mock_process):
         mock_process.assert_called_once_with("ios-test-123", "test-project-ios", "test-org-123")
 
         # Verify metrics were recorded
-        service._statsd.increment.assert_any_call("launchpad.artifact.processing.started")
-        service._statsd.increment.assert_any_call("launchpad.artifact.processing.completed")
+        service._statsd.increment.assert_any_call("artifact.processing.started")
+        service._statsd.increment.assert_any_call("artifact.processing.completed")
 
     @patch.object(LaunchpadService, "process_artifact")
     def test_handle_kafka_message_android(self, mock_process):
@@ -97,8 +97,8 @@ def test_handle_kafka_message_android(self, mock_process):
         mock_process.assert_called_once_with("android-test-456", "test-project-android", "test-org-456")
 
         # Verify metrics were recorded
-        service._statsd.increment.assert_any_call("launchpad.artifact.processing.started")
-        service._statsd.increment.assert_any_call("launchpad.artifact.processing.completed")
+        service._statsd.increment.assert_any_call("artifact.processing.started")
+        service._statsd.increment.assert_any_call("artifact.processing.completed")
 
     @patch.object(LaunchpadService, "process_artifact")
     def test_handle_kafka_message_error(self, mock_process):
@@ -127,5 +127,5 @@ def test_handle_kafka_message_error(self, mock_process):
         # Verify the metrics were called correctly
         calls = service._statsd.increment.call_args_list
         assert len(calls) == 2
-        assert calls[0][0][0] == "launchpad.artifact.processing.started"
-        assert calls[1][0][0] == "launchpad.artifact.processing.failed"
+        assert calls[0][0][0] == "artifact.processing.started"
+        assert calls[1][0][0] == "artifact.processing.failed"
diff --git a/tests/unit/test_service_error_handling.py b/tests/unit/test_service_error_handling.py
@@ -246,7 +246,7 @@ def test_update_artifact_error_with_detailed_message(self, mock_sentry_client):
 
         # Verify datadog logging
         service._statsd.increment.assert_called_once_with(
-            "launchpad.artifact.processing.error",
+            "artifact.processing.error",
             tags=[
                 "error_code:3",
                 "error_type:PREPROCESSING_FAILED",