Skip to content
Merged
1 change: 1 addition & 0 deletions datadog_checks_base/changelog.d/21703.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added error health event for DBM async jobs
16 changes: 16 additions & 0 deletions datadog_checks_base/datadog_checks/base/utils/db/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from datadog_checks.base.stubs import datadog_agent


import traceback
from enum import Enum


Expand All @@ -28,6 +29,7 @@ class HealthEvent(Enum):
"""

INITIALIZATION = 'initialization'
UNKNOWN_ERROR = 'unknown_error'


class HealthStatus(Enum):
Expand Down Expand Up @@ -79,3 +81,17 @@ def submit_health_event(self, name: HealthEvent, status: HealthStatus, tags: lis
),
"dbm-health",
)

def submit_error_health_event(self, exception: Exception, **kwargs):
    """
    Submit an ``UNKNOWN_ERROR`` health event describing where ``exception`` was raised.

    The innermost frame of the exception's traceback supplies the ``file``,
    ``line`` and ``function`` fields, and the exception's class name is sent as
    ``exception_type``. The event is submitted with ``HealthStatus.ERROR``.

    If the exception carries no traceback frames (for example, it was
    constructed but never raised), nothing is submitted.

    :param exception: The exception to report.
    :param kwargs: Extra keyword arguments forwarded unchanged to
        ``submit_health_event``.
    """
    frames = traceback.extract_tb(exception.__traceback__)
    if not frames:
        # An exception without a traceback yields an empty frame list; calling
        # ``pop()`` on it would raise IndexError before any guard could run.
        return

    # The last extracted frame is the one where the exception was raised.
    origin = frames[-1]
    self.submit_health_event(
        name=HealthEvent.UNKNOWN_ERROR,
        status=HealthStatus.ERROR,
        file=origin.filename,
        line=origin.lineno,
        function=origin.name,
        exception_type=type(exception).__name__,
        **kwargs,
    )
5 changes: 5 additions & 0 deletions datadog_checks_base/datadog_checks/base/utils/db/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,11 @@ def _job_loop(self):
tags=self._job_tags + ["error:crash-{}".format(type(e))],
raw=True,
)
# Report the crash as a health event only when the job was not cancelled and
# the check exposes a health object.
if not self._cancel_event.is_set() and self._check.health:
    try:
        self._check.health.submit_error_health_event(e, job_name=self._job_name)
    except Exception:
        # Do not rebind ``e`` here: Python unbinds the ``as`` target when the
        # handler exits, which would also discard the outer exception variable.
        # ``Logger.exception`` already appends the active exception's traceback,
        # so only the single ``%s`` placeholder needs an argument.
        self._log.exception("[%s] Failed to submit error health event", self._job_tags_str)
finally:
self._log.info("[%s] Shutting down job loop", self._job_tags_str)
if self._shutdown_callback:
Expand Down
35 changes: 34 additions & 1 deletion datadog_checks_base/tests/base/utils/db/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from datadog_checks.base import AgentCheck
from datadog_checks.base.stubs.datadog_agent import datadog_agent
from datadog_checks.base.utils.db.health import Health, HealthEvent, HealthStatus
from datadog_checks.base.utils.db.utils import (
ConstantRateLimiter,
DBMAsyncJob,
Expand Down Expand Up @@ -123,7 +124,35 @@ def test_ratelimiting_ttl_cache():
assert cache.acquire(i), "cache should be empty again so these keys should go in OK"


class DBExceptionForTests(BaseException):
def test_dbm_async_job_unknown_error(aggregator):
    """
    A job whose ``run_job`` raises an unexpected exception must emit exactly one
    ``unknown_error`` health event that identifies the raising frame.
    """
    check = AgentCheck()
    check.health = Health(check)
    job = JobForTesting(check, exception=UnexpectedExceptionForTests())

    try:
        job.run_job_loop(["hello:there"])
        job._job_loop_future.result(timeout=10)
        job.cancel()
    except Exception:
        # Best effort: any failure while driving the job still surfaces through
        # the event assertions below. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
        pass

    # Assert outside of ``finally`` so a failing assertion cannot mask an
    # exception that is already propagating.
    events = aggregator.get_event_platform_events("dbm-health")
    assert len(events) == 1

    health_event = events[0]
    assert health_event['name'] == HealthEvent.UNKNOWN_ERROR.value
    assert health_event['status'] == HealthStatus.ERROR.value

    data = health_event['data']
    assert data['file'].endswith('test_util.py')
    assert data['line'] is not None
    assert data['function'] == 'run_job'
    assert data['exception_type'] == 'UnexpectedExceptionForTests'


class DBExceptionForTests(Exception):
    """Plain ``Exception`` subclass reserved for the tests in this module."""


class UnexpectedExceptionForTests(Exception):
    """Plain ``Exception`` subclass that tests hand to a job so ``run_job`` raises it."""


Expand Down Expand Up @@ -250,6 +279,7 @@ def __init__(
min_collection_interval=15,
job_execution_time=0,
max_sleep_chunk_s=5,
exception=None,
):
super(JobForTesting, self).__init__(
check,
Expand All @@ -266,13 +296,16 @@ def __init__(
)
self._job_execution_time = job_execution_time
self.count_executed = 0
self._exception = exception

def test_shutdown(self):
    """
    Record one occurrence of the ``dbm.async_job_test.shutdown`` count metric
    on the owning check.
    """
    # NOTE(review): presumably registered as the job's shutdown callback;
    # confirm against the constructor, which is not fully visible here.
    shutdown_metric = "dbm.async_job_test.shutdown"
    self._check.count(shutdown_metric, 1)

def run_job(self):
    """
    Run one iteration of the test job.

    Emits the ``dbm.async_job_test.run_job`` count metric, increments
    ``count_executed``, then either raises the configured exception (when one
    is set and truthy) or sleeps for ``_job_execution_time`` seconds.
    """
    self._check.count("dbm.async_job_test.run_job", 1)
    self.count_executed = self.count_executed + 1

    configured_error = self._exception
    if not configured_error:
        # No failure requested: simulate the job's work by sleeping.
        time.sleep(self._job_execution_time)
        return
    raise configured_error


Expand Down
Loading