diff --git a/libs/back/tracer/src/eventLoop.ts b/libs/back/tracer/src/eventLoop.ts
deleted file mode 100644
index 30818b0671..0000000000
--- a/libs/back/tracer/src/eventLoop.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import { logger } from "@td/logger";
-import { trace } from "@opentelemetry/api";
-import { hrtime } from "node:process";
-
-const THRESHOLD_NS =
-  parseInt(process.env.EVENT_LOOP_THRESHOLD_MS ?? "200", 10) * 1e6;
-const CHECK_INTERVAL = 100;
-
-export function enableEventLoopMonitor() {
-  let start = hrtime.bigint();
-
-  setInterval(function () {
-    const end = process.hrtime.bigint();
-    const delta = end - start;
-
-    if (delta > THRESHOLD_NS) {
-      const deltaInMs = Number(delta / BigInt(1e6));
-      const activeSpan = trace.getActiveSpan();
-      logger.warn({
-        label: "EventLoopMonitor",
-        message: `Event loop was blocked for ${deltaInMs}ms`,
-        metadata: {
-          delta: deltaInMs,
-          traceId: activeSpan?.spanContext().traceId,
-          spanId: activeSpan?.spanContext().spanId
-        }
-      });
-    }
-    start = hrtime.bigint();
-  }, CHECK_INTERVAL).unref();
-}
diff --git a/libs/back/tracer/src/index.ts b/libs/back/tracer/src/index.ts
index 776ccc6cd7..386920daee 100644
--- a/libs/back/tracer/src/index.ts
+++ b/libs/back/tracer/src/index.ts
@@ -21,6 +21,7 @@ import {
 import { PrismaInstrumentation } from "@prisma/instrumentation";
 import { ElasticsearchInstrumentation } from "opentelemetry-instrumentation-elasticsearch";
 import { getAppRootFolderName } from "./utils";
+import { initializeNodeRuntimeMetrics } from "./metric";
 
 // Incubating attributes
 const ATTR_CLOUD_REGION = "cloud.region";
@@ -68,14 +69,21 @@ if (process.env.NODE_ENV !== "test" && !process.env.OTEL_SDK_DISABLED) {
   try {
     sdk.start();
     console.info("Telemetry started");
+
+    const cleanupRuntimeMetrics = initializeNodeRuntimeMetrics();
+
+    process.on("SIGTERM", () => {
+      cleanupRuntimeMetrics();
+      // shutdown() returns a promise; a failed final flush must not surface
+      // as an unhandled rejection while the process is terminating.
+      sdk.shutdown().catch((shutdownError) => {
+        console.error("Error shutting down OpenTelemetry SDK", shutdownError);
+      });
+    });
   } catch (error) {
     console.error(
       "Error initializing OpenTelemetry SDK. Your application is not instrumented and will not produce telemetry",
       error
     );
   }
-
-  process.on("SIGTERM", () => {
-    sdk.shutdown();
-  });
 }
diff --git a/libs/back/tracer/src/metric.ts b/libs/back/tracer/src/metric.ts
new file mode 100644
index 0000000000..770922fae0
--- /dev/null
+++ b/libs/back/tracer/src/metric.ts
@@ -0,0 +1,109 @@
+import { metrics } from "@opentelemetry/api";
+
+const METRIC_INTERVAL_MS = 30000;
+
+/**
+ * Creates the Node.js runtime instruments (memory, uptime, event loop lag and
+ * CPU time) on the "nodejs-runtime-metrics" meter and records them every
+ * METRIC_INTERVAL_MS milliseconds.
+ *
+ * Memory, uptime and CPU values are also recorded once, synchronously, before
+ * this function returns.
+ *
+ * @returns A cleanup function that stops the periodic recording.
+ */
+export function initializeNodeRuntimeMetrics(): () => void {
+  const meter = metrics.getMeter("nodejs-runtime-metrics");
+
+  // Memory metrics (gauges)
+  const heapUsedGauge = meter.createGauge("nodejs_heap_used", {
+    description: "V8 heap memory used in bytes",
+    unit: "By"
+  });
+
+  const heapTotalGauge = meter.createGauge("nodejs_heap_total", {
+    description: "Total allocated V8 heap memory in bytes",
+    unit: "By"
+  });
+
+  const heapRssGauge = meter.createGauge("nodejs_heap_rss", {
+    description: "Resident set size (total memory usage) in bytes",
+    unit: "By"
+  });
+
+  const heapExternalGauge = meter.createGauge("nodejs_heap_external", {
+    description: "V8 external memory usage in bytes",
+    unit: "By"
+  });
+
+  // Process metrics (gauges)
+  const processUptimeGauge = meter.createGauge("nodejs_process_uptime", {
+    description: "Process uptime in seconds",
+    unit: "s"
+  });
+
+  // Performance metrics (histograms)
+  const eventLoopLagHistogram = meter.createHistogram("nodejs_eventloop_lag", {
+    description: "Event loop lag in milliseconds",
+    unit: "ms"
+  });
+
+  const cpuUserHistogram = meter.createHistogram("nodejs_cpu_user", {
+    description: "CPU user time delta in milliseconds",
+    unit: "ms"
+  });
+
+  const cpuSystemHistogram = meter.createHistogram("nodejs_cpu_system", {
+    description: "CPU system time delta in milliseconds",
+    unit: "ms"
+  });
+
+  let lastCpuUsage = process.cpuUsage();
+
+  // Measures how long a setImmediate callback waits before it gets to run.
+  const sampleEventLoopLag = () => {
+    const start = process.hrtime.bigint();
+    setImmediate(() => {
+      const lag = Number(process.hrtime.bigint() - start) / 1e6; // Convert to milliseconds
+      eventLoopLagHistogram.record(lag);
+    });
+  };
+
+  const recordRuntimeMetrics = () => {
+    const memUsage = process.memoryUsage();
+
+    heapUsedGauge.record(memUsage.heapUsed);
+    heapTotalGauge.record(memUsage.heapTotal);
+    heapRssGauge.record(memUsage.rss);
+    heapExternalGauge.record(memUsage.external);
+
+    processUptimeGauge.record(process.uptime());
+
+    // Record CPU metrics (deltas)
+    const currentCpuUsage = process.cpuUsage();
+    const userDelta = (currentCpuUsage.user - lastCpuUsage.user) / 1000; // Convert to milliseconds
+    const systemDelta = (currentCpuUsage.system - lastCpuUsage.system) / 1000;
+
+    cpuUserHistogram.record(userDelta);
+    cpuSystemHistogram.record(systemDelta);
+
+    lastCpuUsage = currentCpuUsage;
+  };
+
+  const metricsInterval = setInterval(() => {
+    recordRuntimeMetrics();
+    sampleEventLoopLag();
+  }, METRIC_INTERVAL_MS);
+  // Allow process to exit even with pending timer
+  metricsInterval.unref();
+
+  // The initial call runs synchronously during startup, so a lag sample taken
+  // here could not fire until the caller's remaining synchronous work is done
+  // and would record that startup time as event loop lag. Lag is therefore
+  // only sampled from the interval.
+  recordRuntimeMetrics();
+
+  return () => {
+    clearInterval(metricsInterval);
+  };
+}