From 9717e07b740645d4b116fd2e10e9d0374405797d Mon Sep 17 00:00:00 2001
From: nizar-m <19857260+nizar-m@users.noreply.github.com>
Date: Thu, 19 Dec 2024 00:43:53 +0530
Subject: [PATCH] Add metrics hasura_events_fetch_query_time and
hasura_events_fetched_total
PR-URL: https://github.com/hasura/graphql-engine-mono/pull/11081
Co-authored-by: Rob Dominguez <24390149+robertjdominguez@users.noreply.github.com>
GitOrigin-RevId: da1d8b97eb053a4af558222ed1e6f00571e390aa
---
.../enterprise-edition/prometheus/metrics.mdx | 32 ++++++--
.../src-lib/Hasura/Eventing/EventTrigger.hs | 74 ++++++++++++++-----
server/src-lib/Hasura/Server/Prometheus.hs | 27 +++++++
3 files changed, 109 insertions(+), 24 deletions(-)
diff --git a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
index 881d0bb1a81ae..357a91e4e76df 100644
--- a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
+++ b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
@@ -80,12 +80,32 @@ represents the time taken to fetch a batch of events from the database.
A higher metric indicates slower polling of events from the database, you should consider looking into the performance
of your database.
-| | |
-| ------ | ------------------------------------------------------------------------------------------ |
-| Name | `hasura_event_fetch_time_per_batch_seconds` |
-| Type | Histogram
Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10 |
-| Labels | none |
-| Unit | seconds |
+| | |
+| ------ | --------------------------------------------------------------------------------------------------- |
+| Name | `hasura_event_fetch_time_per_batch_seconds` |
+| Type | Histogram
Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
+| Labels | none |
+| Unit | seconds |
+
+#### Event fetch query time
+
+This metric represents the time taken by the query to fetch events from the database.
+
+| | |
+| ------ | --------------------------------------------------------------------------------------------------- |
+| Name | `hasura_events_fetch_query_time_seconds` |
+| Type | Histogram
Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
+| Labels | `status`: success \| failed, `source_name` |
+
+#### Total events fetch
+
+This metric represents the total number of events fetched from a datasource.
+
+| | |
+| ------ | ---------------------------------------------------------- |
+| Name | `hasura_events_fetched_total` |
+| Type | Counter |
+| Labels | `source_name` |
#### Event invocations total
diff --git a/server/src-lib/Hasura/Eventing/EventTrigger.hs b/server/src-lib/Hasura/Eventing/EventTrigger.hs
index 21aa3a678686e..52490cf7beee9 100644
--- a/server/src-lib/Hasura/Eventing/EventTrigger.hs
+++ b/server/src-lib/Hasura/Eventing/EventTrigger.hs
@@ -53,6 +53,7 @@ import Data.Aeson qualified as J
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.Aeson.Lens qualified as JL
+import Data.Either (isRight)
import Data.Has
import Data.HashMap.Strict qualified as HashMap
import Data.SerializableBlob qualified as SB
@@ -94,6 +95,7 @@ import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge
import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram
+import System.Metrics.Prometheus.HistogramVector qualified as HistogramVector
import System.Timeout.Lifted (timeout)
newtype EventInternalErr
@@ -346,30 +348,60 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
-- only process events for this source if at least one event trigger exists
if eventTriggerCount > 0
then do
- eventPollStartTime <- getCurrentTime
- runExceptT (fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize)) >>= \case
- Right events -> do
- let eventFetchCount = fromIntegral $ length events
- Prometheus.Gauge.set (eventsFetchedPerBatch eventTriggerMetrics) eventFetchCount
- if (null events)
- then return []
- else do
- eventsFetchedTime <- getCurrentTime -- This is also the poll end time
- let eventPollTime = realToFrac $ diffUTCTime eventsFetchedTime eventPollStartTime
- _ <- EKG.Distribution.add (smEventFetchTimePerBatch serverMetrics) eventPollTime
- Prometheus.Histogram.observe (eventsFetchTimePerBatch eventTriggerMetrics) eventPollTime
- _ <- EKG.Distribution.add (smNumEventsFetchedPerBatch serverMetrics) (fromIntegral $ length events)
- saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents
- return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events
- Left err -> do
- L.unLogger logger $ EventInternalErr err
- pure []
+ collectMetrics
+ (observeFetchQuery sourceName)
+ (runExceptT $ fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize))
+ >>= \case
+ Right events -> do
+ if (null events)
+ then return []
+ else do
+ saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents
+ eventsFetchedTime <- getCurrentTime -- This is also the poll end time
+ return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events
+ Left err -> do
+ L.unLogger logger $ EventInternalErr err
+ pure []
else pure []
-- Log the statistics of events fetched
logFetchedEventsStatistics statsLogger events
pure events
+ observeFetchQuery :: SourceName -> DiffTime -> Either a [Event b] -> IO ()
+ observeFetchQuery sourceName fetchQueryTime result = do
+ let status = either (const Failed) (const Success) result
+ eventsCount = either (const 0) length result
+
+ -- time taken by event fetch query with the status label
+ HistogramVector.observe
+ (eventsFetchQueryTime eventTriggerMetrics)
+ (EventsFetchQueryStatusLabel sourceName status)
+ (realToFrac fetchQueryTime)
+
+ -- metrics total events fetched by source
+ when (isRight result)
+ $ Prometheus.Gauge.set
+ (eventsFetchedPerBatch eventTriggerMetrics)
+ (fromIntegral eventsCount)
+
+ when (eventsCount > 0) $ do
+ _ <-
+ EKG.Distribution.add
+ (smEventFetchTimePerBatch serverMetrics)
+ (realToFrac fetchQueryTime)
+ Prometheus.Histogram.observe
+ (eventsFetchTimePerBatch eventTriggerMetrics)
+ (realToFrac fetchQueryTime)
+ _ <-
+ EKG.Distribution.add
+ (smNumEventsFetchedPerBatch serverMetrics)
+ (fromIntegral eventsCount)
+ CounterVector.add
+ (eventsFetchedTotal eventTriggerMetrics)
+ (EventTriggerSourceLabel sourceName)
+ (fromIntegral eventsCount)
+
-- !!! CAREFUL !!!
-- The logic here in particular is subtle and has been fixed, broken,
-- and fixed again in several different ways, several times.
@@ -808,3 +840,9 @@ incEventTriggerCounterWithLabel getMetricState alwaysObserve counterVector (Even
alwaysObserve
(liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status tl))
(liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status Nothing))
+
+collectMetrics :: (MonadIO m) => (DiffTime -> a -> m ()) -> m a -> m a
+collectMetrics metricsFunc action = do
+ (duration, result) <- withElapsedTime action
+ metricsFunc duration result
+ pure result
diff --git a/server/src-lib/Hasura/Server/Prometheus.hs b/server/src-lib/Hasura/Server/Prometheus.hs
index 967f1540e5ca6..f1cf634b5cf30 100644
--- a/server/src-lib/Hasura/Server/Prometheus.hs
+++ b/server/src-lib/Hasura/Server/Prometheus.hs
@@ -20,6 +20,8 @@ module Hasura.Server.Prometheus
decWebsocketConnections,
ScheduledTriggerMetrics (..),
SubscriptionMetrics (..),
+ EventTriggerSourceLabel (..),
+ EventsFetchQueryStatusLabel (..),
DynamicEventTriggerLabel (..),
ResponseStatus (..),
responseStatusToLabelValue,
@@ -92,6 +94,8 @@ data GraphQLRequestMetrics = GraphQLRequestMetrics
data EventTriggerMetrics = EventTriggerMetrics
{ eventTriggerHTTPWorkers :: Gauge,
+ eventsFetchQueryTime :: HistogramVector EventsFetchQueryStatusLabel,
+ eventsFetchedTotal :: CounterVector EventTriggerSourceLabel,
eventsFetchedPerBatch :: Gauge,
eventQueueTimeSeconds :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventsFetchTimePerBatch :: Histogram,
@@ -174,6 +178,8 @@ makeDummyGraphQLRequestMetrics = do
makeDummyEventTriggerMetrics :: IO EventTriggerMetrics
makeDummyEventTriggerMetrics = do
eventTriggerHTTPWorkers <- Gauge.new
+ eventsFetchQueryTime <- HistogramVector.new []
+ eventsFetchedTotal <- CounterVector.new
eventsFetchedPerBatch <- Gauge.new
eventQueueTimeSeconds <- HistogramVector.new []
eventsFetchTimePerBatch <- Histogram.new []
@@ -274,6 +280,27 @@ modifyConnectionsGauge ::
modifyConnectionsGauge f (ConnectionsGauge ref) =
atomicModifyIORef' ref $ \connections -> (f connections, ())
+data EventTriggerSourceLabel = EventTriggerSourceLabel
+ { _eslSourceName :: SourceName
+ }
+ deriving (Eq, Ord)
+
+instance ToLabels EventTriggerSourceLabel where
+ toLabels (EventTriggerSourceLabel {..}) = Map.singleton "source_name" $ sourceNameToText _eslSourceName
+
+data EventsFetchQueryStatusLabel = EventsFetchQueryStatusLabel
+ { _efqsSourceName :: SourceName,
+ _efqsStatus :: ResponseStatus
+ }
+ deriving (Eq, Ord)
+
+instance ToLabels EventsFetchQueryStatusLabel where
+ toLabels (EventsFetchQueryStatusLabel sourceName status) =
+ Map.fromList
+ [ ("source_name", sourceNameToText sourceName),
+ ("status", responseStatusToLabelValue status)
+ ]
+
data DynamicEventTriggerLabel = DynamicEventTriggerLabel
{ _detlTriggerName :: TriggerName,
_detlSourceName :: SourceName