diff --git a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx index 881d0bb1a81ae..357a91e4e76df 100644 --- a/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx +++ b/docs/docs/observability/enterprise-edition/prometheus/metrics.mdx @@ -80,12 +80,32 @@ represents the time taken to fetch a batch of events from the database. A higher metric indicates slower polling of events from the database, you should consider looking into the performance of your database. -| | | -| ------ | ------------------------------------------------------------------------------------------ | -| Name | `hasura_event_fetch_time_per_batch_seconds` | -| Type | Histogram

Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10 | -| Labels | none | -| Unit | seconds | +| | | +| ------ | --------------------------------------------------------------------------------------------------- | +| Name | `hasura_event_fetch_time_per_batch_seconds` | +| Type | Histogram

Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 | +| Labels | none | +| Unit | seconds | + +#### Event fetch query time + +This metric represents the time taken by the query to fetch events from the database. + +| | | +| ------ | --------------------------------------------------------------------------------------------------- | +| Name | `hasura_events_fetch_query_time_seconds` | +| Type | Histogram

Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 | +| Labels | `status`: success \| failed, `source_name` | + +#### Total events fetch + +This metric represents the total number of events fetched from a datasource. + +| | | +| ------ | ---------------------------------------------------------- | +| Name | `hasura_events_fetched_total` | +| Type | Counter | +| Labels | `source_name` | #### Event invocations total diff --git a/server/src-lib/Hasura/Eventing/EventTrigger.hs b/server/src-lib/Hasura/Eventing/EventTrigger.hs index 21aa3a678686e..52490cf7beee9 100644 --- a/server/src-lib/Hasura/Eventing/EventTrigger.hs +++ b/server/src-lib/Hasura/Eventing/EventTrigger.hs @@ -53,6 +53,7 @@ import Data.Aeson qualified as J import Data.Aeson.Key qualified as Key import Data.Aeson.KeyMap qualified as KeyMap import Data.Aeson.Lens qualified as JL +import Data.Either (isRight) import Data.Has import Data.HashMap.Strict qualified as HashMap import Data.SerializableBlob qualified as SB @@ -94,6 +95,7 @@ import System.Metrics.Prometheus.CounterVector (CounterVector) import System.Metrics.Prometheus.CounterVector qualified as CounterVector import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram +import System.Metrics.Prometheus.HistogramVector qualified as HistogramVector import System.Timeout.Lifted (timeout) newtype EventInternalErr @@ -346,30 +348,60 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac -- only process events for this source if at least one event trigger exists if eventTriggerCount > 0 then do - eventPollStartTime <- getCurrentTime - runExceptT (fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize)) >>= \case - Right events -> do - let eventFetchCount = fromIntegral $ length events - Prometheus.Gauge.set (eventsFetchedPerBatch eventTriggerMetrics) eventFetchCount - if (null events) - then return [] - else do - eventsFetchedTime <- getCurrentTime -- This is also the poll end time - let eventPollTime = realToFrac $ diffUTCTime eventsFetchedTime eventPollStartTime - _ <- EKG.Distribution.add (smEventFetchTimePerBatch serverMetrics) eventPollTime - Prometheus.Histogram.observe (eventsFetchTimePerBatch eventTriggerMetrics) eventPollTime - _ <- EKG.Distribution.add (smNumEventsFetchedPerBatch serverMetrics) (fromIntegral $ length events) - saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents - return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events - Left err -> do - L.unLogger logger $ EventInternalErr err - pure [] + collectMetrics + (observeFetchQuery sourceName) + (runExceptT $ fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize)) + >>= \case + Right events -> do + if (null events) + then return [] + else do + saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents + eventsFetchedTime <- getCurrentTime -- This is also the poll end time + return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events + Left err -> do + L.unLogger logger $ EventInternalErr err + pure [] else pure [] -- Log the statistics of events fetched logFetchedEventsStatistics statsLogger events pure events + observeFetchQuery :: SourceName -> DiffTime -> Either a [Event b] -> IO () + observeFetchQuery sourceName fetchQueryTime result = do + let status = either (const Failed) (const Success) result + eventsCount = either (const 0) length result + + -- time taken by event fetch query with the status label + HistogramVector.observe + (eventsFetchQueryTime eventTriggerMetrics) + (EventsFetchQueryStatusLabel sourceName status) + (realToFrac fetchQueryTime) + + -- metrics total events fetched by source + when (isRight result) + $ Prometheus.Gauge.set + (eventsFetchedPerBatch eventTriggerMetrics) + (fromIntegral eventsCount) + + when (eventsCount > 0) $ do + _ <- + EKG.Distribution.add + (smEventFetchTimePerBatch serverMetrics) + (realToFrac fetchQueryTime) + Prometheus.Histogram.observe + (eventsFetchTimePerBatch eventTriggerMetrics) + (realToFrac fetchQueryTime) + _ <- + EKG.Distribution.add + (smNumEventsFetchedPerBatch serverMetrics) + (fromIntegral eventsCount) + CounterVector.add + (eventsFetchedTotal eventTriggerMetrics) + (EventTriggerSourceLabel sourceName) + (fromIntegral eventsCount) + -- !!! CAREFUL !!! -- The logic here in particular is subtle and has been fixed, broken, -- and fixed again in several different ways, several times. @@ -808,3 +840,9 @@ incEventTriggerCounterWithLabel getMetricState alwaysObserve counterVector (Even alwaysObserve (liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status tl)) (liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status Nothing)) + +collectMetrics :: (MonadIO m) => (DiffTime -> a -> m ()) -> m a -> m a +collectMetrics metricsFunc action = do + (duration, result) <- withElapsedTime action + metricsFunc duration result + pure result diff --git a/server/src-lib/Hasura/Server/Prometheus.hs b/server/src-lib/Hasura/Server/Prometheus.hs index 967f1540e5ca6..f1cf634b5cf30 100644 --- a/server/src-lib/Hasura/Server/Prometheus.hs +++ b/server/src-lib/Hasura/Server/Prometheus.hs @@ -20,6 +20,8 @@ module Hasura.Server.Prometheus decWebsocketConnections, ScheduledTriggerMetrics (..), SubscriptionMetrics (..), + EventTriggerSourceLabel (..), + EventsFetchQueryStatusLabel (..), DynamicEventTriggerLabel (..), ResponseStatus (..), responseStatusToLabelValue, @@ -92,6 +94,8 @@ data GraphQLRequestMetrics = GraphQLRequestMetrics data EventTriggerMetrics = EventTriggerMetrics { eventTriggerHTTPWorkers :: Gauge, + eventsFetchQueryTime :: HistogramVector EventsFetchQueryStatusLabel, + eventsFetchedTotal :: CounterVector EventTriggerSourceLabel, eventsFetchedPerBatch :: Gauge, eventQueueTimeSeconds :: HistogramVector (Maybe DynamicEventTriggerLabel), eventsFetchTimePerBatch :: Histogram, @@ -174,6 +178,8 @@ makeDummyGraphQLRequestMetrics = do makeDummyEventTriggerMetrics :: IO EventTriggerMetrics makeDummyEventTriggerMetrics = do eventTriggerHTTPWorkers <- Gauge.new + eventsFetchQueryTime <- HistogramVector.new [] + eventsFetchedTotal <- CounterVector.new eventsFetchedPerBatch <- Gauge.new eventQueueTimeSeconds <- HistogramVector.new [] eventsFetchTimePerBatch <- Histogram.new [] @@ -274,6 +280,27 @@ modifyConnectionsGauge :: modifyConnectionsGauge f (ConnectionsGauge ref) = atomicModifyIORef' ref $ \connections -> (f connections, ()) +data EventTriggerSourceLabel = EventTriggerSourceLabel + { _eslSourceName :: SourceName + } + deriving (Eq, Ord) + +instance ToLabels EventTriggerSourceLabel where + toLabels (EventTriggerSourceLabel {..}) = Map.singleton "source_name" $ sourceNameToText _eslSourceName + +data EventsFetchQueryStatusLabel = EventsFetchQueryStatusLabel + { _efqsSourceName :: SourceName, + _efqsStatus :: ResponseStatus + } + deriving (Eq, Ord) + +instance ToLabels EventsFetchQueryStatusLabel where + toLabels (EventsFetchQueryStatusLabel sourceName status) = + Map.fromList + [ ("source_name", sourceNameToText sourceName), + ("status", responseStatusToLabelValue status) + ] + data DynamicEventTriggerLabel = DynamicEventTriggerLabel { _detlTriggerName :: TriggerName, _detlSourceName :: SourceName