Skip to content

Commit

Permalink
Add metrics hasura_events_fetch_query_time and hasura_events_fetched_…
Browse files Browse the repository at this point in the history
…total

PR-URL: hasura/graphql-engine-mono#11081
Co-authored-by: Rob Dominguez <[email protected]>
GitOrigin-RevId: da1d8b97eb053a4af558222ed1e6f00571e390aa
  • Loading branch information
2 people authored and hasura-bot committed Dec 18, 2024
1 parent 4046187 commit 9717e07
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 24 deletions.
32 changes: 26 additions & 6 deletions docs/docs/observability/enterprise-edition/prometheus/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,32 @@ represents the time taken to fetch a batch of events from the database.
A higher metric indicates slower polling of events from the database, you should consider looking into the performance
of your database.

| | |
| ------ | ------------------------------------------------------------------------------------------ |
| Name | `hasura_event_fetch_time_per_batch_seconds` |
| Type | Histogram<br /><br />Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10 |
| Labels | none |
| Unit | seconds |
| | |
| ------ | --------------------------------------------------------------------------------------------------- |
| Name | `hasura_event_fetch_time_per_batch_seconds` |
| Type | Histogram<br /><br />Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | none |
| Unit | seconds |

#### Event fetch query time

This metric represents the time taken by the query to fetch events from the database.

| | |
| ------ | --------------------------------------------------------------------------------------------------- |
| Name | `hasura_events_fetch_query_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | `status`: success \| failed, `source_name` |

#### Total events fetch

This metric represents the total number of events fetched from a datasource.

| | |
| ------ | ---------------------------------------------------------- |
| Name | `hasura_events_fetched_total` |
| Type | Counter |
| Labels | `source_name` |

#### Event invocations total

Expand Down
74 changes: 56 additions & 18 deletions server/src-lib/Hasura/Eventing/EventTrigger.hs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import Data.Aeson qualified as J
import Data.Aeson.Key qualified as Key
import Data.Aeson.KeyMap qualified as KeyMap
import Data.Aeson.Lens qualified as JL
import Data.Either (isRight)
import Data.Has
import Data.HashMap.Strict qualified as HashMap
import Data.SerializableBlob qualified as SB
Expand Down Expand Up @@ -94,6 +95,7 @@ import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge
import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram
import System.Metrics.Prometheus.HistogramVector qualified as HistogramVector
import System.Timeout.Lifted (timeout)

newtype EventInternalErr
Expand Down Expand Up @@ -346,30 +348,60 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
-- only process events for this source if at least one event trigger exists
if eventTriggerCount > 0
then do
eventPollStartTime <- getCurrentTime
runExceptT (fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize)) >>= \case
Right events -> do
let eventFetchCount = fromIntegral $ length events
Prometheus.Gauge.set (eventsFetchedPerBatch eventTriggerMetrics) eventFetchCount
if (null events)
then return []
else do
eventsFetchedTime <- getCurrentTime -- This is also the poll end time
let eventPollTime = realToFrac $ diffUTCTime eventsFetchedTime eventPollStartTime
_ <- EKG.Distribution.add (smEventFetchTimePerBatch serverMetrics) eventPollTime
Prometheus.Histogram.observe (eventsFetchTimePerBatch eventTriggerMetrics) eventPollTime
_ <- EKG.Distribution.add (smNumEventsFetchedPerBatch serverMetrics) (fromIntegral $ length events)
saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents
return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events
Left err -> do
L.unLogger logger $ EventInternalErr err
pure []
collectMetrics
(observeFetchQuery sourceName)
(runExceptT $ fetchUndeliveredEvents @b _siConfiguration sourceName triggerNames maintenanceMode (FetchBatchSize fetchBatchSize))
>>= \case
Right events -> do
if (null events)
then return []
else do
saveLockedEventTriggerEvents sourceName (eId <$> events) leEvents
eventsFetchedTime <- getCurrentTime -- This is also the poll end time
return $ map (\event -> AB.mkAnyBackend @b $ EventWithSource event _siConfiguration sourceName eventsFetchedTime) events
Left err -> do
L.unLogger logger $ EventInternalErr err
pure []
else pure []

-- Log the statistics of events fetched
logFetchedEventsStatistics statsLogger events
pure events

observeFetchQuery :: SourceName -> DiffTime -> Either a [Event b] -> IO ()
observeFetchQuery sourceName fetchQueryTime result = do
let status = either (const Failed) (const Success) result
eventsCount = either (const 0) length result

-- time taken by event fetch query with the status label
HistogramVector.observe
(eventsFetchQueryTime eventTriggerMetrics)
(EventsFetchQueryStatusLabel sourceName status)
(realToFrac fetchQueryTime)

-- metrics total events fetched by source
when (isRight result)
$ Prometheus.Gauge.set
(eventsFetchedPerBatch eventTriggerMetrics)
(fromIntegral eventsCount)

when (eventsCount > 0) $ do
_ <-
EKG.Distribution.add
(smEventFetchTimePerBatch serverMetrics)
(realToFrac fetchQueryTime)
Prometheus.Histogram.observe
(eventsFetchTimePerBatch eventTriggerMetrics)
(realToFrac fetchQueryTime)
_ <-
EKG.Distribution.add
(smNumEventsFetchedPerBatch serverMetrics)
(fromIntegral eventsCount)
CounterVector.add
(eventsFetchedTotal eventTriggerMetrics)
(EventTriggerSourceLabel sourceName)
(fromIntegral eventsCount)

-- !!! CAREFUL !!!
-- The logic here in particular is subtle and has been fixed, broken,
-- and fixed again in several different ways, several times.
Expand Down Expand Up @@ -808,3 +840,9 @@ incEventTriggerCounterWithLabel getMetricState alwaysObserve counterVector (Even
alwaysObserve
(liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status tl))
(liftIO $ CounterVector.inc counterVector (EventStatusWithTriggerLabel status Nothing))

collectMetrics :: (MonadIO m) => (DiffTime -> a -> m ()) -> m a -> m a
collectMetrics metricsFunc action = do
(duration, result) <- withElapsedTime action
metricsFunc duration result
pure result
27 changes: 27 additions & 0 deletions server/src-lib/Hasura/Server/Prometheus.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ module Hasura.Server.Prometheus
decWebsocketConnections,
ScheduledTriggerMetrics (..),
SubscriptionMetrics (..),
EventTriggerSourceLabel (..),
EventsFetchQueryStatusLabel (..),
DynamicEventTriggerLabel (..),
ResponseStatus (..),
responseStatusToLabelValue,
Expand Down Expand Up @@ -92,6 +94,8 @@ data GraphQLRequestMetrics = GraphQLRequestMetrics

data EventTriggerMetrics = EventTriggerMetrics
{ eventTriggerHTTPWorkers :: Gauge,
eventsFetchQueryTime :: HistogramVector EventsFetchQueryStatusLabel,
eventsFetchedTotal :: CounterVector EventTriggerSourceLabel,
eventsFetchedPerBatch :: Gauge,
eventQueueTimeSeconds :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventsFetchTimePerBatch :: Histogram,
Expand Down Expand Up @@ -174,6 +178,8 @@ makeDummyGraphQLRequestMetrics = do
makeDummyEventTriggerMetrics :: IO EventTriggerMetrics
makeDummyEventTriggerMetrics = do
eventTriggerHTTPWorkers <- Gauge.new
eventsFetchQueryTime <- HistogramVector.new []
eventsFetchedTotal <- CounterVector.new
eventsFetchedPerBatch <- Gauge.new
eventQueueTimeSeconds <- HistogramVector.new []
eventsFetchTimePerBatch <- Histogram.new []
Expand Down Expand Up @@ -274,6 +280,27 @@ modifyConnectionsGauge ::
modifyConnectionsGauge f (ConnectionsGauge ref) =
atomicModifyIORef' ref $ \connections -> (f connections, ())

data EventTriggerSourceLabel = EventTriggerSourceLabel
{ _eslSourceName :: SourceName
}
deriving (Eq, Ord)

instance ToLabels EventTriggerSourceLabel where
toLabels (EventTriggerSourceLabel {..}) = Map.singleton "source_name" $ sourceNameToText _eslSourceName

data EventsFetchQueryStatusLabel = EventsFetchQueryStatusLabel
{ _efqsSourceName :: SourceName,
_efqsStatus :: ResponseStatus
}
deriving (Eq, Ord)

instance ToLabels EventsFetchQueryStatusLabel where
toLabels (EventsFetchQueryStatusLabel sourceName status) =
Map.fromList
[ ("source_name", sourceNameToText sourceName),
("status", responseStatusToLabelValue status)
]

data DynamicEventTriggerLabel = DynamicEventTriggerLabel
{ _detlTriggerName :: TriggerName,
_detlSourceName :: SourceName
Expand Down

0 comments on commit 9717e07

Please sign in to comment.