Skip to content

Commit

Permalink
server: add dynamic labels trigger_name and source_name to existing event trigger metrics
Browse files Browse the repository at this point in the history

PR-URL: hasura/graphql-engine-mono#9265
GitOrigin-RevId: 6fb6504f1a476ea6c8b810e067770920757e8dc6
  • Loading branch information
krushanbauva authored and hasura-bot committed May 24, 2023
1 parent 6d27ad9 commit e3df245
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 39 deletions.
26 changes: 13 additions & 13 deletions docs/docs/enterprise/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -142,21 +142,21 @@ consider looking into the performance of your database.

Total number of events invoked. Represents the Event Trigger webhook HTTP requests made.

| | |
| ------ | -------------------------------- |
| Name | `hasura_event_invocations_total` |
| Type | Counter |
| Labels | `status`: success \| failed |
| | |
| ------ | ---------------------------------------------------------- |
| Name | `hasura_event_invocations_total` |
| Type | Counter |
| Labels | `status`: success \| failed, `source_name`, `trigger_name` |

### Hasura event processed total

Total number of events processed. Represents the Event Trigger egress.

| | |
| ------ | ------------------------------ |
| Name | `hasura_event_processed_total` |
| Type | Counter |
| Labels | `status`: success \| failed |
| | |
| ------ | ---------------------------------------------------------- |
| Name | `hasura_event_processed_total` |
| Type | Counter |
| Labels | `status`: success \| failed, `source_name`, `trigger_name` |

### Hasura event processing time

Expand All @@ -167,7 +167,7 @@ This metric can be considered as the end-to-end processing time for an event.
| ------ | --------------------------------------------------------------------- |
| Name | `hasura_event_processing_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |

### Hasura event queue time

Expand All @@ -180,7 +180,7 @@ server.
| ------ | --------------------------------------------------------------------- |
| Name | `hasura_event_queue_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |

### Hasura event trigger HTTP workers

Expand All @@ -203,7 +203,7 @@ processing time indicates slow webhook, you should try to optimize the event web
| ------ | ------------------------------------------------------------ |
| Name | `hasura_event_webhook_processing_time_seconds` |
| Type | Histogram<br /><br />Buckets: 0.01, 0.03, 0.1, 0.3, 1, 3, 10 |
| Labels | none |
| Labels | `source_name`, `trigger_name` |

### Hasura events fetched per batch

Expand Down
73 changes: 64 additions & 9 deletions server/src-lib/Hasura/Eventing/EventTrigger.hs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ import Refined.Unsafe (unsafeRefine)
import System.Metrics.Distribution qualified as EKG.Distribution
import System.Metrics.Gauge qualified as EKG.Gauge
import System.Metrics.Prometheus.Counter qualified as Prometheus.Counter
import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge qualified as Prometheus.Gauge
import System.Metrics.Prometheus.Histogram qualified as Prometheus.Histogram
import System.Timeout.Lifted (timeout)
Expand Down Expand Up @@ -472,7 +474,13 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
eventProcessTime <- liftIO getCurrentTime
let eventQueueTime = realToFrac $ diffUTCTime eventProcessTime eventFetchedTime
_ <- liftIO $ EKG.Distribution.add (smEventQueueTime serverMetrics) eventQueueTime
liftIO $ Prometheus.Histogram.observe (eventQueueTimeSeconds eventTriggerMetrics) eventQueueTime
liftIO $
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventQueueTimeSeconds eventTriggerMetrics)
(DynamicEventTriggerLabel (tmName (eTrigger e)) sourceName)
eventQueueTime

cache <- liftIO getSchemaCache

Expand Down Expand Up @@ -566,16 +574,39 @@ processEventQueue logger statsLogger httpMgr getSchemaCache getEventEngineCtx ac
-- `eventStartTime`) used here in calculation are all UTC time.
eventStartTime = fromMaybe (eCreatedAtUTC e) (eRetryAtUTC e)
eventProcessingTime' = realToFrac $ diffUTCTime eventExecutionFinishTime eventStartTime
observeHistogramWithLabel getPrometheusMetricsGranularity True (eventProcessingTime eventTriggerMetrics) (TriggerNameLabel (etiName eti)) eventProcessingTime'
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventProcessingTime eventTriggerMetrics)
(DynamicEventTriggerLabel (etiName eti) sourceName)
eventProcessingTime'
liftIO $ do
EKG.Distribution.add (smEventWebhookProcessingTime serverMetrics) eventWebhookProcessingTime'
Prometheus.Histogram.observe (eventWebhookProcessingTime eventTriggerMetrics) eventWebhookProcessingTime'
observeHistogramWithLabel
getPrometheusMetricsGranularity
True
(eventWebhookProcessingTime eventTriggerMetrics)
(DynamicEventTriggerLabel (etiName eti) sourceName)
eventWebhookProcessingTime'
EKG.Distribution.add (smEventProcessingTime serverMetrics) eventProcessingTime'
Prometheus.Counter.inc (eventProcessedTotalSuccess eventTriggerMetrics)
Prometheus.Counter.inc (eventInvocationTotalSuccess eventTriggerMetrics)
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventProcessedTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventSuccessLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventInvocationTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventSuccessLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
Left eventError -> do
-- TODO (paritosh): We can also add a label to the metric to indicate the type of error
liftIO $ Prometheus.Counter.inc (eventInvocationTotalFailure eventTriggerMetrics)
liftIO $
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventInvocationTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventFailedLabel (Just (DynamicEventTriggerLabel (etiName eti) sourceName)))
case eventError of
(HTTPError reqBody err) ->
processError @b sourceConfig e retryConf logHeaders reqBody maintenanceModeVersion eventTriggerMetrics err >>= flip onLeft logQErr
Expand Down Expand Up @@ -633,7 +664,8 @@ processSuccess sourceConfig e reqHeaders ep maintenanceModeVersion resp = do
processError ::
forall b m a.
( MonadIO m,
BackendEventTrigger b
BackendEventTrigger b,
MonadGetPolicies m
) =>
SourceConfig b ->
Event b ->
Expand Down Expand Up @@ -661,13 +693,16 @@ processError sourceConfig e retryConf reqHeaders ep maintenanceModeVersion event
recordError @b sourceConfig e invocation retryOrError maintenanceModeVersion

retryOrSetError ::
MonadIO m =>
( MonadIO m,
MonadGetPolicies m
) =>
Event b ->
RetryConf ->
EventTriggerMetrics ->
HTTPErr a ->
m ProcessEventError
retryOrSetError e retryConf eventTriggerMetrics err = do
getPrometheusMetricsGranularity <- runGetPrometheusMetricsGranularity
let mretryHeader = getRetryAfterHeaderFromError err
tries = eTries e
mretryHeaderSeconds = mretryHeader >>= parseRetryHeader
Expand All @@ -676,7 +711,12 @@ retryOrSetError e retryConf eventTriggerMetrics err = do
-- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1
if triesExhausted && noRetryHeader
then do
liftIO $ Prometheus.Counter.inc (eventProcessedTotalFailure eventTriggerMetrics)
liftIO $
incEventTriggerCounterWithLabel
getPrometheusMetricsGranularity
True
(eventProcessedTotal eventTriggerMetrics)
(EventStatusWithTriggerLabel eventFailedLabel (Just (DynamicEventTriggerLabel (tmName (eTrigger e)) (eSource e))))
pure PESetError
else do
currentTime <- liftIO getCurrentTime
Expand Down Expand Up @@ -732,3 +772,18 @@ getEventTriggerInfoFromEvent sc e = do
<> "' on table '"
<> table <<> "' not found"
)

-- | Increment an event-trigger 'CounterVector' by delegating to
-- 'recordMetricWithLabel'.
--
-- Two increment actions are handed over: the first targets the label exactly
-- as supplied by the caller, the second targets the same status with the
-- dynamic (trigger/source) part replaced by 'Nothing'. Which of them runs is
-- left entirely to 'recordMetricWithLabel', based on the granularity state and
-- the 'Bool' flag passed through unchanged.
incEventTriggerCounterWithLabel ::
  (MonadIO m) =>
  (IO GranularPrometheusMetricsState) ->
  -- should the metric be observed without a label when granularMetricsState is OFF
  Bool ->
  CounterVector EventStatusWithTriggerLabel ->
  EventStatusWithTriggerLabel ->
  m ()
incEventTriggerCounterWithLabel getMetricState alwaysObserve counterVector (EventStatusWithTriggerLabel eventStatus dynamicLabels) =
  recordMetricWithLabel getMetricState alwaysObserve bumpWithDynamicLabels bumpStatusOnly
  where
    -- Increment the series carrying the caller-supplied dynamic labels.
    bumpWithDynamicLabels =
      liftIO (CounterVector.inc counterVector (EventStatusWithTriggerLabel eventStatus dynamicLabels))
    -- Increment the series identified by the status alone.
    bumpStatusOnly =
      liftIO (CounterVector.inc counterVector (EventStatusWithTriggerLabel eventStatus Nothing))
71 changes: 54 additions & 17 deletions server/src-lib/Hasura/Server/Prometheus.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ module Hasura.Server.Prometheus
decWebsocketConnections,
ScheduledTriggerMetrics (..),
SubscriptionMetrics (..),
TriggerNameLabel (..),
DynamicEventTriggerLabel (..),
ResponseStatus (..),
responseStatusToLabelValue,
EventStatusLabel (..),
eventSuccessLabel,
eventFailedLabel,
EventStatusWithTriggerLabel (..),
GranularPrometheusMetricsState (..),
observeHistogramWithLabel,
SubscriptionKindLabel (..),
Expand All @@ -39,12 +45,15 @@ import Data.Int (Int64)
import Hasura.GraphQL.ParameterizedQueryHash
import Hasura.GraphQL.Transport.HTTP.Protocol (OperationName (..))
import Hasura.Prelude
import Hasura.RQL.Types.Common (SourceName, sourceNameToText)
import Hasura.RQL.Types.EventTrigger (TriggerName, triggerNameToTxt)
import Hasura.Server.Types (GranularPrometheusMetricsState (..))
import Language.GraphQL.Draft.Syntax qualified as G
import System.Metrics.Prometheus (ToLabels (..))
import System.Metrics.Prometheus.Counter (Counter)
import System.Metrics.Prometheus.Counter qualified as Counter
import System.Metrics.Prometheus.CounterVector (CounterVector)
import System.Metrics.Prometheus.CounterVector qualified as CounterVector
import System.Metrics.Prometheus.Gauge (Gauge)
import System.Metrics.Prometheus.Gauge qualified as Gauge
import System.Metrics.Prometheus.GaugeVector qualified as GaugeVector
Expand Down Expand Up @@ -85,16 +94,14 @@ data GraphQLRequestMetrics = GraphQLRequestMetrics
data EventTriggerMetrics = EventTriggerMetrics
{ eventTriggerHTTPWorkers :: Gauge,
eventsFetchedPerBatch :: Gauge,
eventQueueTimeSeconds :: Histogram,
eventQueueTimeSeconds :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventsFetchTimePerBatch :: Histogram,
eventWebhookProcessingTime :: Histogram,
eventProcessingTime :: HistogramVector (Maybe TriggerNameLabel),
eventWebhookProcessingTime :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventProcessingTime :: HistogramVector (Maybe DynamicEventTriggerLabel),
eventTriggerBytesReceived :: Counter,
eventTriggerBytesSent :: Counter,
eventProcessedTotalSuccess :: Counter,
eventProcessedTotalFailure :: Counter,
eventInvocationTotalSuccess :: Counter,
eventInvocationTotalFailure :: Counter
eventProcessedTotal :: CounterVector EventStatusWithTriggerLabel,
eventInvocationTotal :: CounterVector EventStatusWithTriggerLabel
}

data ScheduledTriggerMetrics = ScheduledTriggerMetrics
Expand Down Expand Up @@ -159,16 +166,14 @@ makeDummyEventTriggerMetrics :: IO EventTriggerMetrics
makeDummyEventTriggerMetrics = do
eventTriggerHTTPWorkers <- Gauge.new
eventsFetchedPerBatch <- Gauge.new
eventQueueTimeSeconds <- Histogram.new []
eventQueueTimeSeconds <- HistogramVector.new []
eventsFetchTimePerBatch <- Histogram.new []
eventWebhookProcessingTime <- Histogram.new []
eventWebhookProcessingTime <- HistogramVector.new []
eventProcessingTime <- HistogramVector.new []
eventTriggerBytesReceived <- Counter.new
eventTriggerBytesSent <- Counter.new
eventProcessedTotalSuccess <- Counter.new
eventProcessedTotalFailure <- Counter.new
eventInvocationTotalSuccess <- Counter.new
eventInvocationTotalFailure <- Counter.new
eventProcessedTotal <- CounterVector.new
eventInvocationTotal <- CounterVector.new
pure EventTriggerMetrics {..}

makeDummyScheduledTriggerMetrics :: IO ScheduledTriggerMetrics
Expand Down Expand Up @@ -250,12 +255,44 @@ modifyConnectionsGauge ::
modifyConnectionsGauge f (ConnectionsGauge ref) =
atomicModifyIORef' ref $ \connections -> (f connections, ())

newtype TriggerNameLabel = TriggerNameLabel TriggerName
-- | A trigger name and a source name carried together as one label value.
data DynamicEventTriggerLabel = DynamicEventTriggerLabel
  { -- | Rendered under the @trigger_name@ key by the 'ToLabels' instance for
    -- @Maybe DynamicEventTriggerLabel@.
    _detlTriggerName :: TriggerName,
    -- | Rendered under the @source_name@ key by the same instance.
    _detlSourceName :: SourceName
  }
  deriving (Ord, Eq)

instance ToLabels (Maybe TriggerNameLabel) where
instance ToLabels (Maybe DynamicEventTriggerLabel) where
toLabels Nothing = Map.empty
toLabels (Just (TriggerNameLabel triggerName)) = Map.singleton "trigger_name" (triggerNameToTxt triggerName)
toLabels (Just (DynamicEventTriggerLabel triggerName sourceName)) = Map.fromList $ [("trigger_name", triggerNameToTxt triggerName), ("source_name", sourceNameToText sourceName)]

-- | Two-valued outcome; turned into metric label text by
-- 'responseStatusToLabelValue'.
data ResponseStatus = Success | Failed

-- | Label text for a 'ResponseStatus': @"success"@ for 'Success' and
-- @"failed"@ for 'Failed'.
--
-- TODO: Make this a method of a new typeclass of the metrics library
responseStatusToLabelValue :: ResponseStatus -> Text
responseStatusToLabelValue Success = "success"
responseStatusToLabelValue Failed = "failed"

-- | Wrapper around the text used as a @status@ label value.
--
-- NOTE(review): the 'ToLabels' instance is derived generically; presumably the
-- field name @status@ becomes the label key — confirm against the metrics
-- library.
newtype EventStatusLabel = EventStatusLabel
  { status :: Text
  }
  deriving stock (Generic, Ord, Eq)
  deriving anyclass (ToLabels)

-- | The 'EventStatusLabel' whose text is the label value of 'Success'.
eventSuccessLabel :: EventStatusLabel
eventSuccessLabel = EventStatusLabel (responseStatusToLabelValue Success)

-- | The 'EventStatusLabel' whose text is the label value of 'Failed'.
eventFailedLabel :: EventStatusLabel
eventFailedLabel = EventStatusLabel (responseStatusToLabelValue Failed)

-- | Label type of the event counter vectors: a status that is always present
-- plus an optional trigger/source pair.
data EventStatusWithTriggerLabel = EventStatusWithTriggerLabel
  { -- | Status component of the label.
    _eswtlStatus :: EventStatusLabel,
    -- | Trigger and source names; 'Nothing' selects the series that carries
    -- only the status.
    _eswtlDynamicLabels :: Maybe DynamicEventTriggerLabel
  }
  deriving stock (Generic, Ord, Eq)

-- | The label set is the @status@ entry combined (via '<>') with whatever the
-- optional dynamic label contributes through its own 'toLabels'.
instance ToLabels EventStatusWithTriggerLabel where
  toLabels (EventStatusWithTriggerLabel statusLabel dynamicLabel) =
    statusEntry <> toLabels dynamicLabel
    where
      -- Single-entry map holding the status text under the "status" key.
      statusEntry = HashMap.fromList [("status", status statusLabel)]

data SubscriptionKindLabel = SubscriptionKindLabel
{ subscription_kind :: Text
Expand Down

0 comments on commit e3df245

Please sign in to comment.