From 27919b3001c483bb33807fbed84e76558d68f537 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Thu, 13 Nov 2025 08:54:02 +0100 Subject: [PATCH 1/4] NETOBSERV-2485: DNS optimization, avoid searching for various suffixes --- README.md | 4 ++-- RELEASE.md | 4 ++-- .../manifests/netobserv-operator.clusterserviceversion.yaml | 4 ++-- config/samples/flows_v1beta2_flowcollector.yaml | 2 +- helm/README.md | 4 ++-- internal/controller/consoleplugin/consoleplugin_objects.go | 4 ++-- internal/controller/consoleplugin/consoleplugin_test.go | 4 ++-- internal/controller/ebpf/agent_controller.go | 2 +- internal/controller/flp/flp_test.go | 2 +- internal/pkg/helper/loki_config.go | 4 ++-- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index bca8daec3..7cc1b17db 100644 --- a/README.md +++ b/README.md @@ -81,11 +81,11 @@ spec: loki: mode: Monolithic monolithic: - url: 'http://my-netobserv-loki.netobserv.svc:3100/' + url: 'http://my-netobserv-loki.netobserv.svc.cluster.local.:3100/' prometheus: querier: manual: - url: http://my-netobserv-prometheus-server.netobserv.svc/ + url: http://my-netobserv-prometheus-server.netobserv.svc.cluster.local./ EOF ``` diff --git a/RELEASE.md b/RELEASE.md index c8832ea46..84d9fadc1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -63,11 +63,11 @@ spec: loki: mode: Monolithic monolithic: - url: 'http://my-netobserv-loki.netobserv.svc:3100/' + url: 'http://my-netobserv-loki.netobserv.svc.cluster.local.:3100/' prometheus: querier: manual: - url: http://my-netobserv-prometheus-server.netobserv.svc/ + url: http://my-netobserv-prometheus-server.netobserv.svc.cluster.local./ EOF # Check components image: diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml index c2b67a38b..06f445de3 100644 --- a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml +++ b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml @@ -203,7 +203,7 @@ metadata: }, "enable": false }, - "url": "http://loki.netobserv.svc:3100/" + "url": "http://loki.netobserv.svc.cluster.local.:3100/" }, "readTimeout": "30s", "writeBatchSize": 10485760, @@ -253,7 +253,7 @@ metadata: categories: Monitoring, Networking, Observability console.openshift.io/plugins: '["netobserv-plugin"]' containerImage: quay.io/netobserv/network-observability-operator:1.10.0-community - createdAt: "2025-11-26T13:16:01Z" + createdAt: "2025-11-13T07:42:38Z" description: Network flows collector and monitoring solution operatorframework.io/initialization-resource: '{"apiVersion":"flows.netobserv.io/v1beta2", "kind":"FlowCollector","metadata":{"name":"cluster"},"spec": {}}' diff --git a/config/samples/flows_v1beta2_flowcollector.yaml b/config/samples/flows_v1beta2_flowcollector.yaml index 451261884..58924a511 100644 --- a/config/samples/flows_v1beta2_flowcollector.yaml +++ b/config/samples/flows_v1beta2_flowcollector.yaml @@ -151,7 +151,7 @@ spec: # Change mode to "LokiStack" to use with the loki operator mode: Monolithic monolithic: - url: 'http://loki.netobserv.svc:3100/' + url: 'http://loki.netobserv.svc.cluster.local.:3100/' tenantID: netobserv tls: enable: false diff --git a/helm/README.md b/helm/README.md index 950b50e84..fca8a2ce7 100644 --- a/helm/README.md +++ b/helm/README.md @@ -74,11 +74,11 @@ spec: loki: mode: Monolithic monolithic: - url: 'http://my-netobserv-loki.netobserv.svc:3100/' + url: 'http://my-netobserv-loki.netobserv.svc.cluster.local.:3100/' prometheus: querier: manual: - url: http://my-netobserv-prometheus-server.netobserv.svc/ + url: http://my-netobserv-prometheus-server.netobserv.svc.cluster.local./ EOF ``` diff --git a/internal/controller/consoleplugin/consoleplugin_objects.go b/internal/controller/consoleplugin/consoleplugin_objects.go index ade61658b..ad57c175d 100644 --- a/internal/controller/consoleplugin/consoleplugin_objects.go +++ b/internal/controller/consoleplugin/consoleplugin_objects.go @@ -395,8 +395,8 @@ func (b *builder) getPromConfig(ctx context.Context) cfg.PrometheusConfig { } if b.desired.Prometheus.Querier.Mode == "" || b.desired.Prometheus.Querier.Mode == flowslatest.PromModeAuto { if b.info.ClusterInfo.IsOpenShift() { - config.URL = "https://thanos-querier.openshift-monitoring.svc:9091/" // requires cluster-monitoringv-view cluster role - config.DevURL = "https://thanos-querier.openshift-monitoring.svc:9092/" // restricted to a particular namespace + config.URL = "https://thanos-querier.openshift-monitoring.svc.cluster.local.:9091/" // requires cluster-monitoringv-view cluster role + config.DevURL = "https://thanos-querier.openshift-monitoring.svc.cluster.local.:9092/" // restricted to a particular namespace config.ForwardUserToken = true tls = flowslatest.ClientTLS{ Enable: true, diff --git a/internal/controller/consoleplugin/consoleplugin_test.go b/internal/controller/consoleplugin/consoleplugin_test.go index 026e0be99..f034bb48c 100644 --- a/internal/controller/consoleplugin/consoleplugin_test.go +++ b/internal/controller/consoleplugin/consoleplugin_test.go @@ -341,8 +341,8 @@ func TestConfigMapContent(t *testing.T) { assert.Nil(err) // loki config - assert.Equal(config.Loki.URL, "https://lokistack-gateway-http.ls-namespace.svc:8080/api/logs/v1/network/") - assert.Equal(config.Loki.StatusURL, "https://lokistack-query-frontend-http.ls-namespace.svc:3100/") + assert.Equal(config.Loki.URL, "https://lokistack-gateway-http.ls-namespace.svc.cluster.local.:8080/api/logs/v1/network/") + assert.Equal(config.Loki.StatusURL, "https://lokistack-query-frontend-http.ls-namespace.svc.cluster.local.:3100/") // frontend params assert.Equal(config.Frontend.RecordTypes, []api.ConnTrackOutputRecordTypeEnum{api.ConnTrackFlowLog}) diff --git a/internal/controller/ebpf/agent_controller.go b/internal/controller/ebpf/agent_controller.go index e4028a47c..23085b5d6 100644 --- a/internal/controller/ebpf/agent_controller.go +++ b/internal/controller/ebpf/agent_controller.go @@ -484,7 +484,7 @@ func (c *AgentController) envConfig(ctx context.Context, coll *flowslatest.FlowC // Send to FLP service config = append(config, corev1.EnvVar{ Name: envFlowsTargetHost, - Value: fmt.Sprintf("%s.%s.svc", constants.FLPName, c.Namespace), + Value: fmt.Sprintf("%s.%s.svc.cluster.local.", constants.FLPName, c.Namespace), }, corev1.EnvVar{ Name: envFlowsTargetPort, Value: strconv.Itoa(int(*advancedConfig.Port)), diff --git a/internal/controller/flp/flp_test.go b/internal/controller/flp/flp_test.go index cf443fdbe..c7815d367 100644 --- a/internal/controller/flp/flp_test.go +++ b/internal/controller/flp/flp_test.go @@ -685,7 +685,7 @@ func TestConfigMapShouldDeserializeAsJSONWithLokiStack(t *testing.T) { assert.Len(params, 5) lokiCfg := params[3].Write.Loki - assert.Equal("https://lokistack-gateway-http.ls-namespace.svc:8080/api/logs/v1/network/", lokiCfg.URL) + assert.Equal("https://lokistack-gateway-http.ls-namespace.svc.cluster.local.:8080/api/logs/v1/network/", lokiCfg.URL) assert.Equal("network", lokiCfg.TenantID) assert.Equal("Bearer", lokiCfg.ClientConfig.Authorization.Type) assert.Equal("/var/run/secrets/tokens/flowlogs-pipeline", lokiCfg.ClientConfig.Authorization.CredentialsFile) diff --git a/internal/pkg/helper/loki_config.go b/internal/pkg/helper/loki_config.go index 916bfc6e3..56113b3d1 100644 --- a/internal/pkg/helper/loki_config.go +++ b/internal/pkg/helper/loki_config.go @@ -18,11 +18,11 @@ func NewLokiConfig(spec *flowslatest.FlowCollectorLoki, namespace string) LokiCo if len(spec.LokiStack.Namespace) > 0 { ns = spec.LokiStack.Namespace } - gatewayURL := fmt.Sprintf("https://%s-gateway-http.%s.svc:8080/api/logs/v1/network/", spec.LokiStack.Name, ns) + gatewayURL := fmt.Sprintf("https://%s-gateway-http.%s.svc.cluster.local.:8080/api/logs/v1/network/", spec.LokiStack.Name, ns) loki.LokiManualParams = flowslatest.LokiManualParams{ QuerierURL: gatewayURL, IngesterURL: gatewayURL, - StatusURL: fmt.Sprintf("https://%s-query-frontend-http.%s.svc:3100/", spec.LokiStack.Name, ns), + StatusURL: fmt.Sprintf("https://%s-query-frontend-http.%s.svc.cluster.local.:3100/", spec.LokiStack.Name, ns), TenantID: "network", AuthToken: flowslatest.LokiAuthForwardUserToken, TLS: flowslatest.ClientTLS{ From 39d414796e5ab612606c23afba1dcc3cdd0da40a Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Thu, 13 Nov 2025 13:32:45 +0100 Subject: [PATCH 2/4] Use Health/Alerts to help detect DNS optimizations - New Alert template dedicated to NX DOMAIN (with lesser severity) - Fix issues with the link to Network traffic page for DNS (dest-based, no bnf) - Add custom link for DNS optimization --- .../v1beta2/flowcollector_alert_types.go | 7 +-- .../v1beta2/flowcollector_defaults.go | 12 +++++ .../v1beta2/flowcollector_types.go | 2 +- .../flowcollector_validation_webhook.go | 4 +- .../flows.netobserv.io_flowcollectors.yaml | 5 ++- .../flows.netobserv.io_flowcollectors.yaml | 5 ++- docs/Alerts.md | 3 +- docs/FlowCollector.md | 6 +-- .../flows.netobserv.io_flowcollectors.yaml | 5 ++- internal/pkg/metrics/alerts/alerts.go | 44 +++++++++++++++++-- internal/pkg/metrics/alerts/alerts_test.go | 14 ++++++ internal/pkg/metrics/alerts/builder.go | 44 +++++++++++++------ 12 files changed, 119 insertions(+), 32 deletions(-) diff --git a/api/flowcollector/v1beta2/flowcollector_alert_types.go b/api/flowcollector/v1beta2/flowcollector_alert_types.go index 4635c7528..874ddf395 100644 --- a/api/flowcollector/v1beta2/flowcollector_alert_types.go +++ b/api/flowcollector/v1beta2/flowcollector_alert_types.go @@ -22,6 +22,7 @@ const ( AlertNetpolDenied AlertTemplate = "NetpolDenied" AlertLatencyHighTrend AlertTemplate = "LatencyHighTrend" AlertDNSErrors AlertTemplate = "DNSErrors" + AlertDNSNxDomain AlertTemplate = "DNSNxDomain" AlertExternalEgressHighTrend AlertTemplate = "ExternalEgressHighTrend" AlertExternalIngressHighTrend AlertTemplate = "ExternalIngressHighTrend" AlertCrossAZ AlertTemplate = "CrossAZ" @@ -33,9 +34,9 @@ const ( type FLPAlert struct { // Alert template name. // Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - // `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + // `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. // More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md - // +kubebuilder:validation:Enum:="PacketDropsByKernel";"PacketDropsByDevice";"IPsecErrors";"NetpolDenied";"LatencyHighTrend";"DNSErrors";"ExternalEgressHighTrend";"ExternalIngressHighTrend";"CrossAZ" + // +kubebuilder:validation:Enum:="PacketDropsByKernel";"PacketDropsByDevice";"IPsecErrors";"NetpolDenied";"LatencyHighTrend";"DNSErrors";"DNSNxDomain";"ExternalEgressHighTrend";"ExternalIngressHighTrend";"CrossAZ" // +required Template AlertTemplate `json:"template,omitempty"` @@ -166,7 +167,7 @@ func (g *FLPAlert) IsAllowed(spec *FlowCollectorSpec) (bool, string) { if !spec.Agent.EBPF.IsIPSecEnabled() { return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, IPSec) } - case AlertDNSErrors: + case AlertDNSErrors, AlertDNSNxDomain: if !spec.Agent.EBPF.IsDNSTrackingEnabled() { return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, DNSTracking) } diff --git a/api/flowcollector/v1beta2/flowcollector_defaults.go b/api/flowcollector/v1beta2/flowcollector_defaults.go index eb7d2a6ed..d28dff197 100644 --- a/api/flowcollector/v1beta2/flowcollector_defaults.go +++ b/api/flowcollector/v1beta2/flowcollector_defaults.go @@ -110,6 +110,18 @@ var ( }, }, }, + { + Template: AlertDNSNxDomain, + Variants: []AlertVariant{ + { + Thresholds: AlertThresholds{ + Info: "10", + Warning: "80", + }, + GroupBy: GroupByNamespace, + }, + }, + }, { Template: AlertNetpolDenied, Variants: []AlertVariant{ diff --git a/api/flowcollector/v1beta2/flowcollector_types.go b/api/flowcollector/v1beta2/flowcollector_types.go index 16582d616..4248a23e2 100644 --- a/api/flowcollector/v1beta2/flowcollector_types.go +++ b/api/flowcollector/v1beta2/flowcollector_types.go @@ -582,7 +582,7 @@ type FLPMetrics struct { // `disableAlerts` is a list of alert groups that should be disabled from the default set of alerts. // Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - // `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + // `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. // More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md // +optional DisableAlerts []AlertTemplate `json:"disableAlerts"` diff --git a/api/flowcollector/v1beta2/flowcollector_validation_webhook.go b/api/flowcollector/v1beta2/flowcollector_validation_webhook.go index ddf77ef9a..01425c659 100644 --- a/api/flowcollector/v1beta2/flowcollector_validation_webhook.go +++ b/api/flowcollector/v1beta2/flowcollector_validation_webhook.go @@ -358,7 +358,7 @@ func (v *validator) isFLPAlertGroupBySupported(template AlertTemplate, variant * return variant.GroupBy != GroupByWorkload case AlertIPsecErrors: return variant.GroupBy != GroupByWorkload && variant.GroupBy != GroupByNamespace - case AlertPacketDropsByKernel, AlertDNSErrors, AlertExternalEgressHighTrend, AlertExternalIngressHighTrend, AlertLatencyHighTrend, AlertNetpolDenied, AlertCrossAZ: + case AlertPacketDropsByKernel, AlertDNSErrors, AlertDNSNxDomain, AlertExternalEgressHighTrend, AlertExternalIngressHighTrend, AlertLatencyHighTrend, AlertNetpolDenied, AlertCrossAZ: return true case AlertLokiError, AlertNoFlows: // not applicable return false @@ -410,7 +410,7 @@ func GetElligibleMetricsForAlert(template AlertTemplate, alertDef *AlertVariant) totalMetricPatterns = []string{"%s_ingress_packets_total", "%s_egress_packets_total"} case AlertIPsecErrors: return []string{"node_ipsec_flows_total"}, []string{"node_to_node_ingress_flows_total"} - case AlertDNSErrors: + case AlertDNSErrors, AlertDNSNxDomain: metricPatterns = []string{`%s_dns_latency_seconds`} totalMetricPatterns = []string{"%s_dns_latency_seconds"} case AlertExternalEgressHighTrend: // TODO diff --git a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml index b741295ad..d6eb046c9 100644 --- a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml +++ b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml @@ -5812,7 +5812,7 @@ spec: description: |- Alert template name. Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md enum: - PacketDropsByKernel @@ -5821,6 +5821,7 @@ spec: - NetpolDenied - LatencyHighTrend - DNSErrors + - DNSNxDomain - ExternalEgressHighTrend - ExternalIngressHighTrend - CrossAZ @@ -5886,7 +5887,7 @@ spec: description: |- `disableAlerts` is a list of alert groups that should be disabled from the default set of alerts. Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md items: type: string diff --git a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml index 3addd7bc5..97ed3be32 100644 --- a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml +++ b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml @@ -5373,7 +5373,7 @@ spec: description: |- Alert template name. Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md enum: - PacketDropsByKernel @@ -5382,6 +5382,7 @@ spec: - NetpolDenied - LatencyHighTrend - DNSErrors + - DNSNxDomain - ExternalEgressHighTrend - ExternalIngressHighTrend - CrossAZ @@ -5438,7 +5439,7 @@ spec: description: |- `disableAlerts` is a list of alert groups that should be disabled from the default set of alerts. Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md items: type: string diff --git a/docs/Alerts.md b/docs/Alerts.md index 6e64eacee..93bdbe891 100644 --- a/docs/Alerts.md +++ b/docs/Alerts.md @@ -18,7 +18,8 @@ Here is the list of alerts installed by default: - `IPsecErrors`: triggered when NetObserv detects IPsec encyption errors; it requires the `IPSec` agent feature. - `NetpolDenied`: triggered when NetObserv detects traffic denied by network policies; it requires the `NetworkEvents` agent feature. - `LatencyHighTrend`: triggered when NetObserv detects an increase of TCP latency; it requires the `FlowRTT` agent feature. -- `DNSErrors`: triggered when NetObserv detects DNS errors; it requires the `DNSTracking` agent feature. +- `DNSErrors`: triggered when NetObserv detects DNS errors, other than NX_DOMAIN; it requires the `DNSTracking` agent feature. +- `DNSNxDomain`: triggered when NetObserv detects DNS NX_DOMAIN errors; it requires the `DNSTracking` agent feature. - `ExternalEgressHighTrend`: TODO. - `ExternalIngressHighTrend`: TODO. diff --git a/docs/FlowCollector.md b/docs/FlowCollector.md index 1abc9937f..b3368f743 100644 --- a/docs/FlowCollector.md +++ b/docs/FlowCollector.md @@ -11512,7 +11512,7 @@ More information on alerts: https://github.com/netobserv/network-observability-o `disableAlerts` is a list of alert groups that should be disabled from the default set of alerts. Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, -`LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. +`LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
false @@ -11565,10 +11565,10 @@ More information, with full list of available metrics: https://github.com/netobs Alert template name. Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, -`LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. +`LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md

- Enum: PacketDropsByKernel, PacketDropsByDevice, IPsecErrors, NetpolDenied, LatencyHighTrend, DNSErrors, ExternalEgressHighTrend, ExternalIngressHighTrend, CrossAZ
+ Enum: PacketDropsByKernel, PacketDropsByDevice, IPsecErrors, NetpolDenied, LatencyHighTrend, DNSErrors, DNSNxDomain, ExternalEgressHighTrend, ExternalIngressHighTrend, CrossAZ
true diff --git a/helm/crds/flows.netobserv.io_flowcollectors.yaml b/helm/crds/flows.netobserv.io_flowcollectors.yaml index 153219ed7..2eecc866d 100644 --- a/helm/crds/flows.netobserv.io_flowcollectors.yaml +++ b/helm/crds/flows.netobserv.io_flowcollectors.yaml @@ -5377,7 +5377,7 @@ spec: description: |- Alert template name. Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md enum: - PacketDropsByKernel @@ -5386,6 +5386,7 @@ spec: - NetpolDenied - LatencyHighTrend - DNSErrors + - DNSNxDomain - ExternalEgressHighTrend - ExternalIngressHighTrend - CrossAZ @@ -5442,7 +5443,7 @@ spec: description: |- `disableAlerts` is a list of alert groups that should be disabled from the default set of alerts. Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`, - `LatencyHighTrend`, `DNSErrors`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. + `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`, `CrossAZ`. More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md items: type: string diff --git a/internal/pkg/metrics/alerts/alerts.go b/internal/pkg/metrics/alerts/alerts.go index 823d4c5dd..030fac7af 100644 --- a/internal/pkg/metrics/alerts/alerts.go +++ b/internal/pkg/metrics/alerts/alerts.go @@ -135,14 +135,14 @@ func (rb *ruleBuilder) dnsErrors() (*monitoringv1.Rule, error) { return nil, nil } description := fmt.Sprintf( - "NetObserv is detecting more than %s%% of DNS errors%s. %s", + "NetObserv is detecting more than %s%% of DNS errors%s (other than NX_DOMAIN). %s", rb.threshold, rb.getAlertLegend(), rb.additionalDescription(), ) metric, totalMetric := rb.getMetricsForAlert() - metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`) + metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!~"NoError|NXDomain"`) totalFilter := rb.buildLabelFilter("") metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "2m", "") totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "2m", "") @@ -150,11 +150,49 @@ func (rb *ruleBuilder) dnsErrors() (*monitoringv1.Rule, error) { totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) - rb.trafficLinkFilter = `dns_flag_response_code!=""` + rb.trafficLink = &trafficLink{ + BackAndForth: false, + ExtraFilter: `dns_flag_response_code!="NoError,NXDomain"`, + FilterDestination: true, + } return rb.createRule(promql, "Too many DNS errors", description) } +func (rb *ruleBuilder) dnsNxDomainErrors() (*monitoringv1.Rule, error) { + // DNS errors are in return traffic only + if rb.side == asSource { + return nil, nil + } + description := fmt.Sprintf( + "NetObserv is detecting more than %s%% of DNS NX_DOMAIN errors%s. In Kubernetes, this is a common error due to the resolution using several search suffixes. It can be optimized by using trailing dots in domain names. %s", + rb.threshold, + rb.getAlertLegend(), + rb.additionalDescription(), + ) + + metric, totalMetric := rb.getMetricsForAlert() + metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode="NXDomain"`) + totalFilter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "2m", "") + totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "2m", "") + metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "") + totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") + promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) + + rb.trafficLink = &trafficLink{ + BackAndForth: false, + ExtraFilter: `dns_flag_response_code="NXDomain"`, + FilterDestination: true, + } + rb.extraLinks = []link{ + // TODO: write our own blog + {Name: "Trailing dot optimization", URL: "https://tech.evaneos.com/how-a-single-dot-can-drastically-improve-performance-771cd3ca888d"}, + } + + return rb.createRule(promql, "Too many DNS NX_DOMAIN errors", description) +} + func (rb *ruleBuilder) netpolDenied() (*monitoringv1.Rule, error) { description := fmt.Sprintf( "NetObserv is detecting more than %s%% of denied traffic due to Network Policies%s. %s", diff --git a/internal/pkg/metrics/alerts/alerts_test.go b/internal/pkg/metrics/alerts/alerts_test.go index 0a5c84f3e..b06254da5 100644 --- a/internal/pkg/metrics/alerts/alerts_test.go +++ b/internal/pkg/metrics/alerts/alerts_test.go @@ -17,6 +17,7 @@ func allTemplates() []flowslatest.AlertTemplate { flowslatest.AlertPacketDropsByKernel, flowslatest.AlertPacketDropsByDevice, flowslatest.AlertDNSErrors, + flowslatest.AlertDNSNxDomain, flowslatest.AlertIPsecErrors, flowslatest.AlertLatencyHighTrend, flowslatest.AlertNetpolDenied, @@ -290,3 +291,16 @@ func TestLatencyPromql(t *testing.T) { ) assert.Equal(t, `{"namespaceLabels":["namespace"],"threshold":"100","unit":"%","upperBound":"500"}`, rules[0].Annotations["netobserv_io_network_health"]) } + +func TestExtraLinks(t *testing.T) { + variant := flowslatest.AlertVariant{ + GroupBy: flowslatest.GroupByNamespace, + Thresholds: flowslatest.AlertThresholds{ + Info: "80", + }, + } + rules, err := convertToRules(flowslatest.AlertDNSNxDomain, &variant, []string{"namespace_dns_bla"}) + assert.NoError(t, err) + assert.Len(t, rules, 1) + assert.Contains(t, rules[0].Annotations["netobserv_io_network_health"], `"links":[{"name":"Trailing dot optimization","url":"https://tech.evaneos.com/how-a-single-dot-can-drastically-improve-performance-771cd3ca888d"}]`) +} diff --git a/internal/pkg/metrics/alerts/builder.go b/internal/pkg/metrics/alerts/builder.go index cef84894c..a907ee47c 100644 --- a/internal/pkg/metrics/alerts/builder.go +++ b/internal/pkg/metrics/alerts/builder.go @@ -21,16 +21,28 @@ const ( ) type ruleBuilder struct { - template flowslatest.AlertTemplate - alert *flowslatest.AlertVariant - enabledMetrics []string - side srcOrDst - severity string - threshold string - upperThreshold string - upperValueRange string - trafficLinkFilter string - duration monitoringv1.Duration + template flowslatest.AlertTemplate + alert *flowslatest.AlertVariant + enabledMetrics []string + side srcOrDst + severity string + threshold string + upperThreshold string + upperValueRange string + trafficLink *trafficLink + extraLinks []link + duration monitoringv1.Duration +} + +type trafficLink struct { + ExtraFilter string `json:"extraFilter"` + BackAndForth bool `json:"backAndForth"` + FilterDestination bool `json:"filterDestination"` +} + +type link struct { + Name string `json:"name"` + URL string `json:"url"` } func BuildRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule { @@ -118,11 +130,14 @@ func (rb *ruleBuilder) convertToRule() (*monitoringv1.Rule, error) { return rb.ipsecErrors() case flowslatest.AlertDNSErrors: return rb.dnsErrors() + case flowslatest.AlertDNSNxDomain: + return rb.dnsNxDomainErrors() case flowslatest.AlertNetpolDenied: return rb.netpolDenied() case flowslatest.AlertLatencyHighTrend: return rb.latencyTrend() - case flowslatest.AlertCrossAZ, flowslatest.AlertExternalEgressHighTrend, flowslatest.AlertExternalIngressHighTrend: // TODO + case flowslatest.AlertCrossAZ, flowslatest.AlertExternalEgressHighTrend, flowslatest.AlertExternalIngressHighTrend: + return nil, nil // TODO case flowslatest.AlertLokiError, flowslatest.AlertNoFlows: // error } @@ -184,8 +199,11 @@ func (rb *ruleBuilder) buildHealthAnnotation(override map[string]any) ([]byte, e if rb.upperValueRange != "" { annotation["upperBound"] = rb.upperValueRange } - if rb.trafficLinkFilter != "" { - annotation["trafficLinkFilter"] = rb.trafficLinkFilter + if rb.trafficLink != nil { + annotation["trafficLink"] = rb.trafficLink + } + if len(rb.extraLinks) > 0 { + annotation["links"] = rb.extraLinks } switch rb.alert.GroupBy { case flowslatest.GroupByNode: From 68cf024ad771fd01b1e2d39dcdc404bf3dba70d2 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 3 Dec 2025 15:30:51 +0100 Subject: [PATCH 3/4] fix test --- internal/pkg/metrics/alerts/alerts_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/pkg/metrics/alerts/alerts_test.go b/internal/pkg/metrics/alerts/alerts_test.go index b06254da5..bc784b059 100644 --- a/internal/pkg/metrics/alerts/alerts_test.go +++ b/internal/pkg/metrics/alerts/alerts_test.go @@ -96,6 +96,8 @@ func TestBuildRules_DefaultWithFeaturesAndDisabled(t *testing.T) { "DNSErrors_Warning", "DNSErrors_PerDstNamespaceWarning", "DNSErrors_PerDstNamespaceInfo", + "DNSNxDomain_PerDstNamespaceWarning", + "DNSNxDomain_PerDstNamespaceInfo", "NetpolDenied_PerSrcNamespaceWarning", "NetpolDenied_PerDstNamespaceWarning", "NetpolDenied_PerSrcNamespaceInfo", From b1a3522624371019277c69b812bb2616bec2c47b Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Tue, 9 Dec 2025 08:47:46 +0100 Subject: [PATCH 4/4] Add code comments on dns optimization --- config/samples/flows_v1beta2_flowcollector.yaml | 1 + internal/controller/consoleplugin/consoleplugin_objects.go | 1 + internal/controller/ebpf/agent_controller.go | 3 ++- internal/pkg/helper/loki_config.go | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/samples/flows_v1beta2_flowcollector.yaml b/config/samples/flows_v1beta2_flowcollector.yaml index 58924a511..5f5e1cbf6 100644 --- a/config/samples/flows_v1beta2_flowcollector.yaml +++ b/config/samples/flows_v1beta2_flowcollector.yaml @@ -151,6 +151,7 @@ spec: # Change mode to "LokiStack" to use with the loki operator mode: Monolithic monolithic: + # NB: trailing dot (...local.:3100) is a DNS optimization for exact name match without extra search url: 'http://loki.netobserv.svc.cluster.local.:3100/' tenantID: netobserv tls: diff --git a/internal/controller/consoleplugin/consoleplugin_objects.go b/internal/controller/consoleplugin/consoleplugin_objects.go index ad57c175d..a6a363157 100644 --- a/internal/controller/consoleplugin/consoleplugin_objects.go +++ b/internal/controller/consoleplugin/consoleplugin_objects.go @@ -395,6 +395,7 @@ func (b *builder) getPromConfig(ctx context.Context) cfg.PrometheusConfig { } if b.desired.Prometheus.Querier.Mode == "" || b.desired.Prometheus.Querier.Mode == flowslatest.PromModeAuto { if b.info.ClusterInfo.IsOpenShift() { + // NB: trailing dot (...local.:9091) is a DNS optimization for exact name match without extra search config.URL = "https://thanos-querier.openshift-monitoring.svc.cluster.local.:9091/" // requires cluster-monitoringv-view cluster role config.DevURL = "https://thanos-querier.openshift-monitoring.svc.cluster.local.:9092/" // restricted to a particular namespace config.ForwardUserToken = true diff --git a/internal/controller/ebpf/agent_controller.go b/internal/controller/ebpf/agent_controller.go index 23085b5d6..48d4ca803 100644 --- a/internal/controller/ebpf/agent_controller.go +++ b/internal/controller/ebpf/agent_controller.go @@ -483,7 +483,8 @@ func (c *AgentController) envConfig(ctx context.Context, coll *flowslatest.FlowC } else { // Send to FLP service config = append(config, corev1.EnvVar{ - Name: envFlowsTargetHost, + Name: envFlowsTargetHost, + // NB: trailing dot (...local.) is a DNS optimization for exact name match without extra search Value: fmt.Sprintf("%s.%s.svc.cluster.local.", constants.FLPName, c.Namespace), }, corev1.EnvVar{ Name: envFlowsTargetPort, diff --git a/internal/pkg/helper/loki_config.go b/internal/pkg/helper/loki_config.go index 56113b3d1..f6246bdb4 100644 --- a/internal/pkg/helper/loki_config.go +++ b/internal/pkg/helper/loki_config.go @@ -18,6 +18,7 @@ func NewLokiConfig(spec *flowslatest.FlowCollectorLoki, namespace string) LokiCo if len(spec.LokiStack.Namespace) > 0 { ns = spec.LokiStack.Namespace } + // NB: trailing dot (...local.:8080) is a DNS optimization for exact name match without extra search gatewayURL := fmt.Sprintf("https://%s-gateway-http.%s.svc.cluster.local.:8080/api/logs/v1/network/", spec.LokiStack.Name, ns) loki.LokiManualParams = flowslatest.LokiManualParams{ QuerierURL: gatewayURL,