Skip to content
Draft
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
# Language-Specific Build Artifacts & Dependencies
# ============================================================================

### Version Managers ###
mise.toml

### Go ###
# Binaries for programs and plugins
*.exe
Expand Down Expand Up @@ -450,4 +453,4 @@ health-monitors/kubernetes-object-monitor/kubernetes-object-monitor
labeler/labeler
metadata-collector/metadata-collector
node-drainer/node-drainer
platform-connectors/platform-connectors
platform-connectors/platform-connectors
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/config"
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp"
awsclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/aws"
azureclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/azure"
gcpclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/gcp"
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/datastore"
eventpkg "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/event"
Expand Down Expand Up @@ -256,7 +257,24 @@ func initActiveMonitor(
return awsMonitor
}

slog.Info("No CSP is explicitly enabled in the configuration (GCP or AWS).")
if cfg.Azure.Enabled {
slog.Info("Azure configuration is enabled.")

azureMonitor, err := azureclient.NewClient(ctx, cfg.Azure, cfg.ClusterName, kubeconfigPath)
if err != nil {
metrics.CSPMonitorErrors.WithLabelValues(string(model.CSPAzure), "init_error").Inc()
slog.Error("Failed to initialize Azure monitor. Azure will not be monitored.", "error", err)

return nil
}

slog.Info("Azure monitor initialized",
"subscriptionID", cfg.Azure.SubscriptionID)

return azureMonitor
}

slog.Info("No CSP is explicitly enabled in the configuration (GCP, AWS, or Azure).")

return nil
}
Expand Down
8 changes: 8 additions & 0 deletions health-monitors/csp-health-monitor/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ toolchain go1.25.3
require (
cloud.google.com/go/compute v1.38.0
cloud.google.com/go/logging v1.13.1
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0
github.com/BurntSushi/toml v1.5.0
github.com/aws/aws-sdk-go-v2 v1.39.6
github.com/aws/aws-sdk-go-v2/config v1.31.18
Expand Down Expand Up @@ -37,6 +39,9 @@ require (
cloud.google.com/go/compute/metadata v0.9.0 // indirect
cloud.google.com/go/iam v1.5.2 // indirect
cloud.google.com/go/longrunning v0.6.7 // indirect
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
github.com/Masterminds/semver/v3 v3.4.0 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.18.22 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect
Expand Down Expand Up @@ -73,6 +78,7 @@ require (
github.com/go-openapi/swag/typeutils v0.25.1 // indirect
github.com/go-openapi/swag/yamlutils v0.25.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang-jwt/jwt/v5 v5.3.0 // indirect
github.com/golang/snappy v1.0.0 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
Expand All @@ -83,10 +89,12 @@ require (
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.2 // indirect
Expand Down
21 changes: 21 additions & 0 deletions health-monitors/csp-health-monitor/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@ cloud.google.com/go/monitoring v1.24.2 h1:5OTsoJ1dXYIiMiuL+sYscLc9BumrL3CarVLL7d
cloud.google.com/go/monitoring v1.24.2/go.mod h1:x7yzPWcgDRnPEv3sI+jJGBkwl5qINf+6qY4eq0I9B4U=
cloud.google.com/go/storage v1.56.0 h1:iixmq2Fse2tqxMbWhLWC9HfBj1qdxqAmiK8/eqtsLxI=
cloud.google.com/go/storage v1.56.0/go.mod h1:Tpuj6t4NweCLzlNbw9Z9iwxEkrSem20AetIeH/shgVU=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0 h1:KpMC6LFL7mqpExyMC9jVOYRiVhLmamjeZfRsUpB7l4s=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I=
github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY=
github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0 h1:rx/pIYQIlCjb+n7TzMyFUzIJYb+d0Gi7Vh+ozA0fSJA=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0/go.mod h1:o8YD+BbSeK8ANH4SpxQFCiz5OIFKgHxV1uwF2FrQJYY=
github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM=
github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 h1:UQUsRi8WTzhZntp5313l+CHIAT95ojUI2lpP/ExlZa4=
Expand Down Expand Up @@ -126,6 +140,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
Expand Down Expand Up @@ -154,6 +170,8 @@ github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+l
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU=
github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
Expand All @@ -178,6 +196,8 @@ github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE
github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw=
github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
Expand Down Expand Up @@ -286,6 +306,7 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
Expand Down
43 changes: 33 additions & 10 deletions health-monitors/csp-health-monitor/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@
)

type Config struct {
MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"`
TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"`
PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"`
NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"`
ClusterName string `toml:"clusterName"`
GCP GCPConfig `toml:"gcp"`
AWS AWSConfig `toml:"aws"`
MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"`
TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"`
PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"`
NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"`
ClusterName string `toml:"clusterName"`
GCP GCPConfig `toml:"gcp"`
AWS AWSConfig `toml:"aws"`
Azure AzureConfig `toml:"azure"`
}

// GCPConfig holds GCP specific configuration.
Expand All @@ -62,6 +63,13 @@
Region string `toml:"region"`
}

// AzureConfig holds the Azure-specific settings of the CSP health monitor.
// It is decoded from the "azure" table of the TOML configuration file
// (see the Azure field of Config).
type AzureConfig struct {
// Enabled selects Azure as the active CSP. When true, initActiveMonitor
// builds the Azure client, and validateCSPConfig rejects the configuration
// if GCP or AWS is enabled at the same time.
Enabled bool `toml:"enabled"`
// SubscriptionID is read from the "subscriptionId" key and is logged once
// the Azure monitor has been initialized.
// NOTE(review): presumably the Azure subscription whose maintenance events
// are queried — confirm against the azure client package.
SubscriptionID string `toml:"subscriptionId"`
// PollingIntervalSeconds is the Azure polling interval in seconds. When
// Azure is enabled, validateCSPConfig requires it to be at least
// minCSPSpecificPollingIntervalSeconds.
PollingIntervalSeconds int `toml:"pollingIntervalSeconds"`
}

// LoadConfig reads the configuration from a TOML file.
func LoadConfig(filePath string) (*Config, error) {
var cfg Config
Expand Down Expand Up @@ -174,7 +182,7 @@
return nil
}

// validateCSPConfig checks GCP/AWS polling intervals and ensures only one CSP is enabled.
// validateCSPConfig checks GCP/AWS/Azure polling intervals and ensures only one CSP is enabled.
func validateCSPConfig(cfg *Config) error {
// Validate GCP polling interval
if cfg.GCP.Enabled && cfg.GCP.APIPollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds {
Expand All @@ -194,9 +202,24 @@
)
}

// Validate Azure polling interval
if cfg.Azure.Enabled && cfg.Azure.PollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds {
return fmt.Errorf(
"azure.pollingIntervalSeconds must be at least %d seconds (got %d)",
minCSPSpecificPollingIntervalSeconds,
cfg.Azure.PollingIntervalSeconds,
)
}

// Ensure only one CSP is enabled
if cfg.GCP.Enabled && cfg.AWS.Enabled {
return fmt.Errorf("multiple CSPs enabled: only one of GCP or AWS can be enabled at a time in the configuration")
count := 0
for _, csp := range []bool{cfg.GCP.Enabled, cfg.AWS.Enabled, cfg.Azure.Enabled} {

Check failure on line 216 in health-monitors/csp-health-monitor/pkg/config/config.go

View workflow job for this annotation

GitHub Actions / health-monitors-lint-test (csp-health-monitor)

missing whitespace above this line (no shared variables above range)
if csp {
count++
}
}
if count > 1 {
return fmt.Errorf("multiple CSPs enabled: only one of GCP, AWS, or Azure can be enabled at a time in the configuration")

Check failure on line 222 in health-monitors/csp-health-monitor/pkg/config/config.go

View workflow job for this annotation

GitHub Actions / health-monitors-lint-test (csp-health-monitor)

The line is 122 characters long, which exceeds the maximum of 120 characters.
}

return nil
Expand Down
Loading
Loading