From 0882c42c077f6669166eb8cfce7ecd904b69dd80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 20 Nov 2025 14:20:59 +0100 Subject: [PATCH 1/9] PMM-14442 PG max connections as env variable. --- managed/cmd/pmm-managed/main.go | 7 ++++++- managed/services/grafana/auth_server.go | 12 ++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/managed/cmd/pmm-managed/main.go b/managed/cmd/pmm-managed/main.go index 714ed0d0a8a..bbc8f6e1230 100644 --- a/managed/cmd/pmm-managed/main.go +++ b/managed/cmd/pmm-managed/main.go @@ -727,6 +727,11 @@ func main() { //nolint:maintidx,cyclop Envar("PMM_NOMAD_GC_PARALLEL_DESTROYS"). Int() + grafanaCacheInvalidationPeriodF := kingpin.Flag("grafana-cache-invalidation-period", "cacheInvalidationPeriod is the period when cache for grafana response should be invalidated ('3s', '1m' etc)"). + Default("3s"). + Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD"). + Duration() + kingpin.Parse() logger.SetupGlobalLogger() @@ -1077,7 +1082,7 @@ func main() { //nolint:maintidx,cyclop l.Fatalf("Failed to get settings: %+v.", err) } - authServer := grafana.NewAuthServer(grafanaClient, db) + authServer := grafana.NewAuthServer(grafanaClient, db, *grafanaCacheInvalidationPeriodF) l.Info("Starting services...") var wg sync.WaitGroup diff --git a/managed/services/grafana/auth_server.go b/managed/services/grafana/auth_server.go index 769dcee32f2..415967b6371 100644 --- a/managed/services/grafana/auth_server.go +++ b/managed/services/grafana/auth_server.go @@ -129,9 +129,6 @@ const lbacHeaderName = "X-Proxy-Filter" // as this code is reserved for auth_request. const authenticationErrorCode = 401 -// cacheInvalidationPeriod is and period when cache for grafana response should be invalidated. -const cacheInvalidationPeriod = 3 * time.Second - // clientError contains authentication error response details. 
type authError struct { code codes.Code // error code for API client; not mapped to HTTP status code @@ -165,11 +162,13 @@ type AuthServer struct { accessControl *accessControl + cacheInvalidationPeriod time.Duration + // TODO server metrics should be provided by middleware https://jira.percona.com/browse/PMM-4326 } // NewAuthServer creates new AuthServer. -func NewAuthServer(c clientInterface, db *reform.DB) *AuthServer { +func NewAuthServer(c clientInterface, db *reform.DB, cacheInvalidationPeriod time.Duration) *AuthServer { return &AuthServer{ c: c, db: db, @@ -178,12 +177,13 @@ func NewAuthServer(c clientInterface, db *reform.DB) *AuthServer { accessControl: &accessControl{ db: db, }, + CacheInvalidationPeriod: cacheInvalidationPeriod, } } // Run runs cache invalidator which removes expired cache items. func (s *AuthServer) Run(ctx context.Context) { - t := time.NewTicker(cacheInvalidationPeriod) + t := time.NewTicker(s.cacheInvalidationPeriod) for { select { @@ -194,7 +194,7 @@ func (s *AuthServer) Run(ctx context.Context) { now := time.Now() s.rw.Lock() for key, item := range s.cache { - if now.Add(-cacheInvalidationPeriod).After(item.created) { + if now.Add(-s.cacheInvalidationPeriod).After(item.created) { delete(s.cache, key) } } From 49f73c1a7b11d8dc67b896c50a20145966f7bd48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 20 Nov 2025 14:40:27 +0100 Subject: [PATCH 2/9] PMM-14442 Typo. 
--- managed/services/grafana/auth_server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/managed/services/grafana/auth_server.go b/managed/services/grafana/auth_server.go index 415967b6371..328d9ce7b19 100644 --- a/managed/services/grafana/auth_server.go +++ b/managed/services/grafana/auth_server.go @@ -177,7 +177,7 @@ func NewAuthServer(c clientInterface, db *reform.DB, cacheInvalidationPeriod tim accessControl: &accessControl{ db: db, }, - CacheInvalidationPeriod: cacheInvalidationPeriod, + cacheInvalidationPeriod: cacheInvalidationPeriod, } } From f78deac48cce972a96b06adfe8fd7720fb252e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 20 Nov 2025 15:11:41 +0100 Subject: [PATCH 3/9] PMM-14442 PG env variables. --- managed/cmd/pmm-managed-init/main.go | 10 +++++-- managed/models/settings.go | 6 ++++ managed/models/settings_helpers.go | 35 ++++++++++++++++++++++ managed/services/supervisord/pmm_config.go | 2 ++ managed/utils/validators/validators.go | 22 ++++++++++++++ 5 files changed, 72 insertions(+), 3 deletions(-) diff --git a/managed/cmd/pmm-managed-init/main.go b/managed/cmd/pmm-managed-init/main.go index 4934ef14013..d4bfa69259d 100644 --- a/managed/cmd/pmm-managed-init/main.go +++ b/managed/cmd/pmm-managed-init/main.go @@ -52,9 +52,13 @@ func main() { os.Exit(1) } - pmmConfigParams := make(map[string]any) - pmmConfigParams["DisableInternalDB"], _ = strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_POSTGRES")) - pmmConfigParams["DisableInternalClickhouse"], _ = strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_CLICKHOUSE")) + pmmConfigParams := map[string]any{ + "DisableInternalDB": func() bool { v, _ := strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_POSTGRES")); return v }(), + "DisableInternalClickhouse": func() bool { v, _ := strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_CLICKHOUSE")); return v }(), + "PostgresMaxConnections": envSettings.PostgresMaxConnections, + "PostgresSharedBuffers": 
envSettings.PostgresSharedBuffers, + } + if err := supervisord.SavePMMConfig(pmmConfigParams); err != nil { logrus.Errorf("PMM Server configuration error: %s.", err) os.Exit(1) diff --git a/managed/models/settings.go b/managed/models/settings.go index 0937666c214..37aa40e16db 100644 --- a/managed/models/settings.go +++ b/managed/models/settings.go @@ -118,6 +118,12 @@ type Settings struct { // Contains all encrypted tables in format 'db.table.column'. EncryptedItems []string `json:"encrypted_items"` + + // PostgreSQL max_connections + PostgresMaxConnections string `json:"postgres_max_connections"` + + // PostgreSQL shared_buffers + PostgresSharedBuffers string `json:"postgres_shared_buffers"` } // IsAlertingEnabled returns true if alerting is enabled. diff --git a/managed/models/settings_helpers.go b/managed/models/settings_helpers.go index b5400318c90..b01425c2d8a 100644 --- a/managed/models/settings_helpers.go +++ b/managed/models/settings_helpers.go @@ -18,6 +18,7 @@ package models import ( "encoding/json" "fmt" + "strconv" "time" "github.com/AlekSi/pointer" @@ -100,6 +101,12 @@ type ChangeSettingsParams struct { // List of items in format 'db.table.column' to be encrypted. EncryptedItems []string + + // PostgreSQL max_connections + PostgresMaxConnections string + + // PostgreSQL shared_buffers + PostgresSharedBuffers string } // SetPMMServerID should be run on start up to generate unique PMM Server ID. 
@@ -240,6 +247,16 @@ func UpdateSettings(q reform.DBTX, params *ChangeSettingsParams) (*Settings, err settings.EncryptedItems = params.EncryptedItems } + // Update PostgresMaxConnections if provided + if params.PostgresMaxConnections != "" { + settings.PostgresMaxConnections = params.PostgresMaxConnections + } + + // Update PostgresSharedBuffers if provided + if params.PostgresSharedBuffers != "" { + settings.PostgresSharedBuffers = params.PostgresSharedBuffers + } + err = SaveSettings(q, settings) if err != nil { return nil, err @@ -317,6 +334,24 @@ func ValidateSettings(params *ChangeSettingsParams) error { return err } + // Validate PostgresMaxConnections (must be a positive integer string) + if params.PostgresMaxConnections != "" { + if _, err := strconv.Atoi(params.PostgresMaxConnections); err != nil || params.PostgresMaxConnections[0] == '-' { + return errors.Errorf("PostgresMaxConnections: must be a positive integer string") + } + } else { + params.PostgresMaxConnections = "500" + } + + // Validate PostgresSharedBuffers (must be a size like 512MB, 1GB, etc.) + if params.PostgresSharedBuffers != "" { + if err := validators.ValidateSize(params.PostgresSharedBuffers); err != nil { + return errors.Errorf("PostgresSharedBuffers: %v", err) + } + } else { + params.PostgresSharedBuffers = "256MB" + } + return nil } diff --git a/managed/services/supervisord/pmm_config.go b/managed/services/supervisord/pmm_config.go index 303626f0ac9..3fd96206b0c 100644 --- a/managed/services/supervisord/pmm_config.go +++ b/managed/services/supervisord/pmm_config.go @@ -114,6 +114,8 @@ command = -c pg_stat_statements.track=all -c pg_stat_statements.save=off -c logging_collector=off + -c max_connections={{ .PostgresMaxConnections }} + -c shared_buffers={{ .PostgresSharedBuffers }} autorestart = true autostart = true startretries = 10 diff --git a/managed/utils/validators/validators.go b/managed/utils/validators/validators.go index 9bf330bfdd0..9ce6df04704 100644 --- a/managed/utils/validators/validators.go +++ 
b/managed/utils/validators/validators.go @@ -109,3 +109,25 @@ func AWSPartitions() []string { "aws-us-gov", // U.S. GovCloud regions } } + +func ValidateSize(size string) error { + if size == "" { + return fmt.Errorf("size string is empty") + } + valid := false + suffixes := []string{"kB", "MB", "GB", "KB", "mb", "gb", "kb"} + for _, s := range suffixes { + if len(size) > len(s) && size[len(size)-len(s):] == s { + num := size[:len(size)-len(s)] + if num != "" { + valid = true + break + } + } + } + if !valid { + return fmt.Errorf("must be like 512MB, 1GB, etc") + } + + return nil +} From a9bcb1529c520cc91e7ad5f2cede340289012994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 20 Nov 2025 15:12:26 +0100 Subject: [PATCH 4/9] PMM-14442 Fix Grafana test. --- managed/services/grafana/auth_server_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/managed/services/grafana/auth_server_test.go b/managed/services/grafana/auth_server_test.go index 10afd4ea4b7..64276c544f2 100644 --- a/managed/services/grafana/auth_server_test.go +++ b/managed/services/grafana/auth_server_test.go @@ -69,7 +69,7 @@ func TestAuthServerAuthenticate(t *testing.T) { ctx := context.Background() c := NewClient("127.0.0.1:3000") - s := NewAuthServer(c, nil) + s := NewAuthServer(c, nil, time.Duration(3)*time.Second) req, err := http.NewRequestWithContext(ctx, http.MethodGet, "/dummy", nil) require.NoError(t, err) @@ -184,7 +184,7 @@ func TestServerClientConnection(t *testing.T) { ctx := context.Background() c := NewClient("127.0.0.1:3000") - s := NewAuthServer(c, nil) + s := NewAuthServer(c, nil, time.Duration(3)*time.Second) t.Run("Basic auth - success", func(t *testing.T) { t.Parallel() @@ -261,7 +261,7 @@ func TestAuthServerAddVMGatewayToken(t *testing.T) { }(t) c := NewClient("127.0.0.1:3000") - s := NewAuthServer(c, db) + s := NewAuthServer(c, db, time.Duration(3)*time.Second) roleA := models.Role{ Title: "Role A", From 
a6e2a4bc2faa761a055c06ac9ca1659767940257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 20 Nov 2025 15:21:55 +0100 Subject: [PATCH 5/9] PMM-14442 Lint. --- managed/cmd/pmm-managed/main.go | 8 ++++---- managed/utils/validators/validators.go | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/managed/cmd/pmm-managed/main.go b/managed/cmd/pmm-managed/main.go index bbc8f6e1230..71b2fc13d27 100644 --- a/managed/cmd/pmm-managed/main.go +++ b/managed/cmd/pmm-managed/main.go @@ -727,10 +727,10 @@ func main() { //nolint:maintidx,cyclop Envar("PMM_NOMAD_GC_PARALLEL_DESTROYS"). Int() - grafanaCacheInvalidationPeriodF := kingpin.Flag("grafana-cache-invalidation-period", "cacheInvalidationPeriod is the period when cache for grafana response should be invalidated ('3s', '1m' etc)"). - Default("3s"). - Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD"). - Duration() + grafanaCacheInvalidationPeriodF := kingpin.Flag( + "grafana-cache-invalidation-period", + "Period for Grafana cache invalidation (e.g. '3s', '1m')"). + Default("3s").Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD").Duration() kingpin.Parse() diff --git a/managed/utils/validators/validators.go b/managed/utils/validators/validators.go index 9ce6df04704..14b94ef60d9 100644 --- a/managed/utils/validators/validators.go +++ b/managed/utils/validators/validators.go @@ -110,6 +110,7 @@ func AWSPartitions() []string { } } +// ValidateSize checks if a string matches the pattern (e.g., 512MB, 1GB, etc.) func ValidateSize(size string) error { if size == "" { return fmt.Errorf("size string is empty") From 259d21ab1142b182eed7dfac13fd6b674ffa5b94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 24 Nov 2025 10:09:02 +0100 Subject: [PATCH 6/9] PMM-14442 Fix tests. 
--- managed/services/supervisord/pmm_config_test.go | 9 +++++++-- managed/testdata/supervisord.d/pmm-db_enabled.ini | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/managed/services/supervisord/pmm_config_test.go b/managed/services/supervisord/pmm_config_test.go index bb36783e3a5..f129714455e 100644 --- a/managed/services/supervisord/pmm_config_test.go +++ b/managed/services/supervisord/pmm_config_test.go @@ -39,8 +39,13 @@ func TestSavePMMConfig(t *testing.T) { }, { description: "enable internal postgresql db", - params: map[string]any{"DisableInternalDB": false, "DisableSupervisor": false, "DisableInternalClickhouse": false}, - file: "pmm-db_enabled", + params: map[string]any{ + "DisableInternalDB": false, + "DisableSupervisor": false, + "DisableInternalClickhouse": false, + "PostgresMaxConnections": "1000", + "PostgresSharedBuffers": "512MB"}, + file: "pmm-db_enabled", }, } for _, test := range tests { diff --git a/managed/testdata/supervisord.d/pmm-db_enabled.ini b/managed/testdata/supervisord.d/pmm-db_enabled.ini index 54d70f365c2..834d4588bab 100644 --- a/managed/testdata/supervisord.d/pmm-db_enabled.ini +++ b/managed/testdata/supervisord.d/pmm-db_enabled.ini @@ -34,6 +34,8 @@ command = -c pg_stat_statements.track=all -c pg_stat_statements.save=off -c logging_collector=off + -c max_connections=1000 + -c shared_buffers=512MB autorestart = true autostart = true startretries = 10 From 1f3b6482323bed80cb6c2c90a259985f71280d80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 24 Nov 2025 10:30:49 +0100 Subject: [PATCH 7/9] PMM-14442 Make format. 
--- managed/services/supervisord/pmm_config_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/managed/services/supervisord/pmm_config_test.go b/managed/services/supervisord/pmm_config_test.go index f129714455e..1430f1d7fca 100644 --- a/managed/services/supervisord/pmm_config_test.go +++ b/managed/services/supervisord/pmm_config_test.go @@ -44,7 +44,8 @@ func TestSavePMMConfig(t *testing.T) { "DisableSupervisor": false, "DisableInternalClickhouse": false, "PostgresMaxConnections": "1000", - "PostgresSharedBuffers": "512MB"}, + "PostgresSharedBuffers": "512MB", + }, file: "pmm-db_enabled", }, } From bdd5a93426979fb09ad703a95c1ba9a994187171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 24 Nov 2025 10:56:24 +0100 Subject: [PATCH 8/9] PMM-14442 Raised invalidation default. --- managed/cmd/pmm-managed/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/managed/cmd/pmm-managed/main.go b/managed/cmd/pmm-managed/main.go index 71b2fc13d27..1b454655eb8 100644 --- a/managed/cmd/pmm-managed/main.go +++ b/managed/cmd/pmm-managed/main.go @@ -730,7 +730,7 @@ func main() { //nolint:maintidx,cyclop grafanaCacheInvalidationPeriodF := kingpin.Flag( "grafana-cache-invalidation-period", "Period for Grafana cache invalidation (e.g. '3s', '1m')"). - Default("3s").Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD").Duration() + Default("5s").Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD").Duration() kingpin.Parse() From aae9f09fa1021519b9dcf4854764d94ed498cf3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 24 Nov 2025 10:56:36 +0100 Subject: [PATCH 9/9] PMM-14442 Connection issues doc. 
--- .../docs/troubleshoot/connection_issues.md | 32 +++++++++++++++++++ documentation/docs/troubleshoot/index.md | 1 + 2 files changed, 33 insertions(+) create mode 100644 documentation/docs/troubleshoot/connection_issues.md diff --git a/documentation/docs/troubleshoot/connection_issues.md b/documentation/docs/troubleshoot/connection_issues.md new file mode 100644 index 00000000000..453e3df722b --- /dev/null +++ b/documentation/docs/troubleshoot/connection_issues.md @@ -0,0 +1,32 @@ + +# Connection Issues: PMM Clients Disconnecting + +## Problem +When deploying hundreds of PMM clients, you may observe frequent disconnects or instability in PMM Server. This typically manifests as agents losing connection, failed metric collection, or intermittent errors in the PMM UI. + +## Cause +The default configuration for PMM Server's internal PostgreSQL and Grafana cache may not be sufficient to handle a large number of concurrent connections from many PMM clients. This can lead to resource exhaustion, connection drops, or degraded performance. + +## Solution +Increase the following environment variables to improve stability and support more PMM clients: + +- `PMM_POSTGRES_MAX_CONNECTIONS`: Increase this value to allow more concurrent database connections. Default: 500. Example: `PMM_POSTGRES_MAX_CONNECTIONS=1000` +- `PMM_POSTGRES_SHARED_BUFFERS`: Increase shared buffers for PostgreSQL to improve performance. Default: 256MB. Example: `PMM_POSTGRES_SHARED_BUFFERS=512MB` +- `PMM_GRAFANA_CACHE_INVALIDATION_PERIOD`: Increase the cache invalidation period to reduce load on Grafana. Default: 5 seconds. Example: `PMM_GRAFANA_CACHE_INVALIDATION_PERIOD=30s` + +Set these variables in your PMM Server environment (Docker, Compose, or systemd unit), then restart PMM Server and PostgreSQL for the changes to take effect. 
+ +## Example (Docker) +```sh +docker run -d \ + -e PMM_POSTGRES_MAX_CONNECTIONS=1000 \ + -e PMM_GRAFANA_CACHE_INVALIDATION_PERIOD=10s \ + -e PMM_POSTGRES_SHARED_BUFFERS=512MB \ + ...other options... \ + percona/pmm-server:latest +``` + +## Additional Notes +- Monitor PMM Server logs for connection errors and adjust values as needed. +- Ensure your server hardware and resources are sufficient for the increased limits. +- For very large deployments, consider using an external PostgreSQL database or [High Availability setup](../install-pmm/HA.md). diff --git a/documentation/docs/troubleshoot/index.md b/documentation/docs/troubleshoot/index.md index b1b04c58bc4..9a70f952408 100644 --- a/documentation/docs/troubleshoot/index.md +++ b/documentation/docs/troubleshoot/index.md @@ -6,6 +6,7 @@ To quickly identify the issues and find the appropriate solution, the issues are - [Upgrade issues](upgrade_issues.md) - [Configuration issues](config_issues.md) +- [Connection issues](connection_issues.md) - [Percona Alerting issues](alerting_issues.md) - [QAN issues](qan_issues.md) - [Plugins issues](plugin_issues.md)