Skip to content
32 changes: 32 additions & 0 deletions documentation/docs/troubleshoot/connection_issues.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Connection Issues: PMM Clients Disconnecting

## Problem
When deploying hundreds of PMM clients, you may observe frequent disconnects or instability in PMM Server. This typically manifests as agents losing connection, failed metric collection, or intermittent errors in the PMM UI.

## Cause
The default configuration for PMM Server's internal PostgreSQL and Grafana cache may not be sufficient to handle a large number of concurrent connections from many PMM clients. This can lead to resource exhaustion, connection drops, or degraded performance.

## Solution
Increase the following environment variables to improve stability and support more PMM clients:

- `PMM_POSTGRES_MAX_CONNECTIONS`: Increase this value to allow more concurrent database connections. Default: 500. Example: `PMM_POSTGRES_MAX_CONNECTIONS=1000`
- `PMM_POSTGRES_SHARED_BUFFERS`: Increase shared buffers for PostgreSQL to improve performance. Default: 256MB. Example: `PMM_POSTGRES_SHARED_BUFFERS=512MB`
- `PMM_GRAFANA_CACHE_INVALIDATION_PERIOD`: Increase the cache invalidation period to reduce load on Grafana. Default: 5 seconds. Example: `PMM_GRAFANA_CACHE_INVALIDATION_PERIOD=30s`

Set these variables in your PMM Server environment (Docker, Compose, or systemd unit), then restart PMM Server and PostgreSQL for the changes to take effect.

## Example (Docker)
```sh
docker run -d \
-e PMM_POSTGRES_MAX_CONNECTIONS=1000 \
-e PMM_GRAFANA_CACHE_INVALIDATION_PERIOD=10s \
-e PMM_POSTGRES_SHARED_BUFFERS=512MB \
...other options... \
percona/pmm-server:latest
```

## Additional Notes
- Monitor PMM Server logs for connection errors and adjust values as needed.
- Ensure your server hardware and resources are sufficient for the increased limits.
- For very large deployments, consider using an external PostgreSQL database or [High Availability setup](../install-pmm/HA.md).
1 change: 1 addition & 0 deletions documentation/docs/troubleshoot/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ To quickly identify the issues and find the appropriate solution, the issues are

- [Upgrade issues](upgrade_issues.md)
- [Configuration issues](config_issues.md)
- [Connection issues](connection_issues.md)
- [Percona Alerting issues](alerting_issues.md)
- [QAN issues](qan_issues.md)
- [Plugins issues](plugin_issues.md)
Expand Down
10 changes: 7 additions & 3 deletions managed/cmd/pmm-managed-init/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,13 @@ func main() {
os.Exit(1)
}

pmmConfigParams := make(map[string]any)
pmmConfigParams["DisableInternalDB"], _ = strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_POSTGRES"))
pmmConfigParams["DisableInternalClickhouse"], _ = strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_CLICKHOUSE"))
pmmConfigParams := map[string]any{
"DisableInternalDB": func() bool { v, _ := strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_POSTGRES")); return v }(),
"DisableInternalClickhouse": func() bool { v, _ := strconv.ParseBool(os.Getenv("PMM_DISABLE_BUILTIN_CLICKHOUSE")); return v }(),
"PostgresMaxConnections": envSettings.PostgresMaxConnections,
"PostgresSharedBuffers": envSettings.PostgresSharedBuffers,
}

if err := supervisord.SavePMMConfig(pmmConfigParams); err != nil {
logrus.Errorf("PMM Server configuration error: %s.", err)
os.Exit(1)
Expand Down
7 changes: 6 additions & 1 deletion managed/cmd/pmm-managed/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,11 @@ func main() { //nolint:maintidx,cyclop
Envar("PMM_NOMAD_GC_PARALLEL_DESTROYS").
Int()

grafanaCacheInvalidationPeriodF := kingpin.Flag(
"grafana-cache-invalidation-period",
"Period for Grafana cache invalidation (e.g. '3s', '1m')").
Default("5s").Envar("PMM_GRAFANA_CACHE_INVALIDATION_PERIOD").Duration()

kingpin.Parse()

logger.SetupGlobalLogger()
Expand Down Expand Up @@ -1077,7 +1082,7 @@ func main() { //nolint:maintidx,cyclop
l.Fatalf("Failed to get settings: %+v.", err)
}

authServer := grafana.NewAuthServer(grafanaClient, db)
authServer := grafana.NewAuthServer(grafanaClient, db, *grafanaCacheInvalidationPeriodF)

l.Info("Starting services...")
var wg sync.WaitGroup
Expand Down
6 changes: 6 additions & 0 deletions managed/models/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ type Settings struct {

// Contains all encrypted tables in format 'db.table.column'.
EncryptedItems []string `json:"encrypted_items"`

// PostgreSQL max_connections
PostgresMaxConnections string `json:"postgres_max_connections"`

// PostgreSQL shared_buffers
PostgresSharedBuffers string `json:"postgres_shared_buffers"`
}

// IsAlertingEnabled returns true if alerting is enabled.
Expand Down
35 changes: 35 additions & 0 deletions managed/models/settings_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package models
import (
"encoding/json"
"fmt"
"strconv"
"time"

"github.com/AlekSi/pointer"
Expand Down Expand Up @@ -100,6 +101,12 @@ type ChangeSettingsParams struct {

// List of items in format 'db.table.column' to be encrypted.
EncryptedItems []string

// PostgreSQL max_connections
PostgresMaxConnections string

// PostgreSQL shared_buffers
PostgresSharedBuffers string
}

// SetPMMServerID should be run on start up to generate unique PMM Server ID.
Expand Down Expand Up @@ -240,6 +247,16 @@ func UpdateSettings(q reform.DBTX, params *ChangeSettingsParams) (*Settings, err
settings.EncryptedItems = params.EncryptedItems
}

// Update PostgresMaxConnections if provided
if params.PostgresMaxConnections != "" {
settings.PostgresMaxConnections = params.PostgresMaxConnections
}

// Update PostgresSharedBuffers if provided
if params.PostgresSharedBuffers != "" {
settings.PostgresSharedBuffers = params.PostgresSharedBuffers
}

err = SaveSettings(q, settings)
if err != nil {
return nil, err
Expand Down Expand Up @@ -317,6 +334,24 @@ func ValidateSettings(params *ChangeSettingsParams) error {
return err
}

// Validate PostgresMaxConnections (must be a positive integer string)
if params.PostgresMaxConnections != "" {
if _, err := strconv.Atoi(params.PostgresMaxConnections); err != nil || params.PostgresMaxConnections[0] == '-' {
return errors.Errorf("PostgresMaxConnections: must be a positive integer string")
}
} else {
params.PostgresMaxConnections = "500"
}

// Validate PostgresSharedBuffers (must match <number><unit>)
if params.PostgresSharedBuffers != "" {
if err := validators.ValidateSize(params.PostgresSharedBuffers); err != nil {
return errors.Errorf("PostgresSharedBuffers: %v", err)
}
} else {
params.PostgresSharedBuffers = "256MB"
}

return nil
}

Expand Down
12 changes: 6 additions & 6 deletions managed/services/grafana/auth_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,6 @@ const lbacHeaderName = "X-Proxy-Filter"
// as this code is reserved for auth_request.
const authenticationErrorCode = 401

// cacheInvalidationPeriod is the period after which cached Grafana responses are invalidated.
const cacheInvalidationPeriod = 3 * time.Second

// clientError contains authentication error response details.
type authError struct {
code codes.Code // error code for API client; not mapped to HTTP status code
Expand Down Expand Up @@ -165,11 +162,13 @@ type AuthServer struct {

accessControl *accessControl

cacheInvalidationPeriod time.Duration

// TODO server metrics should be provided by middleware https://jira.percona.com/browse/PMM-4326
}

// NewAuthServer creates new AuthServer.
func NewAuthServer(c clientInterface, db *reform.DB) *AuthServer {
func NewAuthServer(c clientInterface, db *reform.DB, cacheInvalidationPeriod time.Duration) *AuthServer {
return &AuthServer{
c: c,
db: db,
Expand All @@ -178,12 +177,13 @@ func NewAuthServer(c clientInterface, db *reform.DB) *AuthServer {
accessControl: &accessControl{
db: db,
},
cacheInvalidationPeriod: cacheInvalidationPeriod,
}
}

// Run runs cache invalidator which removes expired cache items.
func (s *AuthServer) Run(ctx context.Context) {
t := time.NewTicker(cacheInvalidationPeriod)
t := time.NewTicker(s.cacheInvalidationPeriod)

for {
select {
Expand All @@ -194,7 +194,7 @@ func (s *AuthServer) Run(ctx context.Context) {
now := time.Now()
s.rw.Lock()
for key, item := range s.cache {
if now.Add(-cacheInvalidationPeriod).After(item.created) {
if now.Add(-s.cacheInvalidationPeriod).After(item.created) {
delete(s.cache, key)
}
}
Expand Down
6 changes: 3 additions & 3 deletions managed/services/grafana/auth_server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func TestAuthServerAuthenticate(t *testing.T) {

ctx := context.Background()
c := NewClient("127.0.0.1:3000")
s := NewAuthServer(c, nil)
s := NewAuthServer(c, nil, time.Duration(3)*time.Second)

req, err := http.NewRequestWithContext(ctx, http.MethodGet, "/dummy", nil)
require.NoError(t, err)
Expand Down Expand Up @@ -184,7 +184,7 @@ func TestServerClientConnection(t *testing.T) {

ctx := context.Background()
c := NewClient("127.0.0.1:3000")
s := NewAuthServer(c, nil)
s := NewAuthServer(c, nil, time.Duration(3)*time.Second)

t.Run("Basic auth - success", func(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -261,7 +261,7 @@ func TestAuthServerAddVMGatewayToken(t *testing.T) {
}(t)

c := NewClient("127.0.0.1:3000")
s := NewAuthServer(c, db)
s := NewAuthServer(c, db, time.Duration(3)*time.Second)

roleA := models.Role{
Title: "Role A",
Expand Down
2 changes: 2 additions & 0 deletions managed/services/supervisord/pmm_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ command =
-c pg_stat_statements.track=all
-c pg_stat_statements.save=off
-c logging_collector=off
-c max_connections={{ .PostgresMaxConnections }}
-c shared_buffers={{ .PostgresSharedBuffers }}
autorestart = true
autostart = true
startretries = 10
Expand Down
10 changes: 8 additions & 2 deletions managed/services/supervisord/pmm_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,14 @@ func TestSavePMMConfig(t *testing.T) {
},
{
description: "enable internal postgresql db",
params: map[string]any{"DisableInternalDB": false, "DisableSupervisor": false, "DisableInternalClickhouse": false},
file: "pmm-db_enabled",
params: map[string]any{
"DisableInternalDB": false,
"DisableSupervisor": false,
"DisableInternalClickhouse": false,
"PostgresMaxConnections": "1000",
"PostgresSharedBuffers": "512MB",
},
file: "pmm-db_enabled",
},
}
for _, test := range tests {
Expand Down
2 changes: 2 additions & 0 deletions managed/testdata/supervisord.d/pmm-db_enabled.ini
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ command =
-c pg_stat_statements.track=all
-c pg_stat_statements.save=off
-c logging_collector=off
-c max_connections=1000
-c shared_buffers=512MB
autorestart = true
autostart = true
startretries = 10
Expand Down
23 changes: 23 additions & 0 deletions managed/utils/validators/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,26 @@ func AWSPartitions() []string {
"aws-us-gov", // U.S. GovCloud regions
}
}

// ValidateSize checks that size has the form <number><unit>, for example 512MB or 1GB.
//
// The number part must be an unsigned decimal: one or more digits, optionally
// followed by a fractional part (1.5GB). An empty or non-numeric prefix, a sign,
// or embedded whitespace is rejected, so an accepted value consists only of
// digits, at most one dot, and the unit.
//
// Accepted units are kB, MB, GB, KB, mb, gb and kb.
// NOTE(review): PostgreSQL documents its memory units as case-sensitive
// (B, kB, MB, GB, TB); confirm whether KB/mb/gb/kb should stay accepted for
// values that end up in PostgreSQL settings.
//
// It returns nil for a well-formed size and a non-nil error otherwise.
func ValidateSize(size string) error {
	if size == "" {
		return fmt.Errorf("size string is empty")
	}

	suffixes := []string{"kB", "MB", "GB", "KB", "mb", "gb", "kb"}
	for _, s := range suffixes {
		// The string must be strictly longer than the unit so that a number part exists.
		if len(size) <= len(s) || size[len(size)-len(s):] != s {
			continue
		}
		if isUnsignedDecimal(size[:len(size)-len(s)]) {
			return nil
		}
	}

	return fmt.Errorf("must be like 512MB, 1GB, etc")
}

// isUnsignedDecimal reports whether s is a non-empty run of ASCII digits with at
// most one decimal point, where the point has a digit on each side.
func isUnsignedDecimal(s string) bool {
	if s == "" {
		return false
	}

	dotSeen := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
			// Digits are always allowed.
		case c == '.' && !dotSeen && i > 0 && i < len(s)-1:
			// A single interior dot; its neighbours are checked by the other iterations.
			dotSeen = true
		default:
			return false
		}
	}

	return true
}
Loading