From 8f09cce19663bdd68bc8652f80f602a9fbd404ce Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Thu, 27 Nov 2025 17:49:13 +0300 Subject: [PATCH 1/8] PMM-14442 Fix the data race --- managed/services/ha/services.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/managed/services/ha/services.go b/managed/services/ha/services.go index d4c35e54bf8..157c04e3032 100644 --- a/managed/services/ha/services.go +++ b/managed/services/ha/services.go @@ -68,14 +68,14 @@ func (s *services) StartAllServices(ctx context.Context) { if _, ok := s.running[id]; !ok { s.wg.Add(1) s.running[id] = service - go func() { - s.l.Infoln("Starting", service.ID()) - err := service.Start(ctx) + go func(svc LeaderService, svcID string) { + s.l.Infoln("Starting", svcID) + err := svc.Start(ctx) if err != nil { s.l.Errorln(err) - s.removeService(service.ID()) + s.removeService(svcID) } - }() + }(service, id) } } } From fb0d8c81a6ef33f2edef6ef187dd4ec41bc3bc97 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Fri, 28 Nov 2025 17:44:17 +0300 Subject: [PATCH 2/8] PMM-14442 Improve waits and locks --- managed/services/ha/highavailability.go | 18 ++++------ managed/services/ha/services.go | 45 ++++++++++++++++--------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/managed/services/ha/highavailability.go b/managed/services/ha/highavailability.go index 82f27babfe3..93b5a17cc14 100644 --- a/managed/services/ha/highavailability.go +++ b/managed/services/ha/highavailability.go @@ -93,9 +93,7 @@ func New(params *models.HAParams) *Service { // Run runs the high availability service. func (s *Service) Run(ctx context.Context) error { - s.wg.Add(1) - go func() { - defer s.wg.Done() + s.wg.Go(func() { for { select { case <-s.services.Refresh(): @@ -107,7 +105,7 @@ func (s *Service) Run(ctx context.Context) error { return } } - }() + }) if !s.params.Enabled { s.l.Infoln("High availability is disabled") @@ -197,17 +195,13 @@ func (s *Service) Run(ctx context.Context) error { return fmt.Errorf("failed to join memberlist cluster: %w", err) } } - s.wg.Add(1) - go func() { - defer s.wg.Done() + s.wg.Go(func() { s.runLeaderObserver(ctx) - }() + }) - s.wg.Add(1) - go func() { - defer s.wg.Done() + s.wg.Go(func() { s.runRaftNodesSynchronizer(ctx) - }() + }) <-ctx.Done() diff --git a/managed/services/ha/services.go b/managed/services/ha/services.go index 157c04e3032..6e886ab3b01 100644 --- a/managed/services/ha/services.go +++ b/managed/services/ha/services.go @@ -50,7 +50,7 @@ func (s *services) Add(service LeaderService) error { id := service.ID() if _, ok := s.all[id]; ok { - return fmt.Errorf("service with id %s is already exist", id) + return fmt.Errorf("service with id %s already exists", id) } s.all[id] = service select { @@ -61,33 +61,46 @@ func (s *services) Add(service LeaderService) error { } func (s *services) StartAllServices(ctx context.Context) { - s.rw.Lock() - defer s.rw.Unlock() + type startItem struct { + svc LeaderService + id string + } + var toStart []startItem + s.rw.Lock() for id, service := range s.all { if _, ok := s.running[id]; !ok { - s.wg.Add(1) s.running[id] = service - go func(svc LeaderService, svcID string) { - s.l.Infoln("Starting", svcID) - err := svc.Start(ctx) - if err != nil { - s.l.Errorln(err) - s.removeService(svcID) - } - }(service, id) + toStart = append(toStart, startItem{svc: service, id: id}) } } + s.rw.Unlock() + + for _, service := range toStart { + s.wg.Add(1) + go func(svc LeaderService, svcID string) { + s.l.Infoln("Starting", svcID) + err := svc.Start(ctx) + if err != nil { + s.l.Errorln(err) + s.removeService(svcID) + } + }(service.svc, service.id) + } } func (s *services) StopRunningServices() { s.rw.Lock() - defer s.rw.Unlock() - + toStop := make([]LeaderService, 0, len(s.running)) for id, service := range s.running { + toStop = append(toStop, service) + delete(s.running, id) + } + s.rw.Unlock() + + for _, service := range toStop { s.l.Infoln("Stopping", service.ID()) service.Stop() - delete(s.running, id) s.wg.Done() } } @@ -102,7 +115,7 @@ func (s *services) Wait() { func (s *services) removeService(id string) { s.rw.Lock() - defer s.rw.Unlock() delete(s.running, id) + s.rw.Unlock() s.wg.Done() } From 2998a6ed794165d3b28c6d97a2d0faaf8e395258 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sat, 29 Nov 2025 00:20:31 +0300 Subject: [PATCH 3/8] PMM-14442 Change the wait order --- managed/services/ha/highavailability.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/managed/services/ha/highavailability.go b/managed/services/ha/highavailability.go index 93b5a17cc14..32ac3cd084d 100644 --- a/managed/services/ha/highavailability.go +++ b/managed/services/ha/highavailability.go @@ -109,8 +109,8 @@ func (s *Service) Run(ctx context.Context) error { if !s.params.Enabled { s.l.Infoln("High availability is disabled") - s.services.Wait() s.wg.Wait() + s.services.Wait() return nil } @@ -205,8 +205,8 @@ func (s *Service) Run(ctx context.Context) error { <-ctx.Done() - s.services.Wait() s.wg.Wait() + s.services.Wait() return nil } From 6bcf2f304600db4082e51d858ee7bc38d447c433 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sat, 29 Nov 2025 01:22:32 +0300 Subject: [PATCH 4/8] PMM-14442 Fix VM -dryRun parameter syntax --- managed/services/ha/services.go | 2 +- managed/services/victoriametrics/victoriametrics.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/managed/services/ha/services.go b/managed/services/ha/services.go index 6e886ab3b01..f33efd787c9 100644 --- a/managed/services/ha/services.go +++ b/managed/services/ha/services.go @@ -65,9 +65,9 @@ func (s *services) StartAllServices(ctx context.Context) { svc LeaderService id string } - var toStart []startItem s.rw.Lock() + toStart := make([]startItem, 0, len(s.all)) for id, service := range s.all { if _, ok := s.running[id]; !ok { s.running[id] = service diff --git a/managed/services/victoriametrics/victoriametrics.go b/managed/services/victoriametrics/victoriametrics.go index f9a3ca845a7..72018be0f19 100644 --- a/managed/services/victoriametrics/victoriametrics.go +++ b/managed/services/victoriametrics/victoriametrics.go @@ -242,7 +242,7 @@ func (svc *Service) validateConfig(ctx context.Context, cfg []byte) error { _ = os.Remove(f.Name()) }() - args := []string{"-dryRun", "-promscrape.config", f.Name()} + args := []string{"-promscrape.config.dryRun=true", "-promscrape.config", f.Name()} cmd := exec.CommandContext(ctx, "victoriametrics", args...) //nolint:gosec pdeathsig.Set(cmd, unix.SIGKILL) From 8847b57f53630c7e7ed0b7c1c61c67ef61e76007 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sat, 29 Nov 2025 01:39:19 +0300 Subject: [PATCH 5/8] PMM-14442 Minor compose fixes --- agent/docker-compose.yml | 2 -- api-tests/docker-compose.yml | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/agent/docker-compose.yml b/agent/docker-compose.yml index e56d3381e0a..9bd1a72c184 100644 --- a/agent/docker-compose.yml +++ b/agent/docker-compose.yml @@ -1,6 +1,4 @@ --- -version: '3.7' - services: pmm-server: image: ${PMM_SERVER_IMAGE:-perconalab/pmm-server:3-dev-latest} diff --git a/api-tests/docker-compose.yml b/api-tests/docker-compose.yml index 55af34e3a45..05286e186d9 100644 --- a/api-tests/docker-compose.yml +++ b/api-tests/docker-compose.yml @@ -82,7 +82,7 @@ services: mysql: condition: service_healthy environment: - PMM_AGENT_SERVER_ADDRESS: pmm-server + PMM_AGENT_SERVER_ADDRESS: pmm-server:8443 PMM_AGENT_SERVER_USERNAME: admin PMM_AGENT_SERVER_PASSWORD: admin PMM_AGENT_SERVER_INSECURE_TLS: 1 @@ -90,7 +90,7 @@ services: PMM_AGENT_SETUP: 1 PMM_AGENT_SETUP_FORCE: 1 PMM_AGENT_SIDECAR: 1 -# PMM_AGENT_PRERUN_SCRIPT: "pmm-admin status --wait=10s; pmm-admin add mysql --username=pmm-agent --password=pmm-agent-password --host=pmm-agent_mysql --port=3306" + # PMM_AGENT_PRERUN_SCRIPT: "pmm-admin status --wait=10s; pmm-admin add mysql --username=pmm-agent --password=pmm-agent-password --host=pmm-agent_mysql --port=3306" sysbench: image: perconalab/sysbench From a28c448fe812a2ae311744cdd35d49af13fda491 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sat, 29 Nov 2025 22:18:30 +0300 Subject: [PATCH 6/8] PMM-14442 Use modern syntax --- managed/cmd/pmm-managed/main.go | 48 +++++++++++---------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/managed/cmd/pmm-managed/main.go b/managed/cmd/pmm-managed/main.go index 714ed0d0a8a..29dce9c1b98 100644 --- a/managed/cmd/pmm-managed/main.go +++ b/managed/cmd/pmm-managed/main.go @@ -1082,40 +1082,30 @@ func main() { //nolint:maintidx,cyclop l.Info("Starting services...") var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { authServer.Run(ctx) - }() + }) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { vmalert.Run(ctx) - }() + }) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { vmdb.Run(ctx) - }() + }) haService.AddLeaderService(ha.NewContextService("checks", func(ctx context.Context) error { checksService.Run(ctx) return nil })) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { supervisord.Run(ctx) - }() + }) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { updater.Run(ctx) - }() + }) wg.Add(1) haService.AddLeaderService(ha.NewContextService("telemetry", func(ctx context.Context) error { @@ -1134,9 +1124,7 @@ func main() { //nolint:maintidx,cyclop return nil })) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { runGRPCServer(ctx, &gRPCServerDeps{ actions: actionsService, @@ -1170,22 +1158,18 @@ func main() { //nolint:maintidx,cyclop vmClient: &vmClient, vmdb: vmdb, }) - }() + }) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { runHTTP1Server(ctx, &http1ServerDeps{ logs: logs, authServer: authServer, }) - }() + }) - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { runDebugServer(ctx) - }() + }) haService.AddLeaderService(ha.NewContextService("cleaner", func(ctx context.Context) error { cleaner.Run(ctx, cleanInterval, cleanOlderThan) From c1f2e456a5af8b27ad00d9bc208986db9e3a17bb Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sun, 30 Nov 2025 12:52:16 +0300 Subject: [PATCH 7/8] PMM-14442 Use proper version of Go --- .github/workflows/clean.yml | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/.github/workflows/clean.yml b/.github/workflows/clean.yml index 0df379c2788..f6668a69baa 100644 --- a/.github/workflows/clean.yml +++ b/.github/workflows/clean.yml @@ -12,14 +12,7 @@ jobs: name: Clean caches timeout-minutes: 5 - strategy: - fail-fast: false - matrix: - go: - - version: 1.24.x - may-fail: false - - continue-on-error: ${{ matrix.go.may-fail }} + continue-on-error: false runs-on: ubuntu-22.04 env: @@ -31,15 +24,15 @@ jobs: ROBOT_TOKEN: ${{ secrets.ROBOT_TOKEN }} run: echo "machine github.com login percona-robot password ${{ secrets.ROBOT_TOKEN }}" > $HOME/.netrc - - name: Set up Go release + - name: Set up Go env: # to avoid error due to `go version` accepting -v flag with an argument since 1.15 GOFLAGS: "" uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # v6.1.0 with: - go-version: ${{ matrix.go.version }} + go-version-file: ${{ github.workspace }}/go.mod - - name: Check out code into the Go module directory + - name: Check out code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: lfs: true @@ -48,18 +41,17 @@ jobs: uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: ~/go/pkg/mod - key: ${{ matrix.go.version }}-modules-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ matrix.go.version }}-modules- + key: ${{ runner.os }}-go-modules-${{ hashFiles('**/go.sum') }} + restore-keys: ${{ runner.os }}-go-modules- - name: Enable Go build cache uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 with: path: ~/.cache/go-build - key: ${{ matrix.go.version }}-build-${{ github.ref }}-${{ hashFiles('**') }} + key: ${{ runner.os }}-go-build-${{ github.ref }}-${{ hashFiles('**') }} restore-keys: | - ${{ matrix.go.version }}-build-${{ github.ref }}- - ${{ matrix.go.version }}-build- + ${{ runner.os }}-go-build-${{ github.ref }}- + ${{ runner.os }}-go-build- - name: Clean Go modules cache run: go clean -modcache From d1f3b6fdecb51f95bdc67a65800b81a908b022b9 Mon Sep 17 00:00:00 2001 From: Alex Demidoff Date: Sun, 30 Nov 2025 13:20:58 +0300 Subject: [PATCH 8/8] PMM-14442 Update batch and update time intervals --- managed/services/victoriametrics/victoriametrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/managed/services/victoriametrics/victoriametrics.go b/managed/services/victoriametrics/victoriametrics.go index 72018be0f19..6128ad5a5d6 100644 --- a/managed/services/victoriametrics/victoriametrics.go +++ b/managed/services/victoriametrics/victoriametrics.go @@ -43,8 +43,8 @@ import ( ) const ( - updateBatchDelay = time.Second - configurationUpdateTimeout = 3 * time.Second + updateBatchDelay = 3 * time.Second + configurationUpdateTimeout = 5 * time.Second victoriametricsDir = "/srv/victoriametrics" victoriametricsDataDir = "/srv/victoriametrics/data"