Skip to content

Commit f796433

Browse files
Allow configuring a default and override per-collector timeout
This handles the case where a collector (in particular the user stat collectors) gets blocked behind an exclusive lock, which prevents the scrape of the other metrics from completing before Prometheus gives up. When that happens all metrics are lost, and it is impossible to see that an exclusive lock has been held for too long. Signed-off-by: Joe Wreschnig <[email protected]>
1 parent 198454c commit f796433

File tree

7 files changed

+62
-4
lines changed

7 files changed

+62
-4
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,16 @@ auth_modules:
9090
sslmode: disable
9191
```
9292

93+
### timeouts
94+
This section configures a default timeout and optional per-collector timeouts, so that the remaining metrics still get scraped even if one query runs or blocks for longer than your configured Prometheus scrape timeout. A collector without its own entry under `collectors` uses the `default` timeout. If a timeout is unset or set to 0 (or `0s`, `0m`, etc.), that timeout is disabled.
95+
96+
```yaml
97+
timeouts:
98+
default: 2s
99+
collectors:
100+
stat_user_tables: 5s
101+
```
102+
93103
## Building and running
94104

95105
git clone https://github.com/prometheus-community/postgres_exporter.git

cmd/postgres_exporter/probe.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ func handleProbe(logger *slog.Logger, excludeDatabases []string) http.HandlerFun
8282
registry.MustRegister(exporter)
8383

8484
// Run the probe
85-
pc, err := collector.NewProbeCollector(tl, excludeDatabases, registry, dsn)
85+
pc, err := collector.NewProbeCollector(tl, excludeDatabases, registry, dsn, conf.Timeouts)
8686
if err != nil {
8787
logger.Error("Error creating probe collector", "err", err)
8888
http.Error(w, err.Error(), http.StatusInternalServerError)

collector/probe.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ type ProbeCollector struct {
2727
collectors map[string]Collector
2828
logger *slog.Logger
2929
instance *instance
30+
timeouts config.Timeouts
3031
}
3132

32-
func NewProbeCollector(logger *slog.Logger, excludeDatabases []string, registry *prometheus.Registry, dsn config.DSN) (*ProbeCollector, error) {
33+
func NewProbeCollector(logger *slog.Logger, excludeDatabases []string, registry *prometheus.Registry, dsn config.DSN, timeouts config.Timeouts) (*ProbeCollector, error) {
3334
collectors := make(map[string]Collector)
3435
initiatedCollectorsMtx.Lock()
3536
defer initiatedCollectorsMtx.Unlock()
@@ -57,6 +58,12 @@ func NewProbeCollector(logger *slog.Logger, excludeDatabases []string, registry
5758
}
5859
}
5960

61+
for name := range timeouts.Collectors {
62+
if _, ok := collectors[name]; !ok {
63+
logger.Warn("timeout set for non-enabled collector", "collector", name)
64+
}
65+
}
66+
6067
instance, err := newInstance(dsn.GetConnectionString())
6168
if err != nil {
6269
return nil, err
@@ -67,6 +74,7 @@ func NewProbeCollector(logger *slog.Logger, excludeDatabases []string, registry
6774
collectors: collectors,
6875
logger: logger,
6976
instance: instance,
77+
timeouts: timeouts,
7078
}, nil
7179
}
7280

@@ -86,8 +94,10 @@ func (pc *ProbeCollector) Collect(ch chan<- prometheus.Metric) {
8694
wg.Add(len(pc.collectors))
8795
for name, c := range pc.collectors {
8896
go func(name string, c Collector) {
89-
execute(context.TODO(), name, c, pc.instance, ch, pc.logger)
90-
wg.Done()
97+
ctx, cancel := pc.timeouts.Context(context.TODO(), name)
98+
defer cancel()
99+
defer wg.Done()
100+
execute(ctx, name, c, pc.instance, ch, pc.logger)
91101
}(name, c)
92102
}
93103
wg.Wait()

config/config.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
package config
1515

1616
import (
17+
"context"
1718
"fmt"
1819
"log/slog"
1920
"os"
2021
"sync"
22+
"time"
2123

2224
"github.com/prometheus/client_golang/prometheus"
2325
"github.com/prometheus/client_golang/prometheus/promauto"
@@ -40,6 +42,8 @@ var (
4042

4143
type Config struct {
4244
AuthModules map[string]AuthModule `yaml:"auth_modules"`
45+
46+
Timeouts Timeouts `yaml:"timeouts"`
4347
}
4448

4549
type AuthModule struct {
@@ -54,6 +58,11 @@ type UserPass struct {
5458
Password string `yaml:"password"`
5559
}
5660

61+
type Timeouts struct {
62+
Default time.Duration `yaml:"default"`
63+
Collectors map[string]time.Duration `yaml:"collectors"`
64+
}
65+
5766
type Handler struct {
5867
sync.RWMutex
5968
Config *Config
@@ -118,3 +127,14 @@ func (m AuthModule) ConfigureTarget(target string) (DSN, error) {
118127

119128
return dsn, nil
120129
}
130+
131+
func (t Timeouts) Context(parent context.Context, collector string) (context.Context, context.CancelFunc) {
132+
timeout, ok := t.Collectors[collector]
133+
if !ok {
134+
timeout = t.Default
135+
}
136+
if timeout == 0 {
137+
return context.WithCancel(parent)
138+
}
139+
return context.WithTimeout(parent, timeout)
140+
}

config/config_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ func TestLoadBadConfigs(t *testing.T) {
4545
input: "testdata/config-bad-extra-field.yaml",
4646
want: "error parsing config file \"testdata/config-bad-extra-field.yaml\": yaml: unmarshal errors:\n line 8: field doesNotExist not found in type config.AuthModule",
4747
},
48+
{
49+
input: "testdata/config-bad-timeout-duration.yaml",
50+
want: "error parsing config file \"testdata/config-bad-timeout-duration.yaml\": yaml: unmarshal errors:\n line 10: cannot unmarshal !!str `not a time` into time.Duration",
51+
},
4852
}
4953

5054
for _, test := range tests {
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
auth_modules:
2+
first:
3+
type: userpass
4+
userpass:
5+
username: first
6+
password: firstpass
7+
options:
8+
sslmode: disable
9+
timeouts:
10+
default: not a time

config/testdata/config-good.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ auth_modules:
66
password: firstpass
77
options:
88
sslmode: disable
9+
timeouts:
10+
default: 5s
11+
collectors:
12+
example: 10s

0 commit comments

Comments
 (0)