Skip to content

Commit 87439fc

Browse files
committed
Add metrics for tracking live servers
This should allow us to correlate the servers that Drone thinks it knows about with those that GCP actually has.
1 parent 290741f commit 87439fc

File tree

4 files changed

+59
-8
lines changed

4 files changed

+59
-8
lines changed

cmd/drone-autoscaler/main.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ func main() {
6262
Fatalln("Invalid or missing hosting provider")
6363
}
6464

65+
collector := metrics.New()
66+
6567
// instruments the provider with prometheus metrics.
6668
provider = metrics.ServerCreate(provider)
67-
provider = metrics.ServerDelete(provider)
69+
provider = metrics.ServerDelete(provider, collector)
6870

6971
db, err := store.Connect(conf.Database.Driver, conf.Database.Datasource)
7072
if err != nil {
@@ -90,7 +92,7 @@ func main() {
9092
conf,
9193
servers,
9294
provider,
93-
metrics.New(),
95+
collector,
9496
)
9597

9698
//

engine/alloc.go

+2
Original file line numberDiff line numberDiff line change
@@ -123,5 +123,7 @@ func (a *allocator) allocate(ctx context.Context, server *autoscaler.Server) err
123123
return err
124124
}
125125

126+
a.metrics.RegisterKnownInstance(instance)
127+
126128
return nil
127129
}

metrics/metrics.go

+44
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"time"
1010

1111
"github.com/prometheus/client_golang/prometheus"
12+
13+
"github.com/drone/autoscaler"
1214
)
1315

1416
var noContext = context.Background()
@@ -40,6 +42,10 @@ type Collector interface {
4042
// IncrServerSetupError keeps a count of errors encountered
4143
// when installing software on servers.
4244
IncrServerSetupError()
45+
46+
RegisterKnownInstance(instance *autoscaler.Instance)
47+
48+
UnregisterKnownInstance(instance *autoscaler.Instance)
4349
}
4450

4551
// Prometheus is a Prometheus metrics collector.
@@ -50,6 +56,7 @@ type Prometheus struct {
5056
countServerCreateErr prometheus.Counter
5157
countServerInitErr prometheus.Counter
5258
countServerSetupErr prometheus.Counter
59+
knownInstance *prometheus.GaugeVec
5360
}
5461

5562
// New returns a new Prometheus metrics provider.
@@ -82,12 +89,23 @@ func New() *Prometheus {
8289
Name: "drone_server_install_errors_total",
8390
Help: "Total number of errors installing software on a server.",
8491
})
92+
p.knownInstance = prometheus.NewGaugeVec(prometheus.GaugeOpts{
93+
Name: "drone_server_known_instance",
94+
Help: "Known server instances.",
95+
},
96+
[]string{
97+
"name",
98+
"provider",
99+
"region",
100+
"size",
101+
})
85102
prometheus.MustRegister(p.trackServerCreateTime)
86103
prometheus.MustRegister(p.trackServerInitTime)
87104
prometheus.MustRegister(p.trackServerSetupTime)
88105
prometheus.MustRegister(p.countServerCreateErr)
89106
prometheus.MustRegister(p.countServerInitErr)
90107
prometheus.MustRegister(p.countServerSetupErr)
108+
prometheus.MustRegister(p.knownInstance)
91109
return p
92110
}
93111

@@ -135,6 +153,26 @@ func (m *Prometheus) IncrServerSetupError() {
135153
m.countServerSetupErr.Inc()
136154
}
137155

156+
// RegisterKnownInstance registers that we know about a server.
157+
func (m *Prometheus) RegisterKnownInstance(instance *autoscaler.Instance) {
158+
m.knownInstance.With(prometheus.Labels{
159+
"name": instance.Name,
160+
"provider": string(instance.Provider),
161+
"region": instance.Region,
162+
"size": instance.Size,
163+
}).Set(1)
164+
}
165+
166+
// UnregisterKnownInstance forgets a server we once knew.
167+
func (m *Prometheus) UnregisterKnownInstance(instance *autoscaler.Instance) {
168+
m.knownInstance.Delete(prometheus.Labels{
169+
"name": instance.Name,
170+
"provider": string(instance.Provider),
171+
"region": instance.Region,
172+
"size": instance.Size,
173+
})
174+
}
175+
138176
// NopCollector provides a no-op metrics collector.
139177
type NopCollector struct{}
140178

@@ -163,3 +201,9 @@ func (*NopCollector) IncrServerInitError() {}
163201
// IncrServerSetupError keeps a count of errors encountered
164202
// when installing software on servers.
165203
func (*NopCollector) IncrServerSetupError() {}
204+
205+
// RegisterKnownInstance registers that we know about a server.
206+
func (*NopCollector) RegisterKnownInstance(instance *autoscaler.Instance) {}
207+
208+
// UnregisterKnownInstance forgets a server we once knew.
209+
func (*NopCollector) UnregisterKnownInstance(instance *autoscaler.Instance) {}

metrics/server_delete.go

+9-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
)
1313

1414
// ServerDelete provides metrics for servers deleted.
15-
func ServerDelete(provider autoscaler.Provider) autoscaler.Provider {
15+
func ServerDelete(provider autoscaler.Provider, collector Collector) autoscaler.Provider {
1616
created := prometheus.NewCounter(prometheus.CounterOpts{
1717
Name: "drone_servers_deleted",
1818
Help: "Total number of servers deleted.",
@@ -24,17 +24,19 @@ func ServerDelete(provider autoscaler.Provider) autoscaler.Provider {
2424
prometheus.MustRegister(created)
2525
prometheus.MustRegister(errors)
2626
return &providerWrapDestroy{
27-
Provider: provider,
28-
created: created,
29-
errors: errors,
27+
Provider: provider,
28+
collector: collector,
29+
created: created,
30+
errors: errors,
3031
}
3132
}
3233

3334
// instruments the Provider to count server destroy events.
3435
type providerWrapDestroy struct {
3536
autoscaler.Provider
36-
created prometheus.Counter
37-
errors prometheus.Counter
37+
collector Collector
38+
created prometheus.Counter
39+
errors prometheus.Counter
3840
}
3941

4042
func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler.Instance) error {
@@ -44,5 +46,6 @@ func (p *providerWrapDestroy) Destroy(ctx context.Context, instance *autoscaler.
4446
} else {
4547
p.errors.Add(1)
4648
}
49+
p.collector.UnregisterKnownInstance(instance)
4750
return err
4851
}

0 commit comments

Comments
 (0)