Skip to content

Commit 2cfe928

Browse files
committed
prometheus metrics endpoint
1 parent cabcf84 commit 2cfe928

File tree

9 files changed

+215
-19
lines changed

9 files changed

+215
-19
lines changed

cmd/curio/rpc/rpc.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,15 @@ import (
3030
"github.com/filecoin-project/curio/api/client"
3131
"github.com/filecoin-project/curio/build"
3232
"github.com/filecoin-project/curio/deps"
33+
"github.com/filecoin-project/curio/lib/metrics"
3334
"github.com/filecoin-project/curio/lib/paths"
3435
"github.com/filecoin-project/curio/lib/repo"
3536
"github.com/filecoin-project/curio/web"
3637

3738
lapi "github.com/filecoin-project/lotus/api"
3839
cliutil "github.com/filecoin-project/lotus/cli/util"
3940
"github.com/filecoin-project/lotus/lib/rpcenc"
40-
"github.com/filecoin-project/lotus/metrics"
41+
lotusmetrics "github.com/filecoin-project/lotus/metrics"
4142
"github.com/filecoin-project/lotus/metrics/proxy"
4243
"github.com/filecoin-project/lotus/storage/pipeline/piece"
4344
"github.com/filecoin-project/lotus/storage/sealer/fsutil"
@@ -71,6 +72,7 @@ func CurioHandler(
7172
mux.Handle("/rpc/v0", rpcServer)
7273
mux.Handle("/rpc/streams/v0/push/{uuid}", readerHandler)
7374
mux.PathPrefix("/remote").HandlerFunc(remote)
75+
mux.Handle("/debug/metrics", metrics.Exporter())
7476
mux.PathPrefix("/").Handler(http.DefaultServeMux) // pprof
7577

7678
if !permissioned {
@@ -283,7 +285,7 @@ func ListenAndServe(ctx context.Context, dependencies *deps.Deps, shutdownChan c
283285
permissioned),
284286
ReadHeaderTimeout: time.Minute * 3,
285287
BaseContext: func(listener net.Listener) context.Context {
286-
ctx, _ := tag.New(context.Background(), tag.Upsert(metrics.APIInterface, "lotus-worker"))
288+
ctx, _ := tag.New(context.Background(), tag.Upsert(lotusmetrics.APIInterface, "curio"))
287289
return ctx
288290
},
289291
Addr: dependencies.ListenAddr,

cmd/curio/run.go

+1-8
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,7 @@ var runCmd = &cli.Command{
106106
ctxclose()
107107
}()
108108
}
109-
// Register all metric views
110-
/*
111-
if err := view.Register(
112-
metrics.MinerNodeViews...,
113-
); err != nil {
114-
log.Fatalf("Cannot register the view: %v", err)
115-
}
116-
*/
109+
117110
// Set the metric to one so it is published to the exporter
118111
stats.Record(ctx, metrics.LotusInfo.M(1))
119112

go.mod

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module github.com/filecoin-project/curio
33
go 1.22.3
44

55
require (
6+
contrib.go.opencensus.io/exporter/prometheus v0.4.2
67
github.com/BurntSushi/toml v1.3.2
78
github.com/KarpelesLab/reflink v1.0.1
89
github.com/alecthomas/jsonschema v0.0.0-20200530073317-71f438968921
@@ -56,9 +57,11 @@ require (
5657
github.com/multiformats/go-multiaddr v0.12.4
5758
github.com/open-rpc/meta-schema v0.0.0-20201029221707-1b72ef2ea333
5859
github.com/pkg/errors v0.9.1
60+
github.com/prometheus/client_golang v1.19.1
5961
github.com/puzpuzpuz/xsync/v2 v2.4.0
6062
github.com/raulk/clock v1.1.0
6163
github.com/samber/lo v1.39.0
64+
github.com/sirupsen/logrus v1.9.2
6265
github.com/snadrus/must v0.0.0-20240605044437-98cedd57f8eb
6366
github.com/stretchr/testify v1.9.0
6467
github.com/urfave/cli/v2 v2.25.5
@@ -77,7 +80,6 @@ require (
7780
)
7881

7982
require (
80-
contrib.go.opencensus.io/exporter/prometheus v0.4.2 // indirect
8183
github.com/GeertJohan/go.incremental v1.0.0 // indirect
8284
github.com/GeertJohan/go.rice v1.0.3 // indirect
8385
github.com/Gurpartap/async v0.0.0-20180927173644-4f7f499dd9ee // indirect
@@ -267,7 +269,6 @@ require (
267269
github.com/pion/webrtc/v3 v3.2.40 // indirect
268270
github.com/pmezard/go-difflib v1.0.0 // indirect
269271
github.com/polydawn/refmt v0.89.0 // indirect
270-
github.com/prometheus/client_golang v1.19.1 // indirect
271272
github.com/prometheus/client_model v0.6.1 // indirect
272273
github.com/prometheus/common v0.55.0 // indirect
273274
github.com/prometheus/procfs v0.15.1 // indirect
@@ -279,7 +280,6 @@ require (
279280
github.com/rivo/uniseg v0.4.7 // indirect
280281
github.com/russross/blackfriday/v2 v2.1.0 // indirect
281282
github.com/shirou/gopsutil v2.18.12+incompatible // indirect
282-
github.com/sirupsen/logrus v1.9.2 // indirect
283283
github.com/spaolacci/murmur3 v1.1.0 // indirect
284284
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 // indirect
285285
github.com/triplewz/poseidon v0.0.0-20230828015038-79d8165c88ed // indirect

harmony/harmonytask/harmonytask.go

+29-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ import (
77
"sync/atomic"
88
"time"
99

10+
"go.opencensus.io/stats"
11+
"go.opencensus.io/tag"
12+
1013
"github.com/filecoin-project/curio/harmony/harmonydb"
1114
"github.com/filecoin-project/curio/harmony/resources"
1215
)
@@ -256,6 +259,8 @@ func (e *TaskEngine) GracefullyTerminate() {
256259
func (e *TaskEngine) poller() {
257260
nextWait := POLL_NEXT_DURATION
258261
for {
262+
stats.Record(context.Background(), TaskMeasures.PollerIterations.M(1))
263+
259264
select {
260265
case <-time.After(nextWait): // Find work periodically
261266
case <-e.ctx.Done(): ///////////////////// Graceful exit
@@ -270,6 +275,22 @@ func (e *TaskEngine) poller() {
270275
if time.Since(e.lastFollowTime) > FOLLOW_FREQUENCY {
271276
e.followWorkInDB()
272277
}
278+
279+
// update resource usage
280+
availableResources := e.ResourcesAvailable()
281+
totalResources := e.Resources()
282+
283+
cpuUsage := 1 - float64(availableResources.Cpu)/float64(totalResources.Cpu)
284+
stats.Record(context.Background(), TaskMeasures.CpuUsage.M(cpuUsage*100))
285+
286+
if totalResources.Gpu > 0 {
287+
gpuUsage := 1 - availableResources.Gpu/totalResources.Gpu
288+
stats.Record(context.Background(), TaskMeasures.GpuUsage.M(gpuUsage*100))
289+
}
290+
291+
ramUsage := 1 - float64(availableResources.Ram)/float64(totalResources.Ram)
292+
stats.Record(context.Background(), TaskMeasures.RamUsage.M(ramUsage*100))
293+
273294
}
274295
}
275296

@@ -401,6 +422,13 @@ func (e *TaskEngine) Resources() resources.Resources {
401422
var Registry = map[string]TaskInterface{}
402423

403424
func Reg(t TaskInterface) bool {
404-
Registry[t.TypeDetails().Name] = t
425+
name := t.TypeDetails().Name
426+
Registry[name] = t
427+
428+
// reset metrics
429+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
430+
tag.Upsert(taskNameTag, name),
431+
}, TaskMeasures.ActiveTasks.M(0))
432+
405433
return true
406434
}

harmony/harmonytask/metrics.go

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package harmonytask
2+
3+
import (
4+
promclient "github.com/prometheus/client_golang/prometheus"
5+
"go.opencensus.io/stats"
6+
"go.opencensus.io/stats/view"
7+
"go.opencensus.io/tag"
8+
)
9+
10+
var (
11+
taskNameTag, _ = tag.NewKey("task_name")
12+
sourceTag, _ = tag.NewKey("source")
13+
pre = "harmonytask_"
14+
15+
// tasks can be short, but can extend to hours
16+
durationBuckets = []float64{0.5, 1, 3, 6, 10, 20, 30, 60, 120, 300, 600, 1800, 3600, 7200, 18000, 36000}
17+
)
18+
19+
// TaskMeasures groups all harmonytask metrics.
20+
var TaskMeasures = struct {
21+
TasksStarted *stats.Int64Measure
22+
TasksCompleted *stats.Int64Measure
23+
TasksFailed *stats.Int64Measure
24+
TaskDuration promclient.Histogram
25+
ActiveTasks *stats.Int64Measure
26+
CpuUsage *stats.Float64Measure
27+
GpuUsage *stats.Float64Measure
28+
RamUsage *stats.Float64Measure
29+
PollerIterations *stats.Int64Measure
30+
AddedTasks *stats.Int64Measure
31+
}{
32+
TasksStarted: stats.Int64(pre+"tasks_started", "Total number of tasks started.", stats.UnitDimensionless),
33+
TasksCompleted: stats.Int64(pre+"tasks_completed", "Total number of tasks completed successfully.", stats.UnitDimensionless),
34+
TasksFailed: stats.Int64(pre+"tasks_failed", "Total number of tasks that failed.", stats.UnitDimensionless),
35+
TaskDuration: promclient.NewHistogram(promclient.HistogramOpts{
36+
Name: pre + "task_duration_seconds",
37+
Buckets: durationBuckets,
38+
Help: "The histogram of task durations in seconds.",
39+
}),
40+
ActiveTasks: stats.Int64(pre+"active_tasks", "Current number of active tasks.", stats.UnitDimensionless),
41+
CpuUsage: stats.Float64(pre+"cpu_usage", "Percentage of CPU in use.", stats.UnitDimensionless),
42+
GpuUsage: stats.Float64(pre+"gpu_usage", "Percentage of GPU in use.", stats.UnitDimensionless),
43+
RamUsage: stats.Float64(pre+"ram_usage", "Percentage of RAM in use.", stats.UnitDimensionless),
44+
PollerIterations: stats.Int64(pre+"poller_iterations", "Total number of poller iterations.", stats.UnitDimensionless),
45+
AddedTasks: stats.Int64(pre+"added_tasks", "Total number of tasks added.", stats.UnitDimensionless),
46+
}
47+
48+
// TaskViews groups all harmonytask-related default views.
49+
func init() {
50+
err := view.Register(
51+
&view.View{
52+
Measure: TaskMeasures.TasksStarted,
53+
Aggregation: view.Sum(),
54+
TagKeys: []tag.Key{taskNameTag, sourceTag},
55+
},
56+
&view.View{
57+
Measure: TaskMeasures.TasksCompleted,
58+
Aggregation: view.Sum(),
59+
TagKeys: []tag.Key{taskNameTag},
60+
},
61+
&view.View{
62+
Measure: TaskMeasures.TasksFailed,
63+
Aggregation: view.Sum(),
64+
TagKeys: []tag.Key{taskNameTag},
65+
},
66+
&view.View{
67+
Measure: TaskMeasures.ActiveTasks,
68+
Aggregation: view.LastValue(),
69+
TagKeys: []tag.Key{taskNameTag},
70+
},
71+
&view.View{
72+
Measure: TaskMeasures.CpuUsage,
73+
Aggregation: view.LastValue(),
74+
TagKeys: []tag.Key{},
75+
},
76+
&view.View{
77+
Measure: TaskMeasures.GpuUsage,
78+
Aggregation: view.LastValue(),
79+
TagKeys: []tag.Key{},
80+
},
81+
&view.View{
82+
Measure: TaskMeasures.RamUsage,
83+
Aggregation: view.LastValue(),
84+
TagKeys: []tag.Key{},
85+
},
86+
&view.View{
87+
Measure: TaskMeasures.PollerIterations,
88+
Aggregation: view.Sum(),
89+
TagKeys: []tag.Key{},
90+
},
91+
&view.View{
92+
Measure: TaskMeasures.AddedTasks,
93+
Aggregation: view.Sum(),
94+
TagKeys: []tag.Key{taskNameTag},
95+
},
96+
)
97+
if err != nil {
98+
panic(err)
99+
}
100+
101+
err = promclient.Register(TaskMeasures.TaskDuration)
102+
if err != nil {
103+
panic(err)
104+
}
105+
}

harmony/harmonytask/task_type_handler.go

+40
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
"time"
1111

1212
logging "github.com/ipfs/go-log/v2"
13+
"go.opencensus.io/stats"
14+
"go.opencensus.io/tag"
1315

1416
"github.com/filecoin-project/curio/harmony/harmonydb"
1517
)
@@ -50,6 +52,13 @@ retryAddTask:
5052
log.Errorw("Could not add task. AddTasFunc failed", "error", err, "type", h.Name)
5153
return
5254
}
55+
56+
err = stats.RecordWithTags(context.Background(), []tag.Mutator{
57+
tag.Upsert(taskNameTag, h.Name),
58+
}, TaskMeasures.AddedTasks.M(1))
59+
if err != nil {
60+
log.Errorw("Could not record added task", "error", err)
61+
}
5362
}
5463

5564
const (
@@ -154,7 +163,16 @@ canAcceptAgain:
154163
}
155164
}
156165

166+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
167+
tag.Upsert(taskNameTag, h.Name),
168+
tag.Upsert(sourceTag, from),
169+
}, TaskMeasures.TasksStarted.M(1))
170+
157171
h.Count.Add(1)
172+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
173+
tag.Upsert(taskNameTag, h.Name),
174+
}, TaskMeasures.ActiveTasks.M(int64(h.Count.Load())))
175+
158176
go func() {
159177
log.Infow("Beginning work on Task", "id", *tID, "from", from, "name", h.Name)
160178

@@ -204,6 +222,28 @@ canAcceptAgain:
204222
func (h *taskTypeHandler) recordCompletion(tID TaskID, workStart time.Time, done bool, doErr error) {
205223
workEnd := time.Now()
206224
retryWait := time.Millisecond * 100
225+
226+
{
227+
// metrics
228+
229+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
230+
tag.Upsert(taskNameTag, h.Name),
231+
}, TaskMeasures.ActiveTasks.M(int64(h.Count.Load())))
232+
233+
duration := workEnd.Sub(workStart).Seconds()
234+
TaskMeasures.TaskDuration.Observe(duration)
235+
236+
if done {
237+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
238+
tag.Upsert(taskNameTag, h.Name),
239+
}, TaskMeasures.TasksCompleted.M(1))
240+
} else {
241+
_ = stats.RecordWithTags(context.Background(), []tag.Mutator{
242+
tag.Upsert(taskNameTag, h.Name),
243+
}, TaskMeasures.TasksFailed.M(1))
244+
}
245+
}
246+
207247
retryRecordCompletion:
208248
cm, err := h.TaskEngine.db.BeginTransaction(h.TaskEngine.ctx, func(tx *harmonydb.Tx) (bool, error) {
209249
var postedTime time.Time

lib/metrics/exporter.go

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package metrics
2+
3+
import (
4+
"net/http"
5+
6+
"contrib.go.opencensus.io/exporter/prometheus"
7+
promclient "github.com/prometheus/client_golang/prometheus"
8+
log "github.com/sirupsen/logrus"
9+
)
10+
11+
func Exporter() http.Handler {
12+
// Prometheus globals are exposed as interfaces, but the prometheus
13+
// OpenCensus exporter expects a concrete *Registry. The concrete type of
14+
// the globals are actually *Registry, so we downcast them, staying
15+
// defensive in case things change under the hood.
16+
registry, ok := promclient.DefaultRegisterer.(*promclient.Registry)
17+
if !ok {
18+
log.Warnf("failed to export default prometheus registry; some metrics will be unavailable; unexpected type: %T", promclient.DefaultRegisterer)
19+
}
20+
exporter, err := prometheus.NewExporter(prometheus.Options{
21+
Registry: registry,
22+
Namespace: "curio",
23+
})
24+
if err != nil {
25+
log.Errorf("could not create the prometheus stats exporter: %v", err)
26+
}
27+
28+
return exporter
29+
}

market/lmrpc/minerhandler.go

-4
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111

1212
"github.com/filecoin-project/lotus/api"
1313
"github.com/filecoin-project/lotus/lib/rpcenc"
14-
"github.com/filecoin-project/lotus/metrics"
1514
"github.com/filecoin-project/lotus/metrics/proxy"
1615
"github.com/filecoin-project/lotus/node/impl"
1716
)
@@ -51,9 +50,6 @@ func MinerHandler(a api.StorageMiner, permissioned bool) (http.Handler, error) {
5150
m := mux.NewRouter()
5251
m.Handle("/rpc/v0", rpcServer)
5352
m.Handle("/rpc/streams/v0/push/{uuid}", readerHandler)
54-
// debugging
55-
m.Handle("/debug/metrics", metrics.Exporter())
56-
m.PathPrefix("/").Handler(http.DefaultServeMux) // pprof
5753

5854
var hnd http.Handler = m
5955
if permissioned {

web/static/pages/node_info/node-info.mjs

+4-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ customElements.define('node-info',class NodeInfoElement extends LitElement {
3838
<td>${this.data.Info.CPU}</td>
3939
<td>${this.toHumanBytes(this.data.Info.Memory)}</td>
4040
<td>${this.data.Info.GPU}</td>
41-
<td><a href="http://${this.data.Info.Host}/debug/pprof">[pprof]</a></td>
41+
<td>
42+
<a href="http://${this.data.Info.Host}/debug/pprof">[pprof]</a>
43+
<a href="http://${this.data.Info.Host}/debug/metrics">[metrics]</a>
44+
</td>
4245
</tr>
4346
</table>
4447
<hr>

0 commit comments

Comments
 (0)