Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: add EngineStats endpoint in multitenant setup #143633

Merged
merged 1 commit into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions pkg/ccl/changefeedccl/mocks/tenant_status_server_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions pkg/cli/testdata/zip/testzip_external_process_virtualization
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] requesting stacks... received response... writing binary output: debug/nodes/1/stacks.txt... done
[node 1] requesting stacks with labels... received response... writing binary output: debug/nodes/1/stacks_with_labels.txt... done
[node 1] requesting heap profile... received response... writing binary output: debug/nodes/1/heap.pprof... done
[node 1] requesting engine stats... received response...
[node 1] requesting engine stats: last request failed: rpc error: ...
[node 1] requesting engine stats: creating error output: debug/nodes/1/lsm.txt.err.txt... done
[node 1] requesting engine stats... received response... writing binary output: debug/nodes/1/lsm.txt... done
[node 1] requesting heap profile list... received response... done
[node ?] ? heap profiles found
[node 1] requesting goroutine dump list... received response... done
Expand Down
4 changes: 1 addition & 3 deletions pkg/cli/testdata/zip/testzip_shared_process_virtualization
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] requesting stacks... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks.txt... done
[node 1] requesting stacks with labels... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks_with_labels.txt... done
[node 1] requesting heap profile... received response... writing binary output: debug/cluster/test-tenant/nodes/1/heap.pprof... done
[node 1] requesting engine stats... received response...
[node 1] requesting engine stats: last request failed: rpc error: ...
[node 1] requesting engine stats: creating error output: debug/cluster/test-tenant/nodes/1/lsm.txt.err.txt... done
[node 1] requesting engine stats... received response... writing binary output: debug/cluster/test-tenant/nodes/1/lsm.txt... done
[node 1] requesting heap profile list... received response...
[node 1] requesting heap profile list: last request failed: rpc error: ...
[node 1] requesting heap profile list: creating error output: debug/cluster/test-tenant/nodes/1/heapprof.err.txt... done
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] requesting stacks... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks.txt... done
[node 1] requesting stacks with labels... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks_with_labels.txt... done
[node 1] requesting heap profile... received response... writing binary output: debug/cluster/test-tenant/nodes/1/heap.pprof... done
[node 1] requesting engine stats... received response...
[node 1] requesting engine stats: last request failed: rpc error: ...
[node 1] requesting engine stats: creating error output: debug/cluster/test-tenant/nodes/1/lsm.txt.err.txt... done
[node 1] requesting engine stats... received response... writing binary output: debug/cluster/test-tenant/nodes/1/lsm.txt... done
[node 1] requesting heap profile list... received response...
[node 1] requesting heap profile list: last request failed: rpc error: ...
[node 1] requesting heap profile list: creating error output: debug/cluster/test-tenant/nodes/1/heapprof.err.txt... done
Expand Down
10 changes: 10 additions & 0 deletions pkg/kv/kvclient/kvtenant/connector.go
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,16 @@ func (c *connector) Gossip(
return
}

// EngineStats issues the EngineStats RPC on a client obtained through
// c.withClient and returns that RPC's response. Any error reported by
// c.withClient is returned as retErr.
func (c *connector) EngineStats(
	ctx context.Context, req *serverpb.EngineStatsRequest,
) (resp *serverpb.EngineStatsResponse, retErr error) {
	// The closure writes into the named result so the response is available
	// once withClient returns.
	callEngineStats := func(ctx context.Context, cl *client) error {
		var err error
		resp, err = cl.EngineStats(ctx, req)
		return err
	}
	retErr = c.withClient(ctx, callEngineStats)
	return resp, retErr
}

// NewIterator implements the rangedesc.IteratorFactory interface.
func (c *connector) NewIterator(
ctx context.Context, span roachpb.Span,
Expand Down
2 changes: 1 addition & 1 deletion pkg/rpc/auth_tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func (a tenantAuthorizer) authorize(
case "/cockroach.server.serverpb.Status/NetworkConnectivity":
return a.capabilitiesAuthorizer.HasProcessDebugCapability(ctx, tenID)

case "/cockroach.server.serverpb.Status/Gossip":
case "/cockroach.server.serverpb.Status/Gossip", "/cockroach.server.serverpb.Status/EngineStats":
return a.capabilitiesAuthorizer.HasNodeStatusCapability(ctx, tenID)

case "/cockroach.server.serverpb.Status/TransactionContentionEvents":
Expand Down
1 change: 1 addition & 0 deletions pkg/server/serverpb/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ type TenantStatusServer interface {
DownloadSpan(ctx context.Context, request *DownloadSpanRequest) (*DownloadSpanResponse, error)
NetworkConnectivity(context.Context, *NetworkConnectivityRequest) (*NetworkConnectivityResponse, error)
Gossip(context.Context, *GossipRequest) (*gossip.InfoStatus, error)
EngineStats(context.Context, *EngineStatsRequest) (*EngineStatsResponse, error)
}

// OptionalNodesStatusServer returns the wrapped NodesStatusServer, if it is
Expand Down
16 changes: 16 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,22 @@ func (s *statusServer) redactGossipResponse(resp *gossip.InfoStatus) *gossip.Inf
return resp
}

// EngineStats returns statistical information about the storage layer on the
// given node, which is crucial for diagnosing issues related to disk usage,
// compaction efficiency, read/write amplification and other storage engine
// metrics critical for database performance.
//
// This implementation does not read engine state directly: once the caller
// passes the view-cluster-metadata permission check, the request is forwarded
// unchanged via s.sqlServer.tenantConnect. A failed permission check is
// returned as-is with a nil response.
func (s *statusServer) EngineStats(
	ctx context.Context, req *serverpb.EngineStatsRequest,
) (*serverpb.EngineStatsResponse, error) {
	ctx = s.AnnotateCtx(ctx)

	if err := s.privilegeChecker.RequireViewClusterMetadataPermission(ctx); err != nil {
		return nil, err
	}

	return s.sqlServer.tenantConnect.EngineStats(ctx, req)
}

func (s *systemStatusServer) EngineStats(
ctx context.Context, req *serverpb.EngineStatsRequest,
) (*serverpb.EngineStatsResponse, error) {
Expand Down
46 changes: 43 additions & 3 deletions pkg/server/storage_api/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ package storage_api_test
import (
"context"
"regexp"
"strings"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilitiespb"
"github.com/cockroachdb/cockroach/pkg/server/debug"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/srvtestutils"
Expand All @@ -19,25 +21,34 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
)

// TestStatusEngineStatsJson ensures that the output response for the engine
// stats contains the required fields.
func TestStatusEngineStatsJson(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

dir, cleanupFn := testutils.TempDir(t)
defer cleanupFn()

srv := serverutils.StartServerOnly(t, base.TestServerArgs{
DefaultTestTenant: base.TestIsForStuffThatShouldWorkWithSecondaryTenantsButDoesntYet(110020),

StoreSpecs: []base.StoreSpec{{
Path: dir,
}},
})
defer srv.Stopper().Stop(context.Background())
defer srv.Stopper().Stop(ctx)

if srv.DeploymentMode().IsExternal() {
// Explicitly enabling CanViewNodeInfo capability for the secondary/application tenant
// when in external process mode, as shared process mode already has all capabilities.
require.NoError(t, srv.GrantTenantCapabilities(
ctx, serverutils.TestTenantID(),
map[tenantcapabilitiespb.ID]string{tenantcapabilitiespb.CanViewNodeInfo: "true"}))
}

s := srv.ApplicationLayer()

t.Logf("using admin URL %s", s.AdminURL())
Expand All @@ -57,3 +68,32 @@ func TestStatusEngineStatsJson(t *testing.T) {
t.Fatal(errors.Errorf("expected engine metrics to be correctly formatted, got:\n %s", formattedStats))
}
}

// TestStatusEngineStatsJsonWithoutTenantCapability checks that requesting
// enginestats/local from an external-process test tenant that was not granted
// any extra capability fails with the "does not have capability" error.
func TestStatusEngineStatsJsonWithoutTenantCapability(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	// Message the endpoint is expected to report when the capability is missing.
	const wantErrMsg = "client tenant does not have capability to query cluster node metadata"

	// Note: only external-process mode is exercised because shared service
	// mode tenants have all capabilities.
	testArgs := base.TestServerArgs{
		DefaultTestTenant: base.ExternalTestTenantAlwaysEnabled,
	}
	srv := serverutils.StartServerOnly(t, testArgs)
	defer srv.Stopper().Stop(context.Background())

	appLayer := srv.ApplicationLayer()

	var statsResp serverpb.EngineStatsResponse
	// SucceedsSoon is used because, in the wild, requests occasionally fail
	// with "transport: error while dialing: connection interrupted (did the
	// remote node shut down or are there networking issues?)"; such errors
	// are retried until the expected message shows up.
	testutils.SucceedsSoon(t, func() error {
		err := srvtestutils.GetStatusJSONProto(appLayer, "enginestats/local", &statsResp)
		// A successful request is a hard failure: the call must be rejected.
		require.Error(t, err)
		if strings.Contains(err.Error(), wantErrMsg) {
			return nil
		}
		return errors.Wrap(err, "unexpected error message")
	})

}
Loading