Capture EC2 serial console output of nodes
This will help with situations where the node doesn't join the cluster
and we have no clue why.

Normally, if the problem is in nodeadm init or install, it gets
reflected in the cloud-init output log, which is captured by the log
collector and uploaded to S3. However, there are times when the log
collector fails or we can't even SSH into the node to run it. In those
cases, we are left with no information about what happened.

By capturing the serial output, not only will we have some information
in those otherwise undebuggable scenarios, but we should also be able
to follow the node init process in real time in the test output, along
with any other useful information printed during boot. In addition to
stdout, the serial console logs are written to a file on disk. The CI
pipeline persists this file with the other pipeline artifacts in S3.
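
As a rough sketch of the mechanism (not the actual test code in this
commit; the function and package names below are hypothetical), the
capture can be done with the ec2instanceconnect client and
golang.org/x/crypto/ssh, both added as dependencies here: push an
ephemeral public key with SendSerialConsoleSSHPublicKey, then open an
SSH session against the regional serial console endpoint and stream
its output to stdout and a file.

package serialconsole // hypothetical helper package, not part of this commit

import (
	"context"
	"crypto/ed25519"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/ec2instanceconnect"
	"golang.org/x/crypto/ssh"
)

// StreamSerialConsole copies an instance's serial console output to w,
// e.g. an io.MultiWriter over stdout and a file in the artifacts folder.
func StreamSerialConsole(ctx context.Context, region, instanceID string, w io.Writer) error {
	cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(region))
	if err != nil {
		return err
	}

	// The serial console only accepts keys pushed through EC2 Instance
	// Connect, so generate a throwaway key pair for this session.
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		return err
	}
	signer, err := ssh.NewSignerFromKey(priv)
	if err != nil {
		return err
	}
	sshPub, err := ssh.NewPublicKey(pub)
	if err != nil {
		return err
	}

	client := ec2instanceconnect.NewFromConfig(cfg)
	// SerialPort is omitted and defaults to 0 (ttyS0/COM1).
	if _, err := client.SendSerialConsoleSSHPublicKey(ctx, &ec2instanceconnect.SendSerialConsoleSSHPublicKeyInput{
		InstanceId:   aws.String(instanceID),
		SSHPublicKey: aws.String(string(ssh.MarshalAuthorizedKey(sshPub))),
	}); err != nil {
		return err
	}

	// The SSH user for the serial console is "<instance-id>.port<serial-port>".
	conn, err := ssh.Dial("tcp", fmt.Sprintf("serial-console.ec2-instance-connect.%s.aws:22", region), &ssh.ClientConfig{
		User:            instanceID + ".port0",
		Auth:            []ssh.AuthMethod{ssh.PublicKeys(signer)},
		HostKeyCallback: ssh.InsecureIgnoreHostKey(), // acceptable for CI log capture
	})
	if err != nil {
		return err
	}
	defer conn.Close()

	session, err := conn.NewSession()
	if err != nil {
		return err
	}
	defer session.Close()

	session.Stdout = w
	session.Stderr = w
	if err := session.RequestPty("xterm", 80, 200, ssh.TerminalModes{}); err != nil {
		return err
	}
	if err := session.Shell(); err != nil {
		return err
	}
	return session.Wait() // streams until the connection is torn down
}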

Co-authored-by: Jackson West <[email protected]>
Co-authored-by: Guillermo Gaston <[email protected]>
jaxesn and g-gaston committed Feb 6, 2025
1 parent 735f9be commit a9f6474
Showing 12 changed files with 584 additions and 62 deletions.
6 changes: 5 additions & 1 deletion buildspecs/test-nodeadm.yml
@@ -9,10 +9,14 @@ phases:
build:
commands:
- SANITIZED_CODEBUILD_BUILD_ID=$(echo $CODEBUILD_BUILD_ID | tr ':' '-')
- ./hack/run-e2e.sh $SANITIZED_CODEBUILD_BUILD_ID $AWS_REGION $KUBERNETES_VERSION $CNI s3://$ARTIFACTS_BUCKET/latest-pre/linux/amd64/nodeadm s3://$ARTIFACTS_BUCKET/latest-pre/linux/arm64/nodeadm $LOGS_BUCKET $ENDPOINT
- ./hack/run-e2e.sh $SANITIZED_CODEBUILD_BUILD_ID $AWS_REGION $KUBERNETES_VERSION $CNI s3://$ARTIFACTS_BUCKET/latest-pre/linux/amd64/nodeadm s3://$ARTIFACTS_BUCKET/latest-pre/linux/arm64/nodeadm $LOGS_BUCKET e2e-artifacts $ENDPOINT

reports:
e2e-reports:
files:
- e2e-reports/junit-nodeadm.xml
file-format: "JUNITXML"

artifacts:
files:
- "e2e-artifacts/*"
6 changes: 4 additions & 2 deletions go.mod
@@ -8,6 +8,7 @@ require (
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.27
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.10
github.com/aws/aws-sdk-go-v2/service/ec2 v1.202.2
github.com/aws/aws-sdk-go-v2/service/ec2instanceconnect v1.27.12
github.com/aws/aws-sdk-go-v2/service/ecr v1.40.1
github.com/aws/aws-sdk-go-v2/service/eks v1.57.2
github.com/aws/aws-sdk-go-v2/service/iam v1.38.10
@@ -27,6 +28,7 @@ require (
github.com/stretchr/testify v1.10.0
github.com/tredoe/osutil v1.5.0
go.uber.org/zap v1.27.0
golang.org/x/crypto v0.31.0
golang.org/x/mod v0.22.0
k8s.io/apimachinery v0.32.1
k8s.io/client-go v0.32.1
@@ -119,8 +121,8 @@ require (
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/sys v0.29.0 // indirect
golang.org/x/term v0.28.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/tools v0.28.0 // indirect
12 changes: 8 additions & 4 deletions go.sum
@@ -32,6 +32,8 @@ github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.10 h1:5asGyAEAXc1A5FWC
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.10/go.mod h1:synt0SxqCGxfepb1DUAMK7TxtRvucnmk80qXbp9Cfcs=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.202.2 h1:qas57zkkMX8OM+MVz+4sMaOaD9HRmeFJRb8nzMdYkx0=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.202.2/go.mod h1:2omfxRebtpbbFqQGqeurDzlyB7Txa2e1xe9rCDFqlwA=
github.com/aws/aws-sdk-go-v2/service/ec2instanceconnect v1.27.12 h1:UK36TdbN1GB8FwZYTT0ZPlsg7ZqVpMHMcee0b6BYpSY=
github.com/aws/aws-sdk-go-v2/service/ec2instanceconnect v1.27.12/go.mod h1:qvHbeq93l0rsTolEGpOZ12yHcVQ/yfHiqVEmh8AVw1U=
github.com/aws/aws-sdk-go-v2/service/ecr v1.40.1 h1:1C4JhM1VGzObZBPY0t+ymBRvF9BuCFToJ/j8RaW/9p8=
github.com/aws/aws-sdk-go-v2/service/ecr v1.40.1/go.mod h1:XGy5jWOLlZK6nN3NWkoux6vN0QfaEIB91xbPCiX8gao=
github.com/aws/aws-sdk-go-v2/service/eks v1.57.2 h1:Uxm6iUIEaRtyvcp8Gj45viJmM2KksMLNBRCd8DBxuJA=
@@ -237,6 +239,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4=
@@ -258,10 +262,10 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU=
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg=
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
12 changes: 8 additions & 4 deletions hack/run-e2e.sh
@@ -27,8 +27,9 @@ KUBERNETES_VERSION="${3?Please specify the Kubernetes version}"
CNI="${4?Please specify the cni}"
NODEADM_AMD_URL="${5?Please specify the nodeadm amd url}"
NODEADM_ARM_URL="${6?Please specify the nodeadm arm url}"
LOGS_BUCKET="${7-}"
ENDPOINT="${8-}"
LOGS_BUCKET="${7?Please specify the bucket for logs}"
ARTIFACTS_FOLDER="${8?Please specify the folder for artifacts}"
ENDPOINT="${9-}"

CONFIG_DIR="$REPO_ROOT/e2e-config"
ARCH="$([ "x86_64" = "$(uname -m)" ] && echo amd64 || echo arm64)"
@@ -65,22 +66,25 @@ trap "cleanup" EXIT

$BIN_DIR/e2e-test setup -s $RESOURCES_YAML

mkdir -p e2e-reports
mkdir -p "$ARTIFACTS_FOLDER"
ARTIFACTS_FOLDER=$(realpath "$ARTIFACTS_FOLDER")

cat <<EOF > $CONFIG_DIR/e2e-param.yaml
clusterName: "$CLUSTER_NAME"
clusterRegion: "$REGION"
nodeadmUrlAMD: "$NODEADM_AMD_URL"
nodeadmUrlARM: "$NODEADM_ARM_URL"
logsBucket: "$LOGS_BUCKET"
endpoint: "$ENDPOINT"
artifactsFolder: "$ARTIFACTS_FOLDER"
EOF


SKIP_FILE=$REPO_ROOT/hack/SKIPPED_TESTS.yaml
# Extract skipped_tests field from SKIP_FILE file and join entries with ' || '
skip=$(yq '.skipped_tests | join("|")' ${SKIP_FILE})

mkdir -p e2e-reports

# We explicitly specify procs instead of letting ginkgo decide (with -p) because otherwise
# ginkgo will use all available CPUs, which could be a small number depending
# on how the CI runner has been configured. However, even if only one CPU is available,
1 change: 1 addition & 0 deletions hybrid-nodes-cdk/lib/nodeadm-stack.ts
@@ -344,6 +344,7 @@ export class NodeadmBuildStack extends cdk.Stack {
'ec2:RebootInstances',
'ec2:StopInstances',
'ec2:TerminateInstances',
'ec2-instance-connect:SendSerialConsoleSSHPublicKey',
],
resources: ['*'],
effect: iam.Effect.ALLOW,
2 changes: 1 addition & 1 deletion test/e2e/ec2/instance.go
@@ -158,7 +158,7 @@ func DeleteEC2Instance(ctx context.Context, client *ec2.Client, instanceID strin

func WaitForEC2InstanceRunning(ctx context.Context, ec2Client *ec2.Client, instanceID string) error {
waiter := ec2.NewInstanceRunningWaiter(ec2Client, func(isowo *ec2.InstanceRunningWaiterOptions) {
isowo.LogWaitAttempts = true
isowo.MinDelay = 1 * time.Second
})
return waiter.Wait(ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, nodeRunningTimeout)
}
48 changes: 30 additions & 18 deletions test/e2e/kubernetes/verify.go
@@ -6,6 +6,7 @@ import (
"strings"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
clientgo "k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
@@ -24,32 +25,43 @@ type VerifyNode struct {
NodeIPAddress string
}

func (t VerifyNode) Run(ctx context.Context) error {
func (v VerifyNode) WaitForNodeReady(ctx context.Context) (*corev1.Node, error) {
// get the hybrid node registered using nodeadm by the internal IP of an EC2 Instance
node, err := WaitForNode(ctx, t.K8s, t.NodeIPAddress, t.Logger)
node, err := WaitForNode(ctx, v.K8s, v.NodeIPAddress, v.Logger)
if err != nil {
return err
return nil, err
}
if node == nil {
return fmt.Errorf("returned node is nil")
return nil, fmt.Errorf("returned node is nil")
}

nodeName := node.Name

t.Logger.Info("Waiting for hybrid node to be ready...")
if err = WaitForHybridNodeToBeReady(ctx, t.K8s, nodeName, t.Logger); err != nil {
v.Logger.Info("Waiting for hybrid node to be ready...")
if err = WaitForHybridNodeToBeReady(ctx, v.K8s, nodeName, v.Logger); err != nil {
return nil, err
}

return node, nil
}

func (v VerifyNode) Run(ctx context.Context) error {
node, err := v.WaitForNodeReady(ctx)
if err != nil {
return err
}

t.Logger.Info("Creating a test pod on the hybrid node...")
nodeName := node.Name

v.Logger.Info("Creating a test pod on the hybrid node...")
podName := GetNginxPodName(nodeName)
if err = CreateNginxPodInNode(ctx, t.K8s, nodeName, testPodNamespace, t.Region, t.Logger); err != nil {
if err = CreateNginxPodInNode(ctx, v.K8s, nodeName, testPodNamespace, v.Region, v.Logger); err != nil {
return err
}
t.Logger.Info(fmt.Sprintf("Pod %s created and running on node %s", podName, nodeName))
v.Logger.Info(fmt.Sprintf("Pod %s created and running on node %s", podName, nodeName))

t.Logger.Info("Exec-ing nginx -version", "pod", podName)
stdout, stderr, err := ExecPodWithRetries(ctx, t.ClientConfig, t.K8s, podName, testPodNamespace, "/sbin/nginx", "-version")
v.Logger.Info("Exec-ing nginx -version", "pod", podName)
stdout, stderr, err := ExecPodWithRetries(ctx, v.ClientConfig, v.K8s, podName, testPodNamespace, "/sbin/nginx", "-version")
if err != nil {
return err
}
@@ -59,23 +71,23 @@ func (t VerifyNode) Run(ctx context.Context) error {
if stderr != "" {
return fmt.Errorf("pod exec stderr should be empty %s", stderr)
}
t.Logger.Info("Successfully exec'd nginx -version", "pod", podName)
v.Logger.Info("Successfully exec'd nginx -version", "pod", podName)

t.Logger.Info("Checking logs for nginx output", "pod", podName)
logs, err := GetPodLogsWithRetries(ctx, t.K8s, podName, testPodNamespace)
v.Logger.Info("Checking logs for nginx output", "pod", podName)
logs, err := GetPodLogsWithRetries(ctx, v.K8s, podName, testPodNamespace)
if err != nil {
return err
}
if !strings.Contains(logs, "nginx") {
return fmt.Errorf("pod log does not contain expected value %s: %s", logs, "nginx")
}
t.Logger.Info("Successfully validated log output", "pod", podName)
v.Logger.Info("Successfully validated log output", "pod", podName)

t.Logger.Info("Deleting test pod", "pod", podName)
if err = DeletePod(ctx, t.K8s, podName, testPodNamespace); err != nil {
v.Logger.Info("Deleting test pod", "pod", podName)
if err = DeletePod(ctx, v.K8s, podName, testPodNamespace); err != nil {
return err
}
t.Logger.Info("Pod deleted successfully", "pod", podName)
v.Logger.Info("Pod deleted successfully", "pod", podName)

return nil
}
118 changes: 117 additions & 1 deletion test/e2e/logger.go
@@ -1,7 +1,10 @@
package e2e

import (
"bytes"
"io"
"os"
"sync"

"github.com/go-logr/logr"
"github.com/go-logr/zapr"
@@ -11,10 +14,14 @@ import (

type LoggerConfig struct {
NoColor bool
syncer zapcore.WriteSyncer
}

func (c LoggerConfig) Apply(opts *LoggerConfig) {
opts.NoColor = c.NoColor
if c.syncer != nil {
opts.syncer = c.syncer
}
}

type LoggerOption interface {
@@ -27,13 +34,122 @@ func NewLogger(opts ...LoggerOption) logr.Logger {
opt.Apply(cfg)
}

if cfg.syncer == nil {
cfg.syncer = zapcore.AddSync(os.Stdout)
}

encoderCfg := zap.NewDevelopmentEncoderConfig()
if !cfg.NoColor {
encoderCfg.EncodeLevel = zapcore.CapitalColorLevelEncoder
}

consoleEncoder := zapcore.NewConsoleEncoder(encoderCfg)
core := zapcore.NewCore(consoleEncoder, zapcore.AddSync(os.Stdout), zapcore.DebugLevel)
core := zapcore.NewCore(consoleEncoder, cfg.syncer, zapcore.DebugLevel)
log := zap.New(core)

return zapr.NewLogger(log)
}

// NewPausableLogger returns a logger that can be paused and resumed.
func NewPausableLogger(opts ...LoggerOption) PausableLogger {
cfg := &LoggerConfig{}
for _, opt := range opts {
opt.Apply(cfg)
}

activeSyncer := zapcore.AddSync(os.Stdout)
syncer := newSwitchSyncer(activeSyncer)
cfg.syncer = syncer

return PausableLogger{
Logger: NewLogger(cfg),
syncer: syncer,
}
}

// PausableLogger can be paused and resumed. It wraps a logr.Logger.
// When paused, writes go to a buffer; when resumed, writes go to stdout.
// After it's resumed, the buffer is flushed to stdout.
type PausableLogger struct {
logr.Logger
syncer *switchSyncer
}

func (l PausableLogger) Pause() {
l.syncer.Pause()
}

func (l PausableLogger) Resume() error {
return l.syncer.Resume()
}

// switchSyncer implements zapcore.WriteSyncer.
// When paused, writes go to a buffer; when resumed, writes go to the active writer.
type switchSyncer struct {
*SwitchWriter
active zapcore.WriteSyncer // normally stdout
}

func newSwitchSyncer(active zapcore.WriteSyncer) *switchSyncer {
return &switchSyncer{
active: active,
SwitchWriter: NewSwitchWriter(active),
}
}

func (s *switchSyncer) Sync() error {
s.mu.Lock()
defer s.mu.Unlock()
if s.paused {
return nil
}
return s.active.Sync()
}

// SwitchWriter implements io.Writer.
// When paused, writes go to a buffer; when resumed, writes go to the active writer.
type SwitchWriter struct {
mu sync.Mutex
active io.Writer // actual writer where we want to output
buffer *bytes.Buffer // holds bytes while paused
paused bool
}

var _ io.Writer = &SwitchWriter{}

func NewSwitchWriter(active io.Writer) *SwitchWriter {
return &SwitchWriter{
active: active,
buffer: &bytes.Buffer{},
}
}

func (s *SwitchWriter) Write(p []byte) (int, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.paused {
return s.buffer.Write(p)
}
return s.active.Write(p)
}

// Pause causes subsequent writes to be buffered.
func (s *SwitchWriter) Pause() {
s.mu.Lock()
s.paused = true
s.mu.Unlock()
}

// Resume flushes the buffer to the active writer and resumes normal output.
func (s *SwitchWriter) Resume() error {
s.mu.Lock()
defer s.mu.Unlock()
if s.buffer.Len() > 0 {
if _, err := s.active.Write(s.buffer.Bytes()); err != nil {
return err
}
s.buffer.Reset()
}
s.paused = false
return nil
}
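
A minimal usage sketch of the new pausable logger (the import path is
assumed from the repository layout, not shown in this diff):

package main

import (
	"github.com/aws/eks-hybrid/test/e2e" // assumed module path for this repo's e2e package
)

func main() {
	logger := e2e.NewPausableLogger()

	logger.Pause()                   // subsequent writes are held in the buffer
	logger.Info("buffered log line") // nothing printed yet

	// Resume flushes whatever was buffered while paused and goes back to
	// writing directly to stdout.
	if err := logger.Resume(); err != nil {
		panic(err)
	}
	logger.Info("printed immediately")
}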