From 53d479a4c910c13de0c71e1535f7413980a69dd5 Mon Sep 17 00:00:00 2001
From: Pavol Loffay
Date: Thu, 4 Jul 2024 14:48:56 +0200
Subject: [PATCH] Update tempo to 2.5.0 (#958)
* Update tempo to 2.5.0
Signed-off-by: Pavol Loffay
* Upgrade procedure to tempo 2.5.0
Signed-off-by: Pavol Loffay
* Fix
Signed-off-by: Pavol Loffay
* Fix monolithic
Signed-off-by: Pavol Loffay
* Fix
Signed-off-by: Pavol Loffay
* Fix
Signed-off-by: Pavol Loffay
* Fix
Signed-off-by: Pavol Loffay
* Fix
Signed-off-by: Pavol Loffay
---------
Signed-off-by: Pavol Loffay
---
.chloggen/bump-tempo.yaml | 20 ++
Dockerfile | 1 +
Makefile | 6 +-
.../tempo-operator.clusterserviceversion.yaml | 26 ++-
.../tempo-operator.clusterserviceversion.yaml | 26 ++-
config/manager/manager.yaml | 4 +-
config/rbac/role.yaml | 16 ++
controllers/tempo/tempostack_controller.go | 4 +
internal/upgrade/v0_11_0.go | 198 ++++++++++++++++++
internal/upgrade/versions.go | 5 +
10 files changed, 291 insertions(+), 15 deletions(-)
create mode 100755 .chloggen/bump-tempo.yaml
create mode 100644 internal/upgrade/v0_11_0.go
diff --git a/.chloggen/bump-tempo.yaml b/.chloggen/bump-tempo.yaml
new file mode 100755
index 000000000..dacd0fc4d
--- /dev/null
+++ b/.chloggen/bump-tempo.yaml
@@ -0,0 +1,20 @@
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: breaking
+
+# The name of the component, or a single word describing the area of concern, (e.g. operator, github action)
+component: operator
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Update Tempo to 2.5.0
+
+# One or more tracking issues related to the change
+issues: [958]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+ Upstream Tempo 2.5.0 image switched user from `root` to `tempo` (10001:10001) and changed the ownership of `/var/tempo`.
+ Therefore the ingester's `/var/tempo/wal` created by a previous deployment using Tempo 2.4.1 needs to have its
+ ownership changed. The operator updates the `/var/tempo` ownership by deploying a `job` with `securityContext.runAsUser(0)`
+ that runs `chown -R 10001:10001 /var/tempo`.
diff --git a/Dockerfile b/Dockerfile
index d21a8b058..f16cb3f00 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,7 @@ COPY . .
# Build
ARG OPERATOR_VERSION
+ARG TEMPO_VERSION
RUN make build
# Use distroless as minimal base image to package the manager binary
diff --git a/Makefile b/Makefile
index 8df75f590..840c6a352 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Current Operator version
OPERATOR_VERSION ?= 0.10.0
-TEMPO_VERSION ?= 2.4.1
-TEMPO_QUERY_VERSION ?= 2.4.1
+TEMPO_VERSION ?= 2.5.0
+TEMPO_QUERY_VERSION ?= 2.5.0
TEMPO_GATEWAY_VERSION ?= main-2024-05-29-ca8d2de
TEMPO_GATEWAY_OPA_VERSION ?= main-2024-04-29-914c13f
OAUTH_PROXY_VERSION=4.12
@@ -159,7 +159,7 @@ run: manifests generate ## Run a controller from your host.
.PHONY: docker-build
docker-build: ## Build docker image with the manager.
- docker buildx build --load --platform linux/${ARCH} --build-arg OPERATOR_VERSION -t ${IMG} .
+ docker buildx build --load --platform linux/${ARCH} --build-arg OPERATOR_VERSION --build-arg TEMPO_VERSION -t ${IMG} .
.PHONY: docker-push
docker-push: ## Push docker image with the manager.
diff --git a/bundle/community/manifests/tempo-operator.clusterserviceversion.yaml b/bundle/community/manifests/tempo-operator.clusterserviceversion.yaml
index 4596e5fd6..a36cd0ae1 100644
--- a/bundle/community/manifests/tempo-operator.clusterserviceversion.yaml
+++ b/bundle/community/manifests/tempo-operator.clusterserviceversion.yaml
@@ -74,7 +74,7 @@ metadata:
capabilities: Deep Insights
categories: Logging & Tracing,Monitoring
containerImage: ghcr.io/grafana/tempo-operator/tempo-operator:v0.10.0
- createdAt: "2024-06-28T12:21:20Z"
+ createdAt: "2024-07-04T12:01:25Z"
description: Create and manage deployments of Tempo, a high-scale distributed
tracing backend.
operatorframework.io/cluster-monitoring: "true"
@@ -1190,6 +1190,15 @@ spec:
- deployments/finalizers
verbs:
- update
+ - apiGroups:
+ - batch
+ resources:
+ - jobs
+ verbs:
+ - create
+ - get
+ - list
+ - watch
- apiGroups:
- config.openshift.io
resources:
@@ -1198,6 +1207,13 @@ spec:
- get
- list
- watch
+ - apiGroups:
+ - ""
+ resources:
+ - persistentvolumeclaims
+ verbs:
+ - list
+ - watch
- apiGroups:
- grafana.integreatly.org
resources:
@@ -1366,9 +1382,9 @@ spec:
- --config=controller_manager_config.yaml
env:
- name: RELATED_IMAGE_TEMPO
- value: docker.io/grafana/tempo:2.4.1
+ value: docker.io/grafana/tempo:2.5.0
- name: RELATED_IMAGE_TEMPO_QUERY
- value: docker.io/grafana/tempo-query:2.4.1
+ value: docker.io/grafana/tempo-query:2.5.0
- name: RELATED_IMAGE_TEMPO_GATEWAY
value: quay.io/observatorium/api:main-2024-05-29-ca8d2de
- name: RELATED_IMAGE_TEMPO_GATEWAY_OPA
@@ -1513,9 +1529,9 @@ spec:
provider:
name: Grafana Tempo Operator SIG
relatedImages:
- - image: docker.io/grafana/tempo:2.4.1
+ - image: docker.io/grafana/tempo:2.5.0
name: tempo
- - image: docker.io/grafana/tempo-query:2.4.1
+ - image: docker.io/grafana/tempo-query:2.5.0
name: tempo-query
- image: quay.io/observatorium/api:main-2024-05-29-ca8d2de
name: tempo-gateway
diff --git a/bundle/openshift/manifests/tempo-operator.clusterserviceversion.yaml b/bundle/openshift/manifests/tempo-operator.clusterserviceversion.yaml
index 43bb2261a..278e01ed5 100644
--- a/bundle/openshift/manifests/tempo-operator.clusterserviceversion.yaml
+++ b/bundle/openshift/manifests/tempo-operator.clusterserviceversion.yaml
@@ -74,7 +74,7 @@ metadata:
capabilities: Deep Insights
categories: Logging & Tracing,Monitoring
containerImage: ghcr.io/grafana/tempo-operator/tempo-operator:v0.10.0
- createdAt: "2024-06-28T12:21:19Z"
+ createdAt: "2024-07-04T12:01:24Z"
description: Create and manage deployments of Tempo, a high-scale distributed
tracing backend.
operatorframework.io/cluster-monitoring: "true"
@@ -1200,6 +1200,15 @@ spec:
- deployments/finalizers
verbs:
- update
+ - apiGroups:
+ - batch
+ resources:
+ - jobs
+ verbs:
+ - create
+ - get
+ - list
+ - watch
- apiGroups:
- config.openshift.io
resources:
@@ -1208,6 +1217,13 @@ spec:
- get
- list
- watch
+ - apiGroups:
+ - ""
+ resources:
+ - persistentvolumeclaims
+ verbs:
+ - list
+ - watch
- apiGroups:
- grafana.integreatly.org
resources:
@@ -1376,9 +1392,9 @@ spec:
- --config=controller_manager_config.yaml
env:
- name: RELATED_IMAGE_TEMPO
- value: docker.io/grafana/tempo:2.4.1
+ value: docker.io/grafana/tempo:2.5.0
- name: RELATED_IMAGE_TEMPO_QUERY
- value: docker.io/grafana/tempo-query:2.4.1
+ value: docker.io/grafana/tempo-query:2.5.0
- name: RELATED_IMAGE_TEMPO_GATEWAY
value: quay.io/observatorium/api:main-2024-05-29-ca8d2de
- name: RELATED_IMAGE_TEMPO_GATEWAY_OPA
@@ -1534,9 +1550,9 @@ spec:
provider:
name: Grafana Tempo Operator SIG
relatedImages:
- - image: docker.io/grafana/tempo:2.4.1
+ - image: docker.io/grafana/tempo:2.5.0
name: tempo
- - image: docker.io/grafana/tempo-query:2.4.1
+ - image: docker.io/grafana/tempo-query:2.5.0
name: tempo-query
- image: quay.io/observatorium/api:main-2024-05-29-ca8d2de
name: tempo-gateway
diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml
index 9f9cc8cfe..bc9c24d14 100644
--- a/config/manager/manager.yaml
+++ b/config/manager/manager.yaml
@@ -40,9 +40,9 @@ spec:
- --leader-elect
env:
- name: RELATED_IMAGE_TEMPO
- value: docker.io/grafana/tempo:2.4.1
+ value: docker.io/grafana/tempo:2.5.0
- name: RELATED_IMAGE_TEMPO_QUERY
- value: docker.io/grafana/tempo-query:2.4.1
+ value: docker.io/grafana/tempo-query:2.5.0
- name: RELATED_IMAGE_TEMPO_GATEWAY
value: quay.io/observatorium/api:main-2024-05-29-ca8d2de
- name: RELATED_IMAGE_TEMPO_GATEWAY_OPA
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index c9b71324e..5cade67bd 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -46,6 +46,15 @@ rules:
- deployments/finalizers
verbs:
- update
+- apiGroups:
+ - batch
+ resources:
+ - jobs
+ verbs:
+ - create
+ - get
+ - list
+ - watch
- apiGroups:
- config.openshift.io
resources:
@@ -54,6 +63,13 @@ rules:
- get
- list
- watch
+- apiGroups:
+ - ""
+ resources:
+ - persistentvolumeclaims
+ verbs:
+ - list
+ - watch
- apiGroups:
- grafana.integreatly.org
resources:
diff --git a/controllers/tempo/tempostack_controller.go b/controllers/tempo/tempostack_controller.go
index 4405c9965..3d82a0539 100644
--- a/controllers/tempo/tempostack_controller.go
+++ b/controllers/tempo/tempostack_controller.go
@@ -59,6 +59,10 @@ type TempoStackReconciler struct {
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grafana.integreatly.org,resources=grafanadatasources,verbs=get;list;watch;create;update;patch;delete
+// Upgrade for 0.11.0 to Tempo 2.5
+// +kubebuilder:rbac:groups="core",resources=persistentvolumeclaims,verbs=list;watch
+// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create
+
//+kubebuilder:rbac:groups=tempo.grafana.com,resources=tempostacks,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=tempo.grafana.com,resources=tempostacks/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=tempo.grafana.com,resources=tempostacks/finalizers,verbs=update
diff --git a/internal/upgrade/v0_11_0.go b/internal/upgrade/v0_11_0.go
new file mode 100644
index 000000000..c7040d5df
--- /dev/null
+++ b/internal/upgrade/v0_11_0.go
@@ -0,0 +1,198 @@
+package upgrade
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ appsv1 "k8s.io/api/apps/v1"
+ batchv1 "k8s.io/api/batch/v1"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/util/wait"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+
+ "github.com/grafana/tempo-operator/apis/tempo/v1alpha1"
+ "github.com/grafana/tempo-operator/internal/manifests/manifestutils"
+ "github.com/grafana/tempo-operator/internal/manifests/naming"
+)
+
+const (
+ // pollInterval is the delay between two consecutive checks performed by the
+ // wait.PollUntilContextTimeout calls in this file.
+ pollInterval = 2 * time.Second
+ // pollTimeout is the maximum time a single wait.PollUntilContextTimeout call
+ // in this file keeps polling before it gives up.
+ pollTimeout = 5 * time.Minute
+)
+
+// upgrade0_11_0 prepares the ingester volumes of a TempoStack for Tempo 2.5.0.
+//
+// Upstream Tempo 2.5.0 has a breaking change https://github.com/grafana/tempo/releases/tag/v2.5.0
+// The /var/tempo is created in the dockerfile with 10001:10001
+// The user is changed to 10001:10001
+// The image previously deployed by the operator (2.4.1) ran as root (0)
+// The Red Hat Tempo image does not use root user (it uses 1001) and on OpenShift the /var/tempo PV has a different fsGroup
+// so the issue does not happen on OpenShift.
+//
+// The function lists the ingester PVCs of the instance, scales the ingester
+// StatefulSet down to zero and then runs a job that changes the ownership of
+// the volumes. It returns nil without doing anything on OpenShift or when the
+// instance has no ingester PVCs.
+func upgrade0_11_0(ctx context.Context, u Upgrade, tempo *v1alpha1.TempoStack) error {
+	// do nothing on OpenShift
+	if u.CtrlConfig.Gates.OpenShift.OpenShiftRoute {
+		return nil
+	}
+
+	// Prefer the image configured on the instance, fall back to the operator default.
+	image := tempo.Spec.Images.Tempo
+	if image == "" {
+		image = u.CtrlConfig.DefaultImages.Tempo
+	}
+
+	listOps := []client.ListOption{
+		// The component labels only carry the instance name, so the lookup has to be
+		// restricted to the namespace of this instance. Otherwise PVCs of an instance
+		// with the same name in another namespace would be picked up as well.
+		client.InNamespace(tempo.GetNamespace()),
+		client.MatchingLabels(manifestutils.ComponentLabels(manifestutils.IngesterComponentName, tempo.Name)),
+	}
+	pvcs := &corev1.PersistentVolumeClaimList{}
+	if err := u.Client.List(ctx, pvcs, listOps...); err != nil {
+		return fmt.Errorf("listing ingester persistent volume claims: %w", err)
+	}
+	if len(pvcs.Items) == 0 {
+		return nil
+	}
+
+	ingesterKey := client.ObjectKey{
+		Namespace: tempo.GetNamespace(),
+		Name:      naming.Name(manifestutils.IngesterComponentName, tempo.GetName()),
+	}
+	if err := scale_down_ingester(ctx, u, ingesterKey); err != nil {
+		return fmt.Errorf("scaling down ingester: %w", err)
+	}
+
+	return chown_pvcs(ctx, u, tempo, tempo.Spec.Template.Ingester.NodeSelector, image, pvcs)
+}
+
+// upgrade0_11_0_monolithic prepares the volumes of a TempoMonolithic instance
+// for Tempo 2.5.0, whose image runs as user 10001 instead of root.
+//
+// The function lists the PVCs of the monolithic component, scales its
+// StatefulSet down to zero and then runs a job that changes the ownership of
+// the volumes. The job always uses the operator's default Tempo image. It
+// returns nil without doing anything on OpenShift or when the instance has no
+// PVCs.
+func upgrade0_11_0_monolithic(ctx context.Context, u Upgrade, tempo *v1alpha1.TempoMonolithic) error {
+	// do nothing on OpenShift
+	if u.CtrlConfig.Gates.OpenShift.OpenShiftRoute {
+		return nil
+	}
+
+	listOps := []client.ListOption{
+		// The component labels only carry the instance name, so the lookup has to be
+		// restricted to the namespace of this instance. Otherwise PVCs of an instance
+		// with the same name in another namespace would be picked up as well.
+		client.InNamespace(tempo.GetNamespace()),
+		client.MatchingLabels(manifestutils.ComponentLabels(manifestutils.TempoMonolithComponentName, tempo.Name)),
+	}
+	pvcs := &corev1.PersistentVolumeClaimList{}
+	if err := u.Client.List(ctx, pvcs, listOps...); err != nil {
+		return fmt.Errorf("listing monolithic persistent volume claims: %w", err)
+	}
+	if len(pvcs.Items) == 0 {
+		return nil
+	}
+
+	statefulSetKey := client.ObjectKey{
+		Namespace: tempo.GetNamespace(),
+		Name:      naming.Name(manifestutils.TempoMonolithComponentName, tempo.GetName()),
+	}
+	if err := scale_down_ingester(ctx, u, statefulSetKey); err != nil {
+		return fmt.Errorf("scaling down monolithic statefulset: %w", err)
+	}
+
+	return chown_pvcs(ctx, u, tempo, tempo.Spec.NodeSelector, u.CtrlConfig.DefaultImages.Tempo, pvcs)
+}
+
+func scale_down_ingester(ctx context.Context, u Upgrade, ingesterQuery client.ObjectKey) error {
+ ingester := &appsv1.StatefulSet{}
+ err := u.Client.Get(ctx, ingesterQuery, ingester)
+ if err != nil {
+ // ingester does not exist, maybe scaled down?
+ if client.IgnoreNotFound(err) == nil {
+ return nil
+ }
+ return err
+ }
+
+ patch := ingester.DeepCopy()
+ zero := int32(0)
+ patch.Spec.Replicas = &zero
+ err = u.Client.Patch(ctx, patch, client.MergeFrom(ingester))
+ if err != nil {
+ return err
+ }
+
+ return wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (done bool, err error) {
+ ingester := &appsv1.StatefulSet{}
+ err = u.Client.Get(ctx, ingesterQuery, ingester)
+ if err != nil {
+ return false, err
+ }
+ if ingester.Status.Replicas == 0 {
+ return true, nil
+ }
+
+ return false, nil
+ })
+}
+
+func chown_pvcs(ctx context.Context, u Upgrade, tempo metav1.Object, nodeSelector map[string]string, image string, pvcs *corev1.PersistentVolumeClaimList) error {
+ var volumes []corev1.Volume
+ var volumeMounts []corev1.VolumeMount
+ for _, pvc := range pvcs.Items {
+ volumes = append(volumes, corev1.Volume{
+ Name: pvc.Name,
+ VolumeSource: corev1.VolumeSource{
+ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+ ClaimName: pvc.Name,
+ },
+ },
+ })
+
+ volumeMounts = append(volumeMounts, corev1.VolumeMount{
+ Name: pvc.Name,
+ MountPath: fmt.Sprintf("/var/tempo/%s", pvc.Name),
+ })
+ }
+
+ // keep the jobs around for 1 day
+ ttl := int32(60 * 60 * 24)
+ rootUser := int64(0)
+ upgradeJob := batchv1.Job{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: fmt.Sprintf("chown-%s", tempo.GetName()),
+ Namespace: tempo.GetNamespace(),
+ },
+ Spec: batchv1.JobSpec{
+ Template: corev1.PodTemplateSpec{
+ Spec: corev1.PodSpec{
+ // Make sure the job runs on the same node as ingester
+ NodeSelector: nodeSelector,
+ ServiceAccountName: naming.DefaultServiceAccountName(tempo.GetName()),
+ Volumes: volumes,
+ Containers: []corev1.Container{
+ {
+ Name: "chown",
+ Image: image,
+ Command: []string{"chown", "-R", "10001:10001", "/var/tempo"},
+ VolumeMounts: volumeMounts,
+ },
+ },
+ RestartPolicy: corev1.RestartPolicyNever,
+ SecurityContext: &corev1.PodSecurityContext{
+ RunAsUser: &rootUser,
+ },
+ },
+ },
+ TTLSecondsAfterFinished: &ttl,
+ },
+ }
+
+ if err := ctrl.SetControllerReference(tempo, &upgradeJob, u.Client.Scheme()); err != nil {
+ return err
+ }
+ err := u.Client.Create(ctx, &upgradeJob)
+ if err != nil {
+ return err
+ }
+ return wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (done bool, err error) {
+ job := &batchv1.Job{}
+ objectKey := client.ObjectKey{
+ Namespace: upgradeJob.Namespace,
+ Name: upgradeJob.Name,
+ }
+ err = u.Client.Get(ctx, objectKey, job)
+ if err != nil {
+ return false, err
+ }
+ if job.Status.Succeeded == 1 {
+ return true, nil
+ }
+
+ return false, nil
+ })
+
+}
diff --git a/internal/upgrade/versions.go b/internal/upgrade/versions.go
index 2c0ad6a0b..c4d3386e5 100644
--- a/internal/upgrade/versions.go
+++ b/internal/upgrade/versions.go
@@ -45,5 +45,10 @@ var (
version: *semver.MustParse("0.8.0"),
upgradeTempoStack: upgrade0_8_0,
},
+ {
+ version: *semver.MustParse("0.11.0"),
+ upgradeTempoStack: upgrade0_11_0,
+ upgradeTempoMonolithic: upgrade0_11_0_monolithic,
+ },
}
)