Commit 6230429

UPSTREAM: <carry>: migrate olmv1 QE stress cases
1 parent c310c42 commit 6230429

26 files changed, +834 -8 lines changed

openshift/tests-extension/.openshift-tests-extension/openshift_payload_olmv1.json

Lines changed: 16 additions & 0 deletions
@@ -565,6 +565,22 @@
       "exclude": "topology==\"External\""
     }
   },
+  {
+    "name": "[sig-olmv1][Jira:OLM] OLM v1 for stress PolarionID:81509-[OTP][Skipped:Disconnected][OlmStress]olmv1 create mass operator to see if they all are installed successfully [Slow][Timeout:330m]",
+    "labels": {
+      "Extended": {},
+      "NonHyperShiftHOST": {},
+      "StressTest": {}
+    },
+    "resources": {
+      "isolation": {}
+    },
+    "source": "openshift:payload:olmv1",
+    "lifecycle": "blocking",
+    "environmentSelector": {
+      "exclude": "topology==\"External\""
+    }
+  },
   {
     "name": "[sig-olmv1][OCPFeatureGate:NewOLM][Skipped:Disconnected] OLMv1 Catalogs should be installed",
     "labels": {},
openshift/tests-extension/test/qe/README.md

Lines changed: 14 additions & 1 deletion
@@ -244,7 +244,20 @@ All migrated test case code needs the following changes to run in the new test f
 
 **Environment Validation for Disconnected-Supporting Migrated Test Cases:**
 
-If your test case supports disconnected environments, you MUST call `ValidateAccessEnvironment` at the beginning of the test:
+**When to use `ValidateAccessEnvironment`:**
+
+1. **Test cases that create ClusterCatalog or ClusterExtension**:
+   - If your test supports disconnected environments (both connected and disconnected, or disconnected-only)
+   - AND your test creates ClusterCatalog or ClusterExtension resources
+   - you **MUST** call `ValidateAccessEnvironment(oc)` at the beginning of the test
+   - This applies to both newly created and migrated test cases
+
+2. **Test cases that do NOT create ClusterCatalog or ClusterExtension**:
+   - Calling `ValidateAccessEnvironment(oc)` is optional
+   - Using it won't cause errors, but it's not required
+   - The validation is primarily for ensuring catalog images can be mirrored
+
+**Usage example:**
 
 ```go
 g.It("test case supporting disconnected", func() {
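
The diff view cuts the usage example off at its opening line. As a rough sketch only (not the README's actual continuation, and assuming `ValidateAccessEnvironment` is exported from the `olmv1util` package used elsewhere in this commit), the call typically sits first in the spec body:

```go
g.It("test case supporting disconnected", func() {
	// Hypothetical placement: validate catalog-image access up front so
	// unsupported environments bail out before any resources are created.
	olmv1util.ValidateAccessEnvironment(oc)
	// ... actual test steps ...
})
```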
openshift/tests-extension/test/qe/specs/olmv1_stress.go

Lines changed: 145 additions & 0 deletions

@@ -0,0 +1,145 @@
package specs

import (
	"fmt"
	"path/filepath"
	"time"

	g "github.com/onsi/ginkgo/v2"
	o "github.com/onsi/gomega"
	e2e "k8s.io/kubernetes/test/e2e/framework"

	exutil "github.com/openshift/operator-framework-operator-controller/openshift/tests-extension/test/qe/util"
	olmv1util "github.com/openshift/operator-framework-operator-controller/openshift/tests-extension/test/qe/util/olmv1util"
)

var _ = g.Describe("[sig-olmv1][Jira:OLM] OLM v1 for stress", func() {

	defer g.GinkgoRecover()
	var (
		oc = exutil.NewCLIWithoutNamespace("default")
	)

	g.BeforeEach(func() {
		exutil.SkipMicroshift(oc)
		exutil.SkipNoOLMv1Core(oc)
	})

	// author: [email protected]
	g.It("PolarionID:81509-[OTP][Skipped:Disconnected][OlmStress]olmv1 create mass operator to see if they all are installed successfully [Slow][Timeout:330m]", g.Label("StressTest"), g.Label("NonHyperShiftHOST"), func() {
		var (
			caseID                       = "81509"
			prefixCatalog                = "catalog-" + caseID
			prefixSa                     = "sa-" + caseID
			prefixCe                     = "ce-" + caseID
			prefixNs                     = "ns-" + caseID
			prefixPackage                = "stress-olmv1-c"
			prefixImage                  = "quay.io/olmqe/stress-index:vokv"
			nsOc                         = "openshift-operator-controller"
			nsCatalog                    = "openshift-catalogd"
			catalogLabel                 = "control-plane=catalogd-controller-manager"
			ocLabel                      = "control-plane=operator-controller-controller-manager"
			baseDir                      = exutil.FixturePath("testdata", "olm")
			clustercatalogTemplate       = filepath.Join(baseDir, "clustercatalog.yaml")
			clusterextensionTemplate     = filepath.Join(baseDir, "clusterextension.yaml")
			saClusterRoleBindingTemplate = filepath.Join(baseDir, "sa-admin.yaml")
		)

		if !olmv1util.IsPodReady(oc, nsCatalog, catalogLabel) {
			_, _ = olmv1util.Get(oc, "pod", "-n", nsCatalog, "-l", catalogLabel, "-o", "yaml")
			exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct")
		}
		if !olmv1util.IsPodReady(oc, nsOc, ocLabel) {
			_, _ = olmv1util.Get(oc, "pod", "-n", nsOc, "-l", ocLabel, "-o", "yaml")
			exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", ocLabel), "the pod with app=olm-operator is not correct")
		}

		startTime := time.Now().UTC()
		e2e.Logf("Start time: %s", startTime.Format(time.RFC3339))

		// for i := 0; i < 500; i++ {
		for i := 900; i < 969; i++ {
			// 330m is not enough for one case if we run 100 iterations
			e2e.Logf("=================it is round %v=================", i)
			ns := fmt.Sprintf("%s-%d", prefixNs, i)
			clustercatalog := olmv1util.ClusterCatalogDescription{
				Name:     fmt.Sprintf("%s-%d", prefixCatalog, i),
				Imageref: fmt.Sprintf("%s%d", prefixImage, i),
				Template: clustercatalogTemplate,
			}
			saCrb := olmv1util.SaCLusterRolebindingDescription{
				Name:      fmt.Sprintf("%s-%d", prefixSa, i),
				Namespace: ns,
				Template:  saClusterRoleBindingTemplate,
			}
			ce := olmv1util.ClusterExtensionDescription{
				Name:             fmt.Sprintf("%s-%d", prefixCe, i),
				PackageName:      fmt.Sprintf("%s%d", prefixPackage, i),
				Channel:          "alpha",
				Version:          ">=0.0.1",
				InstallNamespace: ns,
				SaName:           fmt.Sprintf("%s-%d", prefixSa, i),
				Template:         clusterextensionTemplate,
			}
			g.By(fmt.Sprintf("Create namespace for %d", i))
			// defer oc.WithoutNamespace().AsAdmin().Run("delete").Args("ns", ns, "--ignore-not-found").Execute()
			// deleting the namespaces takes time and is unnecessary; 5.5h is currently not enough to delete them all,
			// so we keep the namespaces to save case duration
			err := oc.WithoutNamespace().AsAdmin().Run("create").Args("ns", ns).Execute()
			o.Expect(err).NotTo(o.HaveOccurred())
			o.Expect(olmv1util.Appearance(oc, exutil.Appear, "ns", ns)).To(o.BeTrue())

			g.By(fmt.Sprintf("Create clustercatalog for %d", i))
			e2e.Logf("=========Create clustercatalog %v=========", clustercatalog.Name)
			defer clustercatalog.Delete(oc)
			err = clustercatalog.CreateWithoutCheck(oc)
			o.Expect(err).NotTo(o.HaveOccurred())
			clustercatalog.WaitCatalogStatus(oc, "true", "Serving", 0)

			g.By(fmt.Sprintf("Create SA for clusterextension for %d", i))
			defer saCrb.Delete(oc)
			saCrb.Create(oc)

			g.By(fmt.Sprintf("check ce to be installed for %d", i))
			e2e.Logf("=========Create clusterextension %v=========", ce.Name)
			defer ce.Delete(oc)
			err = ce.CreateWithoutCheck(oc)
			o.Expect(err).NotTo(o.HaveOccurred())
			ce.CheckClusterExtensionCondition(oc, "Progressing", "reason", "Succeeded", 10, 600, 0)
			ce.WaitClusterExtensionCondition(oc, "Installed", "True", 0)
		}

		endTime := time.Now().UTC()
		e2e.Logf("End time: %v", endTime.Format(time.RFC3339))

		duration := endTime.Sub(startTime)
		minutes := int(duration.Minutes())
		if minutes < 1 {
			minutes = 1
		}

		podName, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", catalogLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsCatalog).Output()
		if err == nil {
			if !olmv1util.WriteErrToArtifactDir(oc, nsCatalog, podName, "error", "Unhandled|Reconciler error|level=info", caseID, minutes) {
				e2e.Logf("no error log into artifact for pod %s in %s", podName, nsCatalog)
			}
		}
		podName, err = oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", ocLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsOc).Output()
		if err == nil {
			if !olmv1util.WriteErrToArtifactDir(oc, nsOc, podName, "error", "Unhandled|Reconciler error|level=info", caseID, minutes) {
				e2e.Logf("no error log into artifact for pod %s in %s", podName, nsOc)
			}
		}

		if !olmv1util.IsPodReady(oc, nsCatalog, catalogLabel) {
			_, _ = olmv1util.Get(oc, "pod", "-n", nsCatalog, "-l", catalogLabel, "-o", "yaml")
			exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct")
		}
		if !olmv1util.IsPodReady(oc, nsOc, ocLabel) {
			_, _ = olmv1util.Get(oc, "pod", "-n", nsOc, "-l", ocLabel, "-o", "yaml")
			exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", ocLabel), "the pod with app=olm-operator is not correct")
		}

	})
})
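
Since the loop bounds above are easy to misread: the spec runs 69 rounds (i = 900 through 968), and the round counter feeds every resource name plus the catalog image tag, so each round installs a distinct pre-built index image. A small standalone illustration of that naming scheme (derived from the constants above, not part of the committed file):

```go
package main

import "fmt"

func main() {
	// Walk the first and last rounds of the naming scheme used by PolarionID:81509.
	const caseID = "81509"
	for _, i := range []int{900, 968} {
		fmt.Printf("round %d:\n", i)
		fmt.Printf("  namespace: ns-%s-%d\n", caseID, i)
		fmt.Printf("  catalog:   catalog-%s-%d\n", caseID, i)
		fmt.Printf("  image:     quay.io/olmqe/stress-index:vokv%d\n", i)
		fmt.Printf("  package:   stress-olmv1-c%d\n", i)
	}
}
```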

openshift/tests-extension/test/qe/util/architecture/architecture.go

Lines changed: 6 additions & 6 deletions
@@ -80,10 +80,10 @@ func SkipNonAmd64SingleArch(oc *exutil.CLI) Architecture {
 func getNodeArchitectures(oc *exutil.CLI) []string {
 	output, err := oc.WithoutNamespace().AsAdmin().Run("get").Args("nodes", "-o=jsonpath={.items[*].status.nodeInfo.architecture}").Output()
 	if err != nil {
-		e2e.Failf("unable to get cluster node architectures: %v", err)
+		g.Skip(fmt.Sprintf("unable to get cluster node architectures: %v", err))
 	}
 	if output == "" {
-		e2e.Failf("no nodes found or architecture information missing")
+		g.Skip("no nodes found or architecture information missing")
 	}
 	return strings.Fields(output) // Use Fields instead of Split to handle multiple spaces
 }
@@ -97,7 +97,7 @@ func getNodeArchitectures(oc *exutil.CLI) []string {
 func GetAvailableArchitecturesSet(oc *exutil.CLI) []Architecture {
 	architectureStrings := getNodeArchitectures(oc)
 	if len(architectureStrings) == 0 {
-		e2e.Failf("no node architectures found")
+		g.Skip("no node architectures found")
 	}
 
 	// Use map for deduplication with Architecture as key
@@ -199,7 +199,7 @@ func (a Architecture) String() string {
 func ClusterArchitecture(oc *exutil.CLI) Architecture {
 	architectureStrings := getNodeArchitectures(oc)
 	if len(architectureStrings) == 0 {
-		e2e.Failf("no node architectures found")
+		g.Skip("no node architectures found")
 	}
 
 	// Filter out empty strings and convert to Architecture
@@ -211,7 +211,7 @@ func ClusterArchitecture(oc *exutil.CLI) Architecture {
 	}
 
 	if len(architectures) == 0 {
-		e2e.Failf("no valid node architectures found")
+		g.Skip("no valid node architectures found")
 	}
 
 	// Check if all architectures are the same
@@ -267,7 +267,7 @@ func GetControlPlaneArch(oc *exutil.CLI) Architecture {
 
 	architectureStr = strings.TrimSpace(architectureStr)
 	if architectureStr == "" {
-		e2e.Failf("Control plane node %s has no architecture information", masterNode)
+		g.Skip(fmt.Sprintf("Control plane node %s has no architecture information", masterNode))
 	}
 
 	return FromString(architectureStr)

openshift/tests-extension/test/qe/util/olmv1util/helper.go

Lines changed: 1 addition & 1 deletion
@@ -590,7 +590,7 @@ func HasExternalNetworkAccess(oc *exutil.CLI) bool {
 	// Note: In disconnected environments, curl will fail and bash will return non-zero exit code,
 	// causing DebugNodeWithChroot to return an error. We ignore this error and rely on output checking.
 	cmd := `timeout 10 curl -k https://quay.io > /dev/null 2>&1; [ $? -eq 0 ] && echo "connected"`
-	output, _ := exutil.DebugNodeWithChroot(oc, masterNode, "bash", "-c", cmd)
+	output, _ := exutil.DebugNodeWithOptionsAndChroot(oc, masterNode, []string{"--to-namespace=default"}, "bash", "-c", cmd)
 
 	// Check if the output contains "connected"
 	// - Connected environment: curl succeeds -> echo "connected" -> output contains "connected"
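
The replacement passes `--to-namespace=default`, so the node debug pod runs in the `default` namespace instead of whatever namespace the CLI currently targets. A minimal sketch of how a spec might gate on this helper (the guard below is illustrative, not part of this commit):

```go
// Hypothetical connected-only guard, using the same imports as the
// stress spec above (olmv1util, g for ginkgo).
if !olmv1util.HasExternalNetworkAccess(oc) {
	g.Skip("cluster nodes cannot reach quay.io; skipping connected-only scenario")
}
```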
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# OLM v1 Stress Testing Tools

This directory contains tools and configuration files for OLM v1 stress testing. These tools are used by the `openshift-e2e-test-olm-qe-stress` Prow job chain.

## Directory Structure

```
stress/
├── manifests/
│   └── config/
│       └── pkg-ins-v1/                  # OLM v1 stress test configurations
│           ├── pkg-ins-v1.yml           # Main kube-burner configuration
│           ├── metrics-endpoint.yml
│           ├── metrics-profiles/
│           │   └── metrics-aggregated.yml
│           └── templates/               # Resource templates for stress testing
│               ├── catalogd.yml         # ClusterCatalog template
│               ├── ce.yml               # ClusterExtension template
│               ├── sa.yml               # ServiceAccount template
│               ├── clusterrole.yml
│               └── clusterrolebinding.yml
└── util/
    └── ma/                              # Metrics analysis Python package
        ├── cli/                         # CLI commands
        │   ├── cmd_group.py
        │   └── cmd_check_ccpu.py
        └── helper/                      # Helper modules
            ├── util.py
            ├── containercpu.py
            ├── exceptions.py
            ├── algo.py
            └── const.py
```

## Purpose

This directory supports stress testing for OLM v1 by providing:

1. **Kube-burner Configuration**: `pkg-ins-v1.yml` defines the stress test job that creates multiple ClusterCatalogs and ClusterExtensions (see the sketch after this list)
2. **Resource Templates**: YAML templates for creating OLM v1 resources at scale
3. **Metrics Analysis**: Python tools for analyzing CPU and performance metrics during stress tests
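
A minimal sketch of what a kube-burner job definition of this shape could look like. This is illustrative only: the field names follow the public kube-burner job schema, but all values, iteration counts, and template pairings are assumptions, not the contents of the real `pkg-ins-v1.yml` (which this commit adds but this page does not display):

```yaml
# Hypothetical kube-burner configuration sketch (values are assumptions).
jobs:
  - name: pkg-ins-v1
    jobIterations: 100          # rounds of object creation
    qps: 5                      # client-side request rate
    burst: 10
    namespacedIterations: true  # give each iteration its own namespace
    namespace: stress-olmv1
    waitWhenFinished: true      # block until created objects settle
    objects:
      - objectTemplate: templates/sa.yml        # ServiceAccount per round
        replicas: 1
      - objectTemplate: templates/catalogd.yml  # ClusterCatalog per round
        replicas: 1
      - objectTemplate: templates/ce.yml        # ClusterExtension per round
        replicas: 1
```

The Prow step would then drive it with an invocation along the lines of `kube-burner init -c pkg-ins-v1.yml --metrics-endpoint metrics-endpoint.yml --uuid <run-id>` (flags per the kube-burner documentation linked below; the exact command used by the job is not shown in this commit).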
## Usage Context

These tools are **NOT** used by the Golang test cases directly. Instead, they are used by:

- **Prow Job**: `openshift-e2e-test-olm-qe-stress`
- **Test Chain**: `stress-olm` (referenced in CI operator config)

The Prow job uses kube-burner with these configurations to create hundreds of OLM v1 resources concurrently and measure system behavior under load.

## Relationship to Test Cases

The Golang test case in `specs/olmv1_stress.go` (PolarionID:81509) is a **functional stress test** that validates OLM v1 can install mass operators correctly. It differs from these kube-burner tools:

- **Golang Test (81509)**: Validates functional correctness of installing many operators
- **Kube-burner Tools**: Measure performance and resource usage under extreme load

Both are important for ensuring OLM v1 can handle production workloads at scale.

## Metrics Analysis

The `ma` (Metrics Analysis) Python package provides tools to analyze metrics collected during stress tests:

- **cmd_check_ccpu.py**: Check catalogd controller CPU usage
- **containercpu.py**: Analyze container CPU metrics
- **algo.py**: Statistical analysis algorithms for metrics

## Maintenance Notes

- This directory contains **OLM v1 specific** stress tools only
- OLM v0 stress tools (`pkg-ins`) have been removed during migration
- Template files reference OLM v1 resources: ClusterCatalog, ClusterExtension
- Image prefix: `quay.io/olmqe/stress-index:vokv*` (stress test catalog images)

## Related Documentation

- [Kube-burner Documentation](https://kube-burner.readthedocs.io/)
- [OLM v1 Stress Test Case](../../specs/olmv1_stress.go)
- [CI Operator Config](https://github.com/openshift/release/tree/master/ci-operator/config/openshift/openshift-tests-private)
stress/manifests/config/pkg-ins-v1/metrics-endpoint.yml

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
- endpoint: {{.PROMETHEUS_URL}}
  token: {{.PROMETHEUS_TOKEN}}
  step: 10s
  skipTLSVerify: true
  metrics:
    - metrics-profiles/metrics-aggregated.yml
  indexer:
    type: local
    metricsDirectory: collected-metrics-{{.UUID}}
stress/manifests/config/pkg-ins-v1/metrics-profiles/metrics-aggregated.yml

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
# Containers & pod metrics

- query: (sum(irate(container_cpu_usage_seconds_total{container="manager",namespace="openshift-catalogd"}[2m]) * 100) by (container, pod)) > 0
  metricName: containerCPU-Catlogd

- query: (sum(irate(container_cpu_usage_seconds_total{container="manager",namespace="openshift-operator-controller"}[2m]) * 100) by (container, pod)) > 0
  metricName: containerCPU-OpCon
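
Reading the two queries: `irate(container_cpu_usage_seconds_total{...}[2m])` gives each `manager` container's per-second CPU consumption in cores over a 2-minute window; multiplying by 100 and summing `by (container, pod)` expresses that as a percentage of one core per pod, and the trailing `> 0` drops idle series so only active pods get indexed. The metric names distinguish catalogd (`containerCPU-Catlogd`, spelled as in the committed config) from operator-controller (`containerCPU-OpCon`).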
