This repository has been archived by the owner on Nov 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathframeworkcontroller.yaml
279 lines (267 loc) · 7.45 KB
/
frameworkcontroller.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Place this file directly under frameworkcontroller's current working directory.
# For the full config setting and usage, see ./pkg/apis/frameworkcontroller/v1/config.go
# This is the default config for frameworkcontroller, so most settings are commented out.
#kubeApiServerAddress: http://10.10.10.10:8080
#kubeConfigFilePath: ''
# The values below assume the API server's --max-mutating-requests-inflight is 500.
#kubeClientQps: 200
#kubeClientBurst: 300
#workerNumber: 500
#largeFrameworkCompression: true
#frameworkCompletedRetainSec: 2592000
#frameworkMinRetryDelaySecForTransientConflictFailed: 60
#frameworkMaxRetryDelaySecForTransientConflictFailed: 900
# Rules that classify Pod failures. Every entry in this section declares a
# `code`, a `phrase`, a `type.attributes` list and one or more `podPatterns`;
# see ./pkg/apis/frameworkcontroller/v1/config.go for the authoritative schema.
# NOTE(review): as shown here all lines start at column 0, so the nesting of
# each entry is not expressed by indentation - verify against the upstream file.
podFailureSpec:
################################################################################
# [-1199, -1000]: K8S issued failures
################################################################################
# Every reasonRegex in this section uses (?i) (case-insensitive) and is anchored
# with ^...$, so the whole reason string must match.
# Every messageRegex in this section is '(?ms).*', which matches any message
# (including empty and multi-line text), so these rules differ only by reason.
- code: -1000
phrase: PodEvicted
type:
attributes: [Transient]
podPatterns:
- reasonRegex: '(?i)^Evicted$'
messageRegex: '(?ms).*'
- code: -1001
phrase: PodNodeLost
type:
attributes: [Transient]
podPatterns:
- reasonRegex: '(?i)^NodeLost$'
messageRegex: '(?ms).*'
- code: -1002
phrase: PodScheduledToInsufficientResourceNode
type:
attributes: [Transient]
podPatterns:
# Matches any reason made of "OutOf" followed by one or more non-whitespace
# characters (for example "OutOfcpu").
- reasonRegex: '(?i)^OutOf\S+$'
messageRegex: '(?ms).*'
- code: -1003
phrase: PodPreemptedForCriticalPod
type:
attributes: [Transient]
podPatterns:
- reasonRegex: '(?i)^Preempting$'
messageRegex: '(?ms).*'
- code: -1004
phrase: PodDeadlineExceeded
type:
attributes: [Permanent]
podPatterns:
- reasonRegex: '(?i)^DeadlineExceeded$'
messageRegex: '(?ms).*'
- code: -1005
phrase: PodNodeAdmissionForbidden
type:
attributes: [Permanent]
podPatterns:
- reasonRegex: '(?i)^Forbidden$'
messageRegex: '(?ms).*'
- code: -1006
phrase: PodNodeAdmissionUnexpectedError
type:
attributes: [Transient]
podPatterns:
# Four alternative reason patterns share this code.
# NOTE(review): presumably a match on any one pattern is sufficient - confirm
# in config.go.
- reasonRegex: '(?i)^UnexpectedAdmissionError$'
messageRegex: '(?ms).*'
- reasonRegex: '(?i)^UnknownReason$'
messageRegex: '(?ms).*'
- reasonRegex: '(?i)^InvalidNodeInfo$'
messageRegex: '(?ms).*'
- reasonRegex: '(?i)^UnexpectedPredicateFailureType$'
messageRegex: '(?ms).*'
################################################################################
# [-1399, -1200]: Docker issued failures
################################################################################
# Rules in this section match on container-level fields listed under
# `containers`: a reasonRegex plus a codeRange, with nameRegex and messageRegex
# following.
# '(?ms).*' matches any text, so a nameRegex/messageRegex with that value does
# not narrow the match.
# NOTE(review): with every line at column 0 here, it cannot be told whether
# nameRegex/messageRegex belong to the container pattern or to the enclosing
# pod pattern - confirm against the upstream file.
- code: -1200
phrase: ContainerDockerOOMKilled
type:
attributes: [Permanent]
podPatterns:
- containers:
- reasonRegex: '(?i)^OOMKilled$'
# Only `min` is given; presumably the range is unbounded above - confirm in
# config.go.
codeRange: {min: 1}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
# This entry declares no `type` block, unlike most entries in this section.
# NOTE(review): the meaning of an omitted type is defined outside this file.
- code: -1201
phrase: ContainerDockerCreationError
podPatterns:
- containers:
- reasonRegex: '(?i)^ContainerCannotRun$'
# min == max pins the match to exactly code 125.
codeRange: {min: 125, max: 125}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: -1202
phrase: ContainerDockerStartCmdPermissionDenied
type:
attributes: [Permanent]
podPatterns:
- containers:
- reasonRegex: '(?i)^ContainerCannotRun$'
codeRange: {min: 126, max: 126}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: -1203
phrase: ContainerDockerStartCmdNotFound
type:
attributes: [Permanent]
podPatterns:
# Two alternative patterns: code 127 with any message, or code 128 whose
# message contains "not found", "cannot find" or "no such" (case-insensitive).
- containers:
- reasonRegex: '(?i)^ContainerCannotRun$'
codeRange: {min: 127, max: 127}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- containers:
- reasonRegex: '(?i)^ContainerCannotRun$'
codeRange: {min: 128, max: 128}
nameRegex: '(?ms).*'
messageRegex: '(?msi).*(not found|cannot find|no such).*'
# Also matches ContainerCannotRun with code 128, but with any message, so it
# overlaps the second pattern of -1203 above. This entry has no `type` block.
# NOTE(review): this presumably relies on entries being tried in file order so
# that -1203 wins for "not found"-style messages - confirm in config.go.
- code: -1204
phrase: ContainerDockerStartUnknownError
podPatterns:
- containers:
- reasonRegex: '(?i)^ContainerCannotRun$'
codeRange: {min: 128, max: 128}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
################################################################################
# [1, 255]: User Container issued failures
################################################################################
# [129, 192]: Involuntary failures caused by OS Signal
# Each rule below pins a single container code via codeRange with min == max
# and forwards it unchanged as `code`. None of them sets a reasonRegex, and
# nameRegex/messageRegex are '(?ms).*', which matches any text.
# Each code equals 128 plus the Linux signal number named in the phrase
# (for example 137 = 128 + 9 for SIGKILL, 143 = 128 + 15 for SIGTERM).
# Codes in [129, 192] that are not listed (for example 133, 138) have no rule
# in this section.
# NOTE(review): with every line at column 0 here, it cannot be told whether
# nameRegex/messageRegex belong to the container pattern or to the enclosing
# pod pattern - confirm against the upstream file.
- code: 130
phrase: ContainerSigIntReceived
type:
attributes: [Transient]
podPatterns:
- containers:
- codeRange: {min: 130, max: 130}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 131
phrase: ContainerSigQuitReceived
type:
attributes: [Transient]
podPatterns:
- containers:
- codeRange: {min: 131, max: 131}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 132
phrase: ContainerSigIllReceived
type:
attributes: [Permanent]
podPatterns:
- containers:
- codeRange: {min: 132, max: 132}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
# This entry declares no `type` block, unlike the other signal rules.
# NOTE(review): the meaning of an omitted type is defined outside this file.
- code: 134
phrase: ContainerSigAbrtReceived
podPatterns:
- containers:
- codeRange: {min: 134, max: 134}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 135
phrase: ContainerSigBusReceived
type:
attributes: [Permanent]
podPatterns:
- containers:
- codeRange: {min: 135, max: 135}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 136
phrase: ContainerSigFpeReceived
type:
attributes: [Permanent]
podPatterns:
- containers:
- codeRange: {min: 136, max: 136}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 137
phrase: ContainerSigKillReceived
type:
attributes: [Transient]
podPatterns:
- containers:
- codeRange: {min: 137, max: 137}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 139
phrase: ContainerSigSegvReceived
type:
attributes: [Permanent]
podPatterns:
- containers:
- codeRange: {min: 139, max: 139}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 141
phrase: ContainerSigPipeReceived
type:
attributes: [Permanent]
podPatterns:
- containers:
- codeRange: {min: 141, max: 141}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
- code: 143
phrase: ContainerSigTermReceived
type:
attributes: [Transient]
podPatterns:
- containers:
- codeRange: {min: 143, max: 143}
nameRegex: '(?ms).*'
messageRegex: '(?ms).*'
# [1, 255] excluding [129, 192]: Voluntary failures caused by the Container itself
# [200, 219]: Reserved Codes
# [1, 255] excluding [129, 192] and [200, 219]: Custom Codes
# Example: Forward the Container code unchanged and only add type info.
#- code: 220
# phrase: Container220Failed
# type:
# attributes: [Permanent]
# podPatterns:
# - containers:
# - codeRange: {min: 220, max: 220}
# nameRegex: '(?ms).*'
# messageRegex: '(?ms).*'
# Example: Classification only based on Container termination message.
#- code: 221
# phrase: ContainerTensorflowOOMKilled
# type:
# attributes: [Permanent]
# podPatterns:
# - containers:
# - messageRegex: '(?msi)tensorflow.*ResourceExhaustedError.*OOM.*'
# codeRange: {min: 1}
# nameRegex: '(?ms).*'
#- code: 222
# phrase: ContainerMPISegvFault
# type:
# attributes: [Permanent]
# podPatterns:
# - containers:
# - messageRegex: '(?msi)Signal code: Address not mapped.*'
# codeRange: {min: 1}
# nameRegex: '(?ms).*'
#- code: 223
# phrase: ContainerCudaUncorrectableECCError
# type:
# attributes: [Transient]
# podPatterns:
# - containers:
# - messageRegex: '(?msi)CUDA_ERROR_ECC_UNCORRECTABLE.*'
# codeRange: {min: 1}
# nameRegex: '(?ms).*'
# Example: Redirect all unknown failures to a single comparable code.
#- code: 255
# phrase: ContainerUnknownFailed
# podPatterns:
# - containers:
# - codeRange: {min: 1}
# nameRegex: '(?ms).*'
# messageRegex: '(?ms).*'