Skip to content

MCO-1675: [API 2/6] Update API for Status Reporting needs #2383

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
apiVersion: apiextensions.k8s.io/v1 # Hack because controller-gen complains if we don't have this
name: "[FeatureGate] ImageModeStatusReporting"
crdName: machineconfignodes.machineconfiguration.openshift.io
featureGates:
- ImageModeStatusReporting
tests:
onCreate:
- name: Should be able to create MachineConfigNode when spec.configImage ImageModeStatusReporting is enabled
initial: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
expected: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
- name: Should be able to create MachineConfigNode with status.configImage when ImageModeStatusReporting is enabled when the current and desired images are different
initial: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/os-images@sha256:c47856f56e1fdb7c9d10a1658e4ea85fbea44d71fb0e82898d152b47e0f894c6
status:
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
currentImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/os-images@sha256:c47856f56e1fdb7c9d10a1658e4ea85fbea44d71fb0e82898d152b47e0f894c6
expected: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/os-images@sha256:c47856f56e1fdb7c9d10a1658e4ea85fbea44d71fb0e82898d152b47e0f894c6
- name: Should be able to create MachineConfigNode with status.configImage when ImageModeStatusReporting is enabled when the current and desired images are the same
initial: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
status:
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
currentImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
expected: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
- name: Should be able to create MachineConfigNode with status.configImage when ImageModeStatusReporting is enabled without the current image provided
initial: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
status:
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/os-images@sha256:c47856f56e1fdb7c9d10a1658e4ea85fbea44d71fb0e82898d152b47e0f894c6
currentImage:
expected: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: image-registry.openshift-image-registry.svc:5000/openshift-machine-config-operator/ocb-image@sha256:7a539f562d8d5d3b6bd7338ac014e34dabc9626cf344d30e412ef98a55cc1bf6
- name: spec.configImage.desiredImage should enforce MaxLength.
initial: |
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigNode
metadata:
name: test-imagemodestatusreporting
spec:
node:
name: test-imagemodestatusreporting
pool:
name: worker
configVersion:
desired: rendered-worker-abc
configImage:
desiredImage: registry.example.com/a/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very//very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/very/j
expectedError: "the OCI Image reference must end with a valid '@sha256:<digest>' suffix, where '<digest>' is 64 characters long"
60 changes: 60 additions & 0 deletions machineconfiguration/v1/types_machineconfignode.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ type MachineConfigNodeSpec struct {
// the new machine config against the current machine config.
// +required
ConfigVersion MachineConfigNodeSpecMachineConfigVersion `json:"configVersion"`

// configImage holds the desired image for the node targeted by this machine config node resource.
// The desired image represents the image the node will attempt to update to and gets set before the machine config operator validates
// the new image against the current image. This field will be used only when OCL is enabled. This will be empty/omitted otherwise.
// +openshift:enable:FeatureGate=ImageModeStatusReporting
// +optional
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is expected behavior if this value is not provided?

Lets explicitly mention in the GoDoc that this field is optional and what it means when it is omitted/empty.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comments have been updated to describe the expected behavior when the field is empty. Please provide any additional suggestions/changes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The link shared is to comments on a totally different type. My original comment still stands

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about the confusion, it should be updated now as seen here.

ConfigImage MachineConfigNodeSpecConfigImage `json:"configImage"`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the description above, should this also be omitEmpty (and a pointer as a result) to match the status?

}

// MachineConfigNodeStatus holds the reported information on a particular machine config node.
Expand All @@ -106,6 +113,8 @@ type MachineConfigNodeStatus struct {
// UpdatePrepared, UpdateExecuted, UpdatePostActionComplete, UpdateComplete, Updated, Resumed,
// Drained, AppliedFilesAndOS, Cordoned, Uncordoned, RebootedNode, NodeDegraded, PinnedImageSetsProgressing,
// and PinnedImageSetsDegraded.
// The following types are only available when the ImageModeStatusReporting feature gate is enabled: ImagePulledFromRegistry,
// AppliedOSImage, AppliedFiles
// +listType=map
// +listMapKey=type
// +kubebuilder:validation:MaxItems=20
Expand All @@ -120,6 +129,10 @@ type MachineConfigNodeStatus struct {
// configVersion describes the current and desired machine config version for this node.
// +optional
ConfigVersion *MachineConfigNodeStatusMachineConfigVersion `json:"configVersion,omitempty"`
// configImage describes the current and desired image for this node. OCL must be enabled for this to be populated. It will be omitted/empty otherwise.
// +openshift:enable:FeatureGate=ImageModeStatusReporting
// +optional
ConfigImage *MachineConfigNodeStatusConfigImage `json:"configImage,omitempty"`
// pinnedImageSets describes the current and desired pinned image sets for this node.
// +listType=map
// +listMapKey=name
Expand Down Expand Up @@ -209,6 +222,47 @@ type MachineConfigNodeSpecMachineConfigVersion struct {
Desired string `json:"desired"`
}

// MachineConfigNodeSpecConfigImage holds the desired image for the node.
// This structure is populated from the `machineconfiguration.openshift.io/desiredImage`
// annotation on the target node, which is set by the Machine Config Pool controller
// to signal the desired image pullspec for the node to update to.
type MachineConfigNodeSpecConfigImage struct {
// desiredImage is the fully qualified image pull spec of the image that the Machine
// Config Operator (MCO) intends to apply to the node. This field is optional. When
// OCL is not enabled, this field will be omitted/empty.
// The format of the push spec is: host[:port][/namespace]/name@sha256:<digest>,
// where the digest must be 64 characters long, and consist only of lowercase hexadecimal characters, a-f and 0-9.
// The length of the whole spec must be between 1 to 447 characters.
// +optional
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this field be required? Making this field optional means the parent field that is already optional has no required members.

That leads to being able to configure the field to do the same thing by:

  • omitting the parent field
  • setting the parent field to {}
  • setting it like:
configImage:
  desiredImage: ""

We generally try to only have a single way to "say" something for users so there isn't confusion on which approach should be used.

We generally try to avoid allowing fields to be set to {}.

To narrow it down further as to which approach is preferred, what does it mean for the parent field configImage to be omitted? What does it mean to set configImage.desiredImage to ""?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This validation rule should take care of this issue. When the desiredImage is empty, the idea is that OCL is not enabled. If it is enabled, either the currentImage or the desiredImage needs to be set. I can get an OCL expert to confirm if this would be the optimal approach or if there should be a default message printed as a means to provide clarity to users.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The linked validation doesn't apply to the question I'm asking here. This additional context I think helps me understand the goal here.

So, I think configImage needs to be a pointer as the zero value would not be valid. Because it is a pointer you can do a nil check to detect whether or not configImage has been explicitly configured or not.

Mark desiredImage as required. The existing CEL validation you've written does reject the empty string, but lets explicitly exclude the empty string by adding a +kubebuilder:validation:MinLength:=1 marker.

This approach ensures the following behavior is the only valid behavior:

  • If a user does not want the OCL functionality enabled, they just do not set the configImage setting.
  • If a user wants to enable the OCL functionality, they set the configImage setting, which means they must specify a valid desiredImage.

Now I'm moving to a separate thing - trying to understand what is and is not a valid value to supply here.
Assuming only the previously linked requirements is what is required, that means that values must:

  • start with a valid domain (registry.example.io or name.namespace.svc)
  • optionally provide a port (:8080)
  • optionally provide a "namespace" (/openshift)
  • provide a "name" (/machine-config-operator)
  • provide a SHA256 digest (@sha256:asjdyhfrea7YSDajsdfhals)

Can we write multiple CEL expressions that ensures we are appropriately evaluating these requirements in a way that we can return helpful messages to end users?

For example, when I provide something like registry.example.io:8080/openshift/machine-config-operator:latest, getting the message the OCI Image name should follow the host[:port][/namespace]/name format, resembling a valid URL without the scheme. Or it must be a valid .svc followed by a port, repository, image name, and tag. doesn't tell me what I've done wrong (and I would actually be quite confused because the value I provided actually is valid based on this message). Instead I would expect it to fail with a message like the provided OCI image reference must use a SHA256 digest.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

desiredImage cannot be made required in order to support the edge cases mentioned here. Instead, it is made optional and after confirming with an OCL expert (@umohnani8), it is decided that the desiredImage and currentImage will be of ImageDigestFormat type instead of string. This should fix the sha256 validation requirements.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a user does not want the OCL functionality enabled, they just do not set the configImage setting.
If a user wants to enable the OCL functionality, they set the configImage setting, which means they must specify a valid desiredImage.

As of right now, I'm not aware of any plans to have setting the configImage value trigger the enabling or disabling of OCL--that would still be through creating and applying a MachineOSConfig. The primary goal of the MCN resource as it stands for 4.20 is for reporting status to users.

I understand the idea for future proofing if we want to allow for that architecture shift in the future, but, even then, allowing the desiredImage to be optional seems important for the case of disabling OCL. The steps for disabling (as I understand) are

  1. Delete MachineOSConfig
  2. desiredImage node annotation becomes empty
    ...node updates...
  3. currentImage node annotation becomes empty & image mode is no longer enabled

My view is between steps 2 & 3, while the node is updating, OCL is still technically enabled since the node has a currentImage annotation even though it's not pointing to a desired image. In that case, having an empty desiredImage in the configImage spec would tell the user "you're using image mode, but it's in the process of being disabled." If we remove the config image reference in the spec, users then have to look into the MCN status to see that OCL is enabled, which could be fine, but is a decision I think is worth discussing.

@cheesesashimi @umohnani8 or @yuqi-zhang do any of you have thoughts on the ideas being discussed in this thread?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The primary goal of the MCN resource as it stands for 4.20 is for reporting status to users.

Why are we adding a spec field for this if the goal is for it to just communicate status? Spec fields indicate that it is a configurable field.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be helpful for me as a reviewer if you could answer the following questions for me:

  • Who/what creates a MachineConfigNode object?
  • What does the existence of a MachineConfigNode object mean?
  • Who/what reacts to a MachineConfigNode object?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we adding a spec field for this if the goal is for it to just communicate status? Spec fields indicate that it is a configurable field.

Adding ConfigImage to Spec was a decision made based on being consistent with the ConfigVersion also being in Spec. When this CRD was first ideated & introduced in 4.15, there was conversation on including the config version in Spec (here) as a way to show the node was targeting a new desired config version, but that config version had not yet been validated or accepted by the MCD.

If we should not include the ConfigImage in Spec, that's fine and we can remove it. I was not really aware that everything in Spec should be configurable, as I don't foresee the Name and Pool values in Spec being configurable by the user in the future.

Who/what creates a MachineConfigNode object?

The MachineConfigDaemon owns/manages the resource.

What does the existence of a MachineConfigNode object mean?

There is one MachineConfigNode resource per node, so it existing means a corresponding node exists. The 1-1 relationship is meant to allow users to check the detailed status of a node.

Who/what reacts to a MachineConfigNode object?

The MCO populates all information in the MachineConfigNode objects & users can consume the information how they wish by getting & describing the MCN for the node they are hoping to see more information about.

Since it is relatively new, there is nothing reacting to the MCN object yet. There have informal talks about the idea of using the MCN to trigger node updates instead of node annotations in the future, so that's a future proofing idea to have in mind, but beyond just ideas, I'm only aware of users consuming information from the MCN.

This is the enhancement for the MCN as it stands in 4.19 and may give more information on initial intentions. The WIP 4.20 additions are proposed in this working PR (which will be updated based on any conversations in this PR).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's quite a bit of discussion in the thread already, but I think to add some additional context, one of the main design goal of MCN is to reflect and provide additional context on the existing MCO-managed node-object fields (annotations), which is used primarily both as status as well as a way for the controller and daemon to communicate the current and desired states. We did not want to shove additional fields into the node object, so MCN would be the place for future fields that facilitate that (right now, we have PinnedImageSets dependent on MCN)

In that pattern, for the MCN, the consumption model is that the "spec" is set and managed by the controller, and the "status" should be used by the daemon, (although @isabella-janssen you would need to correct me if I'm wrong there) Long term I think that's the pattern we'd want. The spec, much like node object fields today, is not something we'd like the user to directly set, but rather be set by the controller.

In that vein I think it's fine to match the MachineConfig setup we have today to keep that consistentcy. If we'd like to reconsider that approach, I think the discussion point would be whether MCN should be status only, and consider some other approach to do controller-daemon interactions.

So to the last question of "Who/what reacts to a MachineConfigNode object?" I'd say it's the controller and the daemon reacting to each others updates to it.

DesiredImage ImageDigestFormat `json:"desiredImage"`
}

// MachineConfigNodeStatusConfigImage holds the observed state of the image
// on the node, including both the image targeted for an update and the image
// currently applied. This allows for monitoring the progress of the layering
// rollout. If OCL is enabled, desiredImage must be defined.
// +kubebuilder:validation:MinProperties:=1
type MachineConfigNodeStatusConfigImage struct {
// currentImage is the fully qualified image pull spec of the image that is
// currently applied to the node.
// This field is optional because when image-mode is first enabled on a
// node, there is no currentImage because the node has not yet applied
// the updated image. Only after the updated image is applied will the
// currentImage be populated.
// The format of the push spec is: host[:port][/namespace]/name@sha256:<digest>,
// where the digest must be 64 characters long, and consist only of lowercase hexadecimal characters, a-f and 0-9.
// The length of the whole spec must be between 1 to 447 characters.
// +optional
CurrentImage ImageDigestFormat `json:"currentImage"`
// desiredImage is a mirror of the desired image from the Spec. When the
// current and desired image are not equal, the node is in an updating phase. This field is required.
// The format of the push spec is: host[:port][/namespace]/name@sha256:<digest>,
// where the digest must be 64 characters long, and consist only of lowercase hexadecimal characters, a-f and 0-9.
// The length of the whole spec must be between 1 to 447 characters.
// +optional
DesiredImage ImageDigestFormat `json:"desiredImage"`
}

// StateProgress is each possible state for each possible MachineConfigNodeType
// +enum
type StateProgress string
Expand All @@ -228,8 +282,14 @@ const (
MachineConfigNodeResumed StateProgress = "Resumed"
// MachineConfigNodeUpdateDrained describes the part of the in progress phase where the node drains
MachineConfigNodeUpdateDrained StateProgress = "Drained"
// MachineConfigNodeUpdateFiles describes the part of the in progress phase where the nodes files changes
MachineConfigNodeUpdateFiles StateProgress = "AppliedFiles"
// MachineConfigNodeUpdateOS describes the part of the in progress phase where the OS config changes
MachineConfigNodeUpdateOS StateProgress = "AppliedOSImage"
// MachineConfigNodeUpdateFilesAndOS describes the part of the in progress phase where the nodes files and OS config change
MachineConfigNodeUpdateFilesAndOS StateProgress = "AppliedFilesAndOS"
// MachineConfigNodeImagePulledFromRegistry describes the part of the in progress phase where the update image is pulled from the registry
MachineConfigNodeImagePulledFromRegistry StateProgress = "ImagePulledFromRegistry"
// MachineConfigNodeUpdateCordoned describes the part of the in progress phase where the node cordons
MachineConfigNodeUpdateCordoned StateProgress = "Cordoned"
// MachineConfigNodeUpdateUncordoned describes the part of the completing phase where the node uncordons
Expand Down
Loading