From ad944d666357c9afa6fd80164b097d47455624f9 Mon Sep 17 00:00:00 2001 From: Andrey Kolkov Date: Fri, 3 Jul 2026 10:15:05 +0400 Subject: [PATCH] feat(main): add observed versions of EtcdMembers Observe the etcd version each member is actually running (via the Maintenance Status RPC against the member's own endpoint) into EtcdMember.status.version, surfaced as the "Running" print column. Add an informational VersionDrifted condition comparing the observed running version against the intended spec.version; it is left unset when intent is not yet known. Observation is best-effort and never gates readiness. Extract the operator-side dial config out of discoverMemberID into a shared etcdDialConfig helper. Implements #338. Signed-off-by: Andrey Kolkov --- api/v1alpha2/etcdmember_types.go | 19 ++ ...tcd-operator.cozystack.io_etcdmembers.yaml | 15 ++ controllers/etcd_client.go | 7 + controllers/etcdmember_controller.go | 129 ++++++++++--- controllers/etcdmember_controller_test.go | 175 ++++++++++++++++++ controllers/testing_helpers_test.go | 15 ++ docs/concepts.md | 6 + 7 files changed, 344 insertions(+), 22 deletions(-) diff --git a/api/v1alpha2/etcdmember_types.go b/api/v1alpha2/etcdmember_types.go index 7c50fbd3..89391cb9 100644 --- a/api/v1alpha2/etcdmember_types.go +++ b/api/v1alpha2/etcdmember_types.go @@ -63,6 +63,12 @@ const ( MemberJoined = "Joined" // MemberReady indicates the member is healthy and serving requests. MemberReady = "Ready" + // MemberVersionDrifted is True when the version etcd actually reports + // running (status.version, observed from the endpoint) does not match the + // version the operator asked this member to run (spec.version). It makes + // intent-vs-reality version drift detectable rather than assumed; the + // operator does not act on it (spec.version still drives the image tag). + MemberVersionDrifted = "VersionDrifted" ) // EtcdMemberSpec defines the desired state of a single etcd member. @@ -223,6 +229,18 @@ type EtcdMemberStatus struct { // +optional PVCName string `json:"pvcName,omitempty"` + // Version is the etcd server version this member is actually running, + // observed at runtime from the member's own etcd endpoint via the + // Maintenance Status API (StatusResponse.Version) — i.e. what etcd + // reports, as opposed to spec.version, which is the intended/target + // version the operator asks for (and pins the image tag to). Empty until + // the member's Pod is Ready and the operator has successfully queried it. + // This observed value is the source of truth for detecting version drift + // between intent and reality (see the VersionDrifted condition); the + // observation is best-effort and never gates readiness. + // +optional + Version string `json:"version,omitempty"` + // Conditions represent the latest available observations of the member's state. // +optional Conditions []metav1.Condition `json:"conditions,omitempty"` @@ -233,6 +251,7 @@ type EtcdMemberStatus struct { // +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector // +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=`.spec.clusterName` // +kubebuilder:printcolumn:name="Version",type=string,JSONPath=`.spec.version` +// +kubebuilder:printcolumn:name="Running",type=string,JSONPath=`.status.version` // +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` // +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` diff --git a/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml b/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml index 36c945ab..cfa36427 100644 --- a/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml +++ b/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml @@ -21,6 +21,9 @@ spec: - jsonPath: .spec.version name: Version type: string + - jsonPath: .status.version + name: Running + type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status name: Ready type: string @@ -1683,6 +1686,18 @@ spec: Selector exposes the label-selector that matches this member's Pod via /scale (consumed by the PDB controller; not user-facing). type: string + version: + description: |- + Version is the etcd server version this member is actually running, + observed at runtime from the member's own etcd endpoint via the + Maintenance Status API (StatusResponse.Version) — i.e. what etcd + reports, as opposed to spec.version, which is the intended/target + version the operator asks for (and pins the image tag to). Empty until + the member's Pod is Ready and the operator has successfully queried it. + This observed value is the source of truth for detecting version drift + between intent and reality (see the VersionDrifted condition); the + observation is best-effort and never gates readiness. + type: string type: object type: object served: true diff --git a/controllers/etcd_client.go b/controllers/etcd_client.go index 15a58013..e00d8fc0 100644 --- a/controllers/etcd_client.go +++ b/controllers/etcd_client.go @@ -35,6 +35,13 @@ type EtcdClusterClient interface { MemberPromote(ctx context.Context, id uint64) (*clientv3.MemberPromoteResponse, error) MemberRemove(ctx context.Context, id uint64) (*clientv3.MemberRemoveResponse, error) + // Status returns a single endpoint's server status, including the etcd + // version it is actually running (StatusResponse.Version). Used by the + // member controller to observe the running version into + // EtcdMember.status.version. *clientv3.Client satisfies this via its + // embedded Maintenance interface. + Status(ctx context.Context, endpoint string) (*clientv3.StatusResponse, error) + // Auth surface — used by reconcileAuth to provision the single root // user/role and turn on authentication. The "root" role is built into // etcd, so a RoleAdd is not needed: UserAdd("root", …) + diff --git a/controllers/etcdmember_controller.go b/controllers/etcdmember_controller.go index 6af23952..c87c69ab 100644 --- a/controllers/etcdmember_controller.go +++ b/controllers/etcdmember_controller.go @@ -1040,6 +1040,10 @@ func (r *EtcdMemberReconciler) updateStatus(ctx context.Context, member *lll.Etc } } + // ready tracks whether this reconcile settled on MemberReady=True; only + // then do we bother observing the running etcd version (below). + ready := false + switch { case !podReady: // Self-heal an unrecoverable member. A non-bootstrap PVC member whose @@ -1073,6 +1077,7 @@ func (r *EtcdMemberReconciler) updateStatus(ctx context.Context, member *lll.Etc if id, err := r.discoverMemberID(ctx, member); err == nil { member.Status.MemberID = fmt.Sprintf("%016x", id) changed = true + ready = true if setMemberCondition(member, lll.MemberReady, metav1.ConditionTrue, "PodReady", "etcd member is ready") { changed = true @@ -1084,12 +1089,46 @@ func (r *EtcdMemberReconciler) updateStatus(ctx context.Context, member *lll.Etc } } default: + ready = true if setMemberCondition(member, lll.MemberReady, metav1.ConditionTrue, "PodReady", "etcd member is ready") { changed = true } } + // Observe the running etcd version once the member is Ready and record it + // in status, plus surface a VersionDrifted condition when what etcd + // actually runs diverges from the intended spec.version. Observation is + // best-effort (observeVersion never errors); the drift condition is + // informational — the operator does not act on it. + if ready { + if v := r.observeVersion(ctx, member); v != member.Status.Version { + member.Status.Version = v + changed = true + } + // Only evaluate drift once we know BOTH sides: the observed running + // version and a non-empty intended spec.version. spec.version is + // propagated from cluster.Status.Observed.Version for scale-up members, + // which is transiently "" before the first latch — treat unknown intent + // as "no drift" rather than flagging a spurious mismatch. + if member.Status.Version != "" && member.Spec.Version != "" { + condStatus := metav1.ConditionFalse + reason := "VersionMatched" + msg := fmt.Sprintf("running etcd version %q matches intended spec.version", member.Status.Version) + if member.Status.Version != member.Spec.Version { + condStatus = metav1.ConditionTrue + reason = "VersionMismatch" + msg = fmt.Sprintf("running etcd version %q does not match intended spec.version %q", + member.Status.Version, member.Spec.Version) + log.Info("etcd member version drift detected", + "running", member.Status.Version, "intended", member.Spec.Version) + } + if setMemberCondition(member, lll.MemberVersionDrifted, condStatus, reason, msg) { + changed = true + } + } + } + if changed { if err := r.Status().Update(ctx, member); err != nil { return ctrl.Result{}, err @@ -1162,28 +1201,9 @@ func (r *EtcdMemberReconciler) discoverMemberID(ctx context.Context, member *lll endpoints = append(endpoints, clientURL(scheme, member.Name, memberServiceName(member), member.Namespace)) } - // Build the operator-side dial config. Only TLS clusters need the parent - // EtcdCluster fetched: auth requires client TLS (CEL-enforced), which is - // propagated to member.Spec.TLS.ClientServerSecretRef, so a member with - // no client TLS can never have auth enabled — its dial is plaintext and - // anonymous, exactly as before. When TLS is set we fetch the cluster once - // and derive both the TLS config and the credentials from it (after auth - // is enabled the voter peers we dial here reject anonymous access). - var tlsCfg *tls.Config - var user, pass string - if member.Spec.TLS != nil && member.Spec.TLS.ClientServerSecretRef != nil { - cluster, err := r.clusterFor(ctx, member) - if err != nil { - return 0, err - } - tlsCfg, err = buildOperatorTLSConfig(ctx, r.Client, cluster) - if err != nil { - return 0, err - } - user, pass, _, err = resolveEtcdCredentials(ctx, r.Client, cluster) - if err != nil { - return 0, err - } + tlsCfg, user, pass, err := r.etcdDialConfig(ctx, member) + if err != nil { + return 0, err } c, err := r.EtcdClientFactory(ctx, endpoints, tlsCfg, user, pass) if err != nil { @@ -1214,6 +1234,71 @@ func (r *EtcdMemberReconciler) discoverMemberID(ctx context.Context, member *lll return 0, fmt.Errorf("member %q not found in etcd member list", member.Name) } +// etcdDialConfig builds the operator-side dial config for reaching this +// member's etcd cluster: the TLS config and the auth credentials. Only TLS +// clusters need the parent EtcdCluster fetched: auth requires client TLS +// (CEL-enforced), which is propagated to member.Spec.TLS.ClientServerSecretRef, +// so a member with no client TLS can never have auth enabled — its dial is +// plaintext and anonymous. When TLS is set we fetch the cluster once and +// derive both the TLS config and the credentials from it (after auth is +// enabled the peers we dial reject anonymous access). Returns (nil, "", "") +// for plaintext, anonymous clusters. +func (r *EtcdMemberReconciler) etcdDialConfig(ctx context.Context, member *lll.EtcdMember) (*tls.Config, string, string, error) { + if member.Spec.TLS == nil || member.Spec.TLS.ClientServerSecretRef == nil { + return nil, "", "", nil + } + cluster, err := r.clusterFor(ctx, member) + if err != nil { + return nil, "", "", err + } + tlsCfg, err := buildOperatorTLSConfig(ctx, r.Client, cluster) + if err != nil { + return nil, "", "", err + } + user, pass, _, err := resolveEtcdCredentials(ctx, r.Client, cluster) + if err != nil { + return nil, "", "", err + } + return tlsCfg, user, pass, nil +} + +// observeVersion reads the etcd version this member is actually running from +// its own etcd endpoint (the Maintenance Status API, StatusResponse.Version) +// and returns it. Best-effort: on any dial/RPC failure it logs and returns the +// member's previous status.Version unchanged, so version observation never +// fails the reconcile or affects readiness — Ready stays driven purely by Pod +// readiness and MemberID discovery. The dial targets this member's own client +// URL (Status is per-endpoint, so it reports this member's version, not a +// peer's). +func (r *EtcdMemberReconciler) observeVersion(ctx context.Context, member *lll.EtcdMember) string { + log := log.FromContext(ctx) + prev := member.Status.Version + + self := clientURL(memberClientScheme(member), member.Name, memberServiceName(member), member.Namespace) + + tlsCfg, user, pass, err := r.etcdDialConfig(ctx, member) + if err != nil { + log.V(1).Info("skipping version observation: cannot build dial config", "error", err) + return prev + } + c, err := r.EtcdClientFactory(ctx, []string{self}, tlsCfg, user, pass) + if err != nil { + log.V(1).Info("skipping version observation: cannot dial etcd", "error", err) + return prev + } + defer c.Close() + + statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + resp, err := c.Status(statusCtx, self) + if err != nil { + log.V(1).Info("skipping version observation: Status RPC failed", "error", err) + return prev + } + return resp.Version +} + // ── Manager wiring ─────────────────────────────────────────────────────── func (r *EtcdMemberReconciler) SetupWithManager(mgr ctrl.Manager) error { diff --git a/controllers/etcdmember_controller_test.go b/controllers/etcdmember_controller_test.go index 8fc2ab8a..56096f8a 100644 --- a/controllers/etcdmember_controller_test.go +++ b/controllers/etcdmember_controller_test.go @@ -692,6 +692,181 @@ func TestUpdateStatus_PopulatesMemberIDAndFlipsReady(t *testing.T) { } } +// findMemberCondition returns the condition of the given type, or nil. +func findMemberCondition(member *lll.EtcdMember, condType string) *metav1.Condition { + for i := range member.Status.Conditions { + if member.Status.Conditions[i].Type == condType { + return &member.Status.Conditions[i] + } + } + return nil +} + +// readyMemberWithFake builds a Ready member (Pod ready, etcd knows it by name) +// plus a fakeEtcd wired to report statusVersion, and runs updateStatus. It +// drives the Ready path so observeVersion runs. +func readyMemberWithFake(t *testing.T, specVersion, statusVersion string, statusErr error) *lll.EtcdMember { + t.Helper() + ctx := context.Background() + member := &lll.EtcdMember{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns", Labels: memberLabels("test", "test-0")}, + Spec: lll.EtcdMemberSpec{ClusterName: "test", Version: specVersion, Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")}, InitialCluster: "x", ClusterToken: "test"}, + } + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{readyPodCondition()}, + }, + } + c, _ := newTestClient(t, member, pod) + fe := newFakeEtcd(0xdeadbeef, + &etcdserverpb.Member{ID: 0xae36f238164a08ad, Name: "test-0", PeerURLs: []string{peerURL("http", "test-0", "test", "ns")}}, + ) + fe.statusVersion = statusVersion + fe.statusErr = statusErr + r := &EtcdMemberReconciler{Client: c, Scheme: testScheme(t), EtcdClientFactory: factoryReturning(fe)} + if _, err := r.updateStatus(ctx, member); err != nil { + t.Fatalf("updateStatus: %v", err) + } + return mustGet(t, c, "test-0", "ns", member) +} + +// TestUpdateStatus_ObservesRunningVersion: once the member is Ready, the +// controller records the version etcd actually reports into status.version and, +// when it equals spec.version, marks VersionDrifted=False. +func TestUpdateStatus_ObservesRunningVersion(t *testing.T) { + member := readyMemberWithFake(t, "3.5.17", "3.5.17", nil) + + if member.Status.Version != "3.5.17" { + t.Fatalf("status.Version = %q, want 3.5.17", member.Status.Version) + } + drift := findMemberCondition(member, lll.MemberVersionDrifted) + if drift == nil || drift.Status != metav1.ConditionFalse { + t.Fatalf("VersionDrifted = %+v, want False", drift) + } + if drift.Reason != "VersionMatched" { + t.Fatalf("VersionDrifted reason = %q, want VersionMatched", drift.Reason) + } +} + +// TestUpdateStatus_VersionDriftDetected: when etcd reports a version different +// from the intended spec.version, status.version reflects reality and +// VersionDrifted flips True (the signal the operator surfaces but does not act +// on). +func TestUpdateStatus_VersionDriftDetected(t *testing.T) { + member := readyMemberWithFake(t, "3.5.17", "3.6.4", nil) + + if member.Status.Version != "3.6.4" { + t.Fatalf("status.Version = %q, want observed 3.6.4", member.Status.Version) + } + drift := findMemberCondition(member, lll.MemberVersionDrifted) + if drift == nil || drift.Status != metav1.ConditionTrue { + t.Fatalf("VersionDrifted = %+v, want True", drift) + } + if drift.Reason != "VersionMismatch" { + t.Fatalf("VersionDrifted reason = %q, want VersionMismatch", drift.Reason) + } + // Readiness must be unaffected by drift. + if ready := findMemberCondition(member, lll.MemberReady); ready == nil || ready.Status != metav1.ConditionTrue { + t.Fatalf("Ready condition = %+v, want True", ready) + } +} + +// TestUpdateStatus_NoDriftWhenIntentUnknown: when spec.version is empty (a +// scale-up member stamped from a transiently-unlatched Observed.Version), the +// observed running version is still recorded, but VersionDrifted must NOT be +// set — unknown intent is treated as "no drift", not a spurious mismatch. +func TestUpdateStatus_NoDriftWhenIntentUnknown(t *testing.T) { + member := readyMemberWithFake(t, "", "3.6.4", nil) + + if member.Status.Version != "3.6.4" { + t.Fatalf("status.Version = %q, want observed 3.6.4", member.Status.Version) + } + if drift := findMemberCondition(member, lll.MemberVersionDrifted); drift != nil { + t.Fatalf("VersionDrifted = %+v, want unset when spec.version is empty", drift) + } +} + +// TestUpdateStatus_VersionDriftResolves: a member that was drifted +// (VersionDrifted=True) and then comes up on the intended version must flip the +// condition back to False/VersionMatched and record the new observed version. +func TestUpdateStatus_VersionDriftResolves(t *testing.T) { + ctx := context.Background() + member := &lll.EtcdMember{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns", Labels: memberLabels("test", "test-0")}, + // Steady-state Ready (MemberID set) with a prior drifted observation. + Spec: lll.EtcdMemberSpec{ClusterName: "test", Version: "3.6.4", Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")}, InitialCluster: "x", ClusterToken: "test"}, + Status: lll.EtcdMemberStatus{ + MemberID: "ae36f238164a08ad", + Version: "3.5.17", + Conditions: []metav1.Condition{{ + Type: lll.MemberVersionDrifted, Status: metav1.ConditionTrue, Reason: "VersionMismatch", + LastTransitionTime: metav1.Now(), Message: "was drifted", + }}, + }, + } + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{readyPodCondition()}, + }, + } + c, _ := newTestClient(t, member, pod) + fe := newFakeEtcd(0xdeadbeef) + fe.statusVersion = "3.6.4" // member has caught up to intent + r := &EtcdMemberReconciler{Client: c, Scheme: testScheme(t), EtcdClientFactory: factoryReturning(fe)} + if _, err := r.updateStatus(ctx, member); err != nil { + t.Fatalf("updateStatus: %v", err) + } + member = mustGet(t, c, "test-0", "ns", member) + + if member.Status.Version != "3.6.4" { + t.Fatalf("status.Version = %q, want 3.6.4", member.Status.Version) + } + drift := findMemberCondition(member, lll.MemberVersionDrifted) + if drift == nil || drift.Status != metav1.ConditionFalse || drift.Reason != "VersionMatched" { + t.Fatalf("VersionDrifted = %+v, want False/VersionMatched after catch-up", drift) + } +} + +// TestUpdateStatus_VersionObservationErrorIsNonFatal: a failing Status RPC must +// leave the previously observed version intact and must not disturb readiness — +// version observation is best-effort. +func TestUpdateStatus_VersionObservationErrorIsNonFatal(t *testing.T) { + ctx := context.Background() + member := &lll.EtcdMember{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns", Labels: memberLabels("test", "test-0")}, + // MemberID pre-set → steady-state Ready path (default case), so + // observeVersion runs even though discovery is skipped. + Spec: lll.EtcdMemberSpec{ClusterName: "test", Version: "3.5.17", Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")}, InitialCluster: "x", ClusterToken: "test"}, + Status: lll.EtcdMemberStatus{MemberID: "ae36f238164a08ad", Version: "3.5.17"}, + } + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "test-0", Namespace: "ns"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{readyPodCondition()}, + }, + } + c, _ := newTestClient(t, member, pod) + fe := newFakeEtcd(0xdeadbeef) + fe.statusErr = errors.New("dial timeout") + r := &EtcdMemberReconciler{Client: c, Scheme: testScheme(t), EtcdClientFactory: factoryReturning(fe)} + if _, err := r.updateStatus(ctx, member); err != nil { + t.Fatalf("updateStatus: %v", err) + } + member = mustGet(t, c, "test-0", "ns", member) + + if member.Status.Version != "3.5.17" { + t.Fatalf("status.Version = %q, want prior value 3.5.17 preserved on Status error", member.Status.Version) + } + if ready := findMemberCondition(member, lll.MemberReady); ready == nil || ready.Status != metav1.ConditionTrue { + t.Fatalf("Ready condition = %+v, want True despite Status error", ready) + } +} + // TestEtcdContainerStuck pins the self-heal detection: an etcd container is // "stuck" only when it is not ready, has restarted at least the threshold, and // was not OOMKilled — and never while the Pod is itself being deleted. diff --git a/controllers/testing_helpers_test.go b/controllers/testing_helpers_test.go index f3800f15..5c63020a 100644 --- a/controllers/testing_helpers_test.go +++ b/controllers/testing_helpers_test.go @@ -43,6 +43,11 @@ type fakeEtcd struct { removeErr error promoteErr error + // statusVersion is the etcd server version Status reports (observed + // running version); statusErr, when set, fails the Status RPC. + statusVersion string + statusErr error + addCalls []string addLearnerCalls []string promoteCalls []uint64 @@ -176,6 +181,16 @@ func (f *fakeEtcd) UserGrantRole(_ context.Context, user, role string) (*clientv return &clientv3.AuthUserGrantRoleResponse{Header: &etcdserverpb.ResponseHeader{ClusterId: f.clusterID}}, nil } +func (f *fakeEtcd) Status(_ context.Context, _ string) (*clientv3.StatusResponse, error) { + if f.statusErr != nil { + return nil, f.statusErr + } + return &clientv3.StatusResponse{ + Header: &etcdserverpb.ResponseHeader{ClusterId: f.clusterID}, + Version: f.statusVersion, + }, nil +} + func (f *fakeEtcd) Close() error { f.closed = true; return nil } func factoryReturning(c EtcdClusterClient) EtcdClientFactory { diff --git a/docs/concepts.md b/docs/concepts.md index a0daf24c..a39c983e 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -434,6 +434,12 @@ The cluster surfaces three conditions: `Available`, `Progressing`, `Degraded`. T All conditions carry `observedGeneration` so consumers can tell whether a condition reflects the latest spec. Status writes are gated on "did anything actually change" — the operator does not bump `resourceVersion` every 30 s just because of the periodic reconcile. +### Observed member version + +`spec.version` is *intent* — it pins the image tag (`v`) and drives the restore version-compat gate. What etcd is **actually running** is a separate, observed fact. Once a member's Pod is Ready, the member controller reads the running version from that member's own etcd endpoint (the Maintenance `Status` RPC) and records it in `EtcdMember.status.version` (surfaced as the `Running` print column). The read is best-effort: a dial or RPC failure leaves the previous value in place and never affects `Ready` — readiness stays driven by Pod readiness and member-ID discovery alone. + +When the observed version diverges from the member's intended `spec.version`, the member surfaces `VersionDrifted=True/VersionMismatch`; when they agree it is `False/VersionMatched`; when intent is not yet known (`spec.version` empty) the condition is left unset. This condition is **informational only** — the operator does not act on it (it never keys reconciliation off the observed value). It exists so intent-vs-reality drift is *detectable rather than assumed*, which is the prerequisite for safely reconsidering a per-cluster image/version override. + ## Snapshots & restore Two surfaces, one agent. The operator image doubles as a snapshot agent: `main.go` dispatches on `os.Args[1]` so `manager snapshot-agent` / `manager restore-agent` run the agent and exit, while a bare `manager` runs the controller. This keeps one binary and one image — no separate agent build — and means the agent always matches the operator it ships with.