6 changes: 3 additions & 3 deletions Makefile
@@ -161,7 +161,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
HELM_SETTINGS ?=
.PHONY: $(MANIFESTS)
$(MANIFESTS): $(HELM)
@@ -489,8 +489,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
CATD_NAMESPACE := olmv1-system
.PHONY: wait
wait:
-kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

.PHONY: docker-build
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
4 changes: 2 additions & 2 deletions hack/test/pre-upgrade-setup.sh
@@ -155,5 +155,5 @@ spec:
version: 1.0.0
EOF

-kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
-kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
33 changes: 33 additions & 0 deletions helm/high-availability.yaml
@@ -0,0 +1,33 @@
# High Availability (HA) configuration for OLMv1
# Sets replicas to 2 for both operator-controller and catalogd to enable an HA setup
# This is used in experimental-e2e.yaml to test multi-replica deployments
#
# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
# - In multi-node clusters: replicas are scheduled on different nodes for better availability
# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
options:
operatorController:
deployment:
replicas: 2
catalogd:
deployment:
replicas: 2

# Pod anti-affinity configuration to prefer spreading replicas across different nodes
# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
deployments:
templateSpec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: control-plane
operator: In
values:
- operator-controller-controller-manager
- catalogd-controller-manager
topologyKey: kubernetes.io/hostname
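
For illustration only, not part of this change: with two replicas per component, an e2e check along the lines below could assert that both copies of a controller are Running. The olmv1-system namespace and the control-plane label values are taken from the manifests in this PR; the helper name and its wiring into the test suite are hypothetical.

// Hypothetical sketch (not in this PR). Assumed imports: "context", "testing",
// corev1 "k8s.io/api/core/v1", "k8s.io/apimachinery/pkg/labels",
// "github.com/stretchr/testify/require", "sigs.k8s.io/controller-runtime/pkg/client".
func requireReplicasRunning(t *testing.T, c client.Client, controlPlane string, want int) {
	t.Helper()
	pods := &corev1.PodList{}
	sel := labels.Set{"control-plane": controlPlane}.AsSelector()
	err := c.List(context.Background(), pods,
		client.InNamespace("olmv1-system"),
		client.MatchingLabelsSelector{Selector: sel})
	require.NoError(t, err)

	running := 0
	for _, p := range pods.Items {
		if p.Status.Phase == corev1.PodRunning {
			running++
		}
	}
	// With the HA overlay applied, each component should report at least `want` Running pods.
	require.GreaterOrEqual(t, running, want, "expected at least %d Running pods for control-plane=%s", want, controlPlane)
}

// Example (hypothetical): requireReplicasRunning(t, c, "catalogd-controller-manager", 2)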
catalogd controller-manager Deployment template (Helm)
@@ -12,11 +12,11 @@ metadata:
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
minReadySeconds: 5
-replicas: 1
+replicas: {{ .Values.options.catalogd.deployment.replicas }}
Member:
Do we already have node anti-affinity configured to make sure these replicas do not end up on the same node? If not, we need that as well (but only when replicas > 1).

Contributor @tmshort (Dec 10, 2025):
However, I will point out that this may cause an issue on our single-node kind experimental-e2e tests, where we have two replicas (so that we are validating that two replicas do not cause issues with the e2e tests).

Member Author:
Good point! I added podAntiAffinity and used the preferred rule. I also created openshift/release#72395 to add an SNO upgrade test for the downstream OLMv1 and OLMv0; please take a look, thanks!

  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: control-plane
                operator: In
                values:
                  - operator-controller-controller-manager
                  - catalogd-controller-manager
          topologyKey: kubernetes.io/hostname

strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
operator-controller controller-manager Deployment template (Helm)
@@ -11,11 +11,11 @@ metadata:
name: operator-controller-controller-manager
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
-replicas: 1
+replicas: {{ .Values.options.operatorController.deployment.replicas }}
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
@@ -8,6 +8,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/operator-controller:devel
+replicas: 1
extraArguments: []
features:
enabled: []
@@ -19,6 +20,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/catalogd:devel
+replicas: 1
extraArguments: []
features:
enabled: []
32 changes: 28 additions & 4 deletions manifests/experimental-e2e.yaml
@@ -2198,11 +2198,11 @@ metadata:
namespace: olmv1-system
spec:
minReadySeconds: 5
-replicas: 1
+replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2315,6 +2315,18 @@ spec:
operator: In
values:
- linux
+podAntiAffinity:
+preferredDuringSchedulingIgnoredDuringExecution:
+- podAffinityTerm:
+labelSelector:
+matchExpressions:
+- key: control-plane
+operator: In
+values:
+- operator-controller-controller-manager
+- catalogd-controller-manager
+topologyKey: kubernetes.io/hostname
+weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
@@ -2349,11 +2361,11 @@ metadata:
name: operator-controller-controller-manager
namespace: olmv1-system
spec:
-replicas: 1
+replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -2474,6 +2486,18 @@ spec:
operator: In
values:
- linux
+podAntiAffinity:
+preferredDuringSchedulingIgnoredDuringExecution:
+- podAffinityTerm:
+labelSelector:
+matchExpressions:
+- key: control-plane
+operator: In
+values:
+- operator-controller-controller-manager
+- catalogd-controller-manager
+topologyKey: kubernetes.io/hostname
+weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
@@ -2123,7 +2123,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2261,7 +2261,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
@@ -1803,7 +1803,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1953,7 +1953,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard.yaml
@@ -1724,7 +1724,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1861,7 +1861,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
30 changes: 22 additions & 8 deletions test/helpers/helpers.go
@@ -34,7 +34,15 @@ var (
)

const (
-pollDuration = time.Minute
+pollDuration = time.Minute
+// catalogPollDuration is used for catalog operations (unpacking, serving) which involve
+// I/O-bound operations like pulling OCI images and unpacking catalog content.
+catalogPollDuration = 3 * time.Minute
+// extendedPollDuration is used for operations that involve pod restarts (like upgrades)
+// or webhook installations with cert-manager. In the worst case of a pod crash during upgrade,
+// leader election can take up to 163 seconds (LeaseDuration: 137s + RetryPeriod: 26s).
+// With LeaderElectionReleaseOnCancel: true, graceful shutdowns only take ~26s (RetryPeriod).
+extendedPollDuration = 5 * time.Minute
pollInterval = time.Second
testCatalogName = "test-catalog"
testCatalogRefEnvVar = "CATALOG_IMG"
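
The leader-election arithmetic in the comment above (137s lease + 26s retry, roughly 163s worst case, and about one retry period with LeaderElectionReleaseOnCancel) corresponds to controller-runtime manager options along the lines of the sketch below. This is for orientation only: the LeaseDuration and RetryPeriod values are taken from that comment, while the RenewDeadline and the election ID are assumed placeholders rather than the project's actual settings.

// Sketch only; assumed imports: "time", manager "sigs.k8s.io/controller-runtime/pkg/manager".
func leaderElectionOptions() manager.Options {
	leaseDuration := 137 * time.Second // how long a crashed leader's lease can block a successor
	renewDeadline := 107 * time.Second // assumed placeholder; must be shorter than LeaseDuration
	retryPeriod := 26 * time.Second    // candidates retry on this period; also the graceful hand-off cost

	return manager.Options{
		LeaderElection:   true,
		LeaderElectionID: "example.operatorframework.io", // placeholder, not the real ID
		LeaseDuration:    &leaseDuration,
		RenewDeadline:    &renewDeadline,
		RetryPeriod:      &retryPeriod,
		// Releasing the lease on a clean shutdown keeps fail-over near one RetryPeriod (~26s)
		// instead of LeaseDuration + RetryPeriod (~163s) after a crash.
		LeaderElectionReleaseOnCancel: true,
	}
}

A test that has to tolerate a pod restart therefore waits out roughly the 163-second crash case plus scheduling and startup time, which is why extendedPollDuration is set to 5 minutes.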
@@ -268,7 +276,7 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-}, pollDuration, pollInterval)
+}, catalogPollDuration, pollInterval)

t.Log("Checking that catalog has the expected metadata label")
require.NotNil(t, catalog.Labels)
@@ -283,37 +291,41 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
-}, pollDuration, pollInterval)
+}, catalogPollDuration, pollInterval)
}

func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName}

-// CRDs may take an extra long time to be deleted, and may run into the following error:
+// Use a 2-minute timeout for cleanup operations to ensure they complete within the test timeout.
+// This is shorter than catalogPollDuration (3 min) to leave a buffer for the overall test suite.
+cleanupTimeout := 2 * time.Minute
Contributor:
This is actually a shorter timeout; it was originally 5*pollDuration, which equals 5 * time.Minute. Do we need this change?


+// CRDs may take extra time to be deleted, and may run into the following error:
// Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing"
t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &apiextensionsv1.CustomResourceDefinitionList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 5*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)

t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &rbacv1.ClusterRoleBindingList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 2*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)

t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &rbacv1.ClusterRoleList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 2*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)
}

func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) {
@@ -348,10 +360,12 @@ func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.
if ns != nil {
t.Logf("By deleting Namespace %q", ns.Name)
require.NoError(t, c.Delete(context.Background(), ns))
+// Namespace deletion may take longer as it needs to delete all resources within it.
+// Use extendedPollDuration to allow sufficient time for graceful cleanup.
require.Eventually(t, func() bool {
err := c.Get(context.Background(), types.NamespacedName{Name: ns.Name}, &corev1.Namespace{})
return errors.IsNotFound(err)
-}, pollDuration, pollInterval)
+}, extendedPollDuration, pollInterval)
}
}
