6 changes: 3 additions & 3 deletions Makefile
@@ -161,7 +161,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
HELM_SETTINGS ?=
.PHONY: $(MANIFESTS)
$(MANIFESTS): $(HELM)
@@ -489,8 +489,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
CATD_NAMESPACE := olmv1-system
.PHONY: wait
wait:
-kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

.PHONY: docker-build
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
4 changes: 2 additions & 2 deletions hack/test/pre-upgrade-setup.sh
@@ -155,5 +155,5 @@ spec:
version: 1.0.0
EOF

-kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
-kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
33 changes: 33 additions & 0 deletions helm/high-availability.yaml
@@ -0,0 +1,33 @@
# High Availability (HA) configuration for OLMv1
# Sets replicas to 2 for both operator-controller and catalogd to enable an HA setup
# This is used in experimental-e2e.yaml to test multi-replica deployments
#
# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
# - In multi-node clusters: replicas are scheduled on different nodes for better availability
# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
options:
operatorController:
deployment:
replicas: 2
catalogd:
deployment:
replicas: 2

# Pod anti-affinity configuration to prefer spreading replicas across different nodes
# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
deployments:
templateSpec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: control-plane
operator: In
values:
- operator-controller-controller-manager
- catalogd-controller-manager
topologyKey: kubernetes.io/hostname
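
For illustration only, not part of this change: with two replicas per component, an e2e check along the lines below could assert that both copies of a controller are Running. The olmv1-system namespace and the control-plane label values are taken from the manifests in this PR; the helper name and its wiring into the test suite are hypothetical.

// Hypothetical sketch (not in this PR). Assumed imports: "context", "testing",
// corev1 "k8s.io/api/core/v1", "k8s.io/apimachinery/pkg/labels",
// "github.com/stretchr/testify/require", "sigs.k8s.io/controller-runtime/pkg/client".
func requireReplicasRunning(t *testing.T, c client.Client, controlPlane string, want int) {
	t.Helper()
	pods := &corev1.PodList{}
	sel := labels.Set{"control-plane": controlPlane}.AsSelector()
	err := c.List(context.Background(), pods,
		client.InNamespace("olmv1-system"),
		client.MatchingLabelsSelector{Selector: sel})
	require.NoError(t, err)

	running := 0
	for _, p := range pods.Items {
		if p.Status.Phase == corev1.PodRunning {
			running++
		}
	}
	// With the HA overlay applied, each component should report at least `want` Running pods.
	require.GreaterOrEqual(t, running, want, "expected at least %d Running pods for control-plane=%s", want, controlPlane)
}

// Example (hypothetical): requireReplicasRunning(t, c, "catalogd-controller-manager", 2)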
catalogd controller-manager Deployment template (Helm)
@@ -12,11 +12,11 @@ metadata:
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
minReadySeconds: 5
-replicas: 1
+replicas: {{ .Values.options.catalogd.deployment.replicas }}
Member:
Do we already have node anti-affinity configured to make sure these replicas do not end up on the same node? If not, we need that as well (but only when replicas > 1).

Contributor @tmshort (Dec 10, 2025):
However, I will point out that this may cause an issue on our single-node kind experimental-e2e tests, where we have two replicas (so that we are validating that two replicas do not cause issues with the e2e tests).

Member Author:
Good point! I added podAntiAffinity and used the preferred rule. I also created openshift/release#72395 to add an SNO upgrade test for the downstream OLMv1 and OLMv0; please take a look, thanks!

  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: control-plane
                operator: In
                values:
                  - operator-controller-controller-manager
                  - catalogd-controller-manager
          topologyKey: kubernetes.io/hostname

strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
operator-controller controller-manager Deployment template (Helm)
@@ -11,11 +11,11 @@ metadata:
name: operator-controller-controller-manager
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
-replicas: 1
+replicas: {{ .Values.options.operatorController.deployment.replicas }}
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
@@ -8,6 +8,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/operator-controller:devel
+replicas: 1
extraArguments: []
features:
enabled: []
@@ -19,6 +20,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/catalogd:devel
+replicas: 1
extraArguments: []
features:
enabled: []
32 changes: 28 additions & 4 deletions manifests/experimental-e2e.yaml
@@ -2198,11 +2198,11 @@ metadata:
namespace: olmv1-system
spec:
minReadySeconds: 5
-replicas: 1
+replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2315,6 +2315,18 @@ spec:
operator: In
values:
- linux
+podAntiAffinity:
+preferredDuringSchedulingIgnoredDuringExecution:
+- podAffinityTerm:
+labelSelector:
+matchExpressions:
+- key: control-plane
+operator: In
+values:
+- operator-controller-controller-manager
+- catalogd-controller-manager
+topologyKey: kubernetes.io/hostname
+weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
@@ -2349,11 +2361,11 @@ metadata:
name: operator-controller-controller-manager
namespace: olmv1-system
spec:
-replicas: 1
+replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -2474,6 +2486,18 @@ spec:
operator: In
values:
- linux
+podAntiAffinity:
+preferredDuringSchedulingIgnoredDuringExecution:
+- podAffinityTerm:
+labelSelector:
+matchExpressions:
+- key: control-plane
+operator: In
+values:
+- operator-controller-controller-manager
+- catalogd-controller-manager
+topologyKey: kubernetes.io/hostname
+weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
@@ -2123,7 +2123,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2261,7 +2261,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
@@ -1803,7 +1803,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1953,7 +1953,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard.yaml
@@ -1724,7 +1724,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1861,7 +1861,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
-maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
30 changes: 22 additions & 8 deletions test/helpers/helpers.go
@@ -34,7 +34,15 @@ var (
)

const (
-pollDuration = time.Minute
+pollDuration = time.Minute
+// catalogPollDuration is used for catalog operations (unpacking, serving) which involve
+// I/O-bound operations like pulling OCI images and unpacking catalog content.
+catalogPollDuration = 3 * time.Minute
+// extendedPollDuration is used for operations that involve pod restarts (like upgrades)
+// or webhook installations with cert-manager. In the worst case of a pod crash during upgrade,
+// leader election can take up to 163 seconds (LeaseDuration: 137s + RetryPeriod: 26s).
+// With LeaderElectionReleaseOnCancel: true, graceful shutdowns only take ~26s (RetryPeriod).
+extendedPollDuration = 5 * time.Minute
pollInterval = time.Second
testCatalogName = "test-catalog"
testCatalogRefEnvVar = "CATALOG_IMG"
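
The leader-election arithmetic in the comment above (137s lease + 26s retry, roughly 163s worst case, and about one retry period with LeaderElectionReleaseOnCancel) corresponds to controller-runtime manager options along the lines of the sketch below. This is for orientation only: the LeaseDuration and RetryPeriod values are taken from that comment, while the RenewDeadline and the election ID are assumed placeholders rather than the project's actual settings.

// Sketch only; assumed imports: "time", manager "sigs.k8s.io/controller-runtime/pkg/manager".
func leaderElectionOptions() manager.Options {
	leaseDuration := 137 * time.Second // how long a crashed leader's lease can block a successor
	renewDeadline := 107 * time.Second // assumed placeholder; must be shorter than LeaseDuration
	retryPeriod := 26 * time.Second    // candidates retry on this period; also the graceful hand-off cost

	return manager.Options{
		LeaderElection:   true,
		LeaderElectionID: "example.operatorframework.io", // placeholder, not the real ID
		LeaseDuration:    &leaseDuration,
		RenewDeadline:    &renewDeadline,
		RetryPeriod:      &retryPeriod,
		// Releasing the lease on a clean shutdown keeps fail-over near one RetryPeriod (~26s)
		// instead of LeaseDuration + RetryPeriod (~163s) after a crash.
		LeaderElectionReleaseOnCancel: true,
	}
}

A test that has to tolerate a pod restart therefore waits out roughly the 163-second crash case plus scheduling and startup time, which is why extendedPollDuration is set to 5 minutes.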
@@ -268,7 +276,7 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-}, pollDuration, pollInterval)
+}, catalogPollDuration, pollInterval)

t.Log("Checking that catalog has the expected metadata label")
require.NotNil(t, catalog.Labels)
@@ -283,37 +291,41 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
-}, pollDuration, pollInterval)
+}, catalogPollDuration, pollInterval)
}

func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName}

-// CRDs may take an extra long time to be deleted, and may run into the following error:
+// Use a 2-minute timeout for cleanup operations to ensure they complete within the test timeout.
+// This is shorter than catalogPollDuration (3 min) to leave a buffer for the overall test suite.
+cleanupTimeout := 2 * time.Minute
Contributor:
This is actually a shorter timeout; it was originally 5*pollDuration, which equals 5 * time.Minute. Do we need this change?


+// CRDs may take extra time to be deleted, and may run into the following error:
// Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing"
t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &apiextensionsv1.CustomResourceDefinitionList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 5*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)

t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &rbacv1.ClusterRoleBindingList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 2*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)

t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName)
require.EventuallyWithT(t, func(ct *assert.CollectT) {
list := &rbacv1.ClusterRoleList{}
err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
require.NoError(ct, err)
require.Empty(ct, list.Items)
-}, 2*pollDuration, pollInterval)
+}, cleanupTimeout, pollInterval)
}

func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) {
@@ -348,10 +360,12 @@ func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.
if ns != nil {
t.Logf("By deleting Namespace %q", ns.Name)
require.NoError(t, c.Delete(context.Background(), ns))
+// Namespace deletion may take longer as it needs to delete all resources within it.
+// Use extendedPollDuration to allow sufficient time for graceful cleanup.
require.Eventually(t, func() bool {
err := c.Get(context.Background(), types.NamespacedName{Name: ns.Name}, &corev1.Namespace{})
return errors.IsNotFound(err)
-}, pollDuration, pollInterval)
+}, extendedPollDuration, pollInterval)
}
}
