Commit 3f11356

fix: Harden informer cache with label selectors and memory optimizations (#6242)
* fix: Harden informer cache with label selectors and memory optimizations

Signed-off-by: Jitendra Yejare <11752425+jyejare@users.noreply.github.com>

* Additional Fixes on caching with PVC and HPA

Signed-off-by: Jitendra Yejare <11752425+jyejare@users.noreply.github.com>

---------

Signed-off-by: Jitendra Yejare <11752425+jyejare@users.noreply.github.com>
1 parent f6b015c commit 3f11356

11 files changed: +120 −53 lines changed

infra/feast-operator/bundle/manifests/feast-operator.clusterserviceversion.yaml

Lines changed: 2 additions & 0 deletions

@@ -266,6 +266,8 @@ spec:
         command:
         - /manager
         env:
+        - name: GOMEMLIMIT
+          value: "230MiB"
         - name: RELATED_IMAGE_FEATURE_SERVER
          value: quay.io/feastdev/feature-server:0.62.0
        - name: RELATED_IMAGE_CRON_JOB
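
Not part of the diff itself, but useful context: GOMEMLIMIT (Go 1.19+) sets a soft memory limit that the garbage collector tries to keep the process under, trading extra GC work for fewer OOM kills. Setting the environment variable is equivalent to calling runtime/debug.SetMemoryLimit at startup; a minimal sketch:

package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// Equivalent of GOMEMLIMIT=230MiB: a soft limit, in bytes, that the GC
	// tries to keep the process under. A negative argument leaves the limit
	// unchanged and simply returns the currently configured value.
	prev := debug.SetMemoryLimit(230 << 20) // 230 MiB
	fmt.Printf("previous limit: %d bytes\n", prev)
}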

infra/feast-operator/cmd/main.go

Lines changed: 45 additions & 0 deletions

@@ -25,11 +25,18 @@ import (
 	// to ensure that exec-entrypoint and run can make use of them.
 	_ "k8s.io/client-go/plugin/pkg/client/auth"

+	appsv1 "k8s.io/api/apps/v1"
+	autoscalingv2 "k8s.io/api/autoscaling/v2"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
+	policyv1 "k8s.io/api/policy/v1"
+	rbacv1 "k8s.io/api/rbac/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -59,6 +66,29 @@ func init() {
 	// +kubebuilder:scaffold:scheme
 }

+func newCacheOptions() cache.Options {
+	managedBySelector := labels.SelectorFromSet(labels.Set{
+		services.ManagedByLabelKey: services.ManagedByLabelValue,
+	})
+	managedByFilter := cache.ByObject{Label: managedBySelector}
+
+	return cache.Options{
+		DefaultTransform: cache.TransformStripManagedFields(),
+		ByObject: map[client.Object]cache.ByObject{
+			&corev1.ConfigMap{}:                      managedByFilter,
+			&appsv1.Deployment{}:                     managedByFilter,
+			&corev1.Service{}:                        managedByFilter,
+			&corev1.ServiceAccount{}:                 managedByFilter,
+			&corev1.PersistentVolumeClaim{}:          managedByFilter,
+			&rbacv1.RoleBinding{}:                    managedByFilter,
+			&rbacv1.Role{}:                           managedByFilter,
+			&batchv1.CronJob{}:                       managedByFilter,
+			&autoscalingv2.HorizontalPodAutoscaler{}: managedByFilter,
+			&policyv1.PodDisruptionBudget{}:          managedByFilter,
+		},
+	}
+}
+
 func main() {
 	var metricsAddr string
 	var enableLeaderElection bool
@@ -145,11 +175,26 @@ func main() {
 		// if you are doing or is intended to do any operation such as perform cleanups
 		// after the manager stops then its usage might be unsafe.
 		// LeaderElectionReleaseOnCancel: true,
+		Cache: newCacheOptions(),
 		Client: client.Options{
 			Cache: &client.CacheOptions{
+				// Bypass the label-filtered informer cache for all reads so that
+				// pre-existing resources without the managed-by label are still
+				// visible to the reconciler. The ByObject cache filter above still
+				// restricts the watch to managed-by-labeled objects, limiting
+				// memory usage while avoiding upgrade deadlocks.
 				DisableFor: []client.Object{
 					&corev1.ConfigMap{},
 					&corev1.Secret{},
+					&appsv1.Deployment{},
+					&corev1.Service{},
+					&corev1.ServiceAccount{},
+					&corev1.PersistentVolumeClaim{},
+					&rbacv1.RoleBinding{},
+					&rbacv1.Role{},
+					&batchv1.CronJob{},
+					&autoscalingv2.HorizontalPodAutoscaler{},
+					&policyv1.PodDisruptionBudget{},
 				},
 			},
 		},
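
For orientation, a minimal sketch of how these two options cooperate when the manager is built (scheme setup and flag parsing omitted; setupLog stands for the usual kubebuilder scaffold logger and is an assumption, not shown in the diff):

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
	Scheme: scheme,
	// Informers only LIST/WATCH objects carrying the managed-by label and
	// strip managedFields before caching, which bounds memory usage.
	Cache: newCacheOptions(),
	Client: client.Options{
		Cache: &client.CacheOptions{
			// Reads for these types bypass the filtered cache and hit the
			// API server directly, so unlabeled pre-existing objects stay
			// visible to the reconciler.
			DisableFor: []client.Object{&corev1.ConfigMap{}, &corev1.Secret{}},
		},
	},
})
if err != nil {
	setupLog.Error(err, "unable to start manager")
	os.Exit(1)
}
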
Lines changed: 14 additions & 10 deletions

@@ -1,10 +1,14 @@
-- op: replace
-  path: "/spec/template/spec/containers/0/env/0"
-  value:
-    name: RELATED_IMAGE_FEATURE_SERVER
-    value: ${FS_IMG}
-- op: replace
-  path: "/spec/template/spec/containers/0/env/1"
-  value:
-    name: RELATED_IMAGE_CRON_JOB
-    value: ${CJ_IMG}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: controller-manager
+spec:
+  template:
+    spec:
+      containers:
+      - name: manager
+        env:
+        - name: RELATED_IMAGE_FEATURE_SERVER
+          value: ${FS_IMG}
+        - name: RELATED_IMAGE_CRON_JOB
+          value: ${CJ_IMG}
Lines changed: 14 additions & 10 deletions

@@ -1,10 +1,14 @@
-- op: replace
-  path: "/spec/template/spec/containers/0/env/0"
-  value:
-    name: RELATED_IMAGE_FEATURE_SERVER
-    value: quay.io/feastdev/feature-server:0.62.0
-- op: replace
-  path: "/spec/template/spec/containers/0/env/1"
-  value:
-    name: RELATED_IMAGE_CRON_JOB
-    value: quay.io/openshift/origin-cli:4.17
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: controller-manager
+spec:
+  template:
+    spec:
+      containers:
+      - name: manager
+        env:
+        - name: RELATED_IMAGE_FEATURE_SERVER
+          value: quay.io/feastdev/feature-server:0.62.0
+        - name: RELATED_IMAGE_CRON_JOB
+          value: quay.io/openshift/origin-cli:4.17

infra/feast-operator/config/manager/manager.yaml

Lines changed: 2 additions & 0 deletions

@@ -73,6 +73,8 @@ spec:
             drop:
             - "ALL"
         env:
+        - name: GOMEMLIMIT
+          value: "230MiB"
         - name: RELATED_IMAGE_FEATURE_SERVER
          value: feast:latest
        - name: RELATED_IMAGE_CRON_JOB

infra/feast-operator/dist/install.yaml

Lines changed: 2 additions & 0 deletions

@@ -20868,6 +20868,8 @@ spec:
           value: quay.io/feastdev/feature-server:0.62.0
         - name: RELATED_IMAGE_CRON_JOB
           value: quay.io/openshift/origin-cli:4.17
+        - name: GOMEMLIMIT
+          value: 230MiB
         - name: OIDC_ISSUER_URL
           value: ""
         image: quay.io/feastdev/feast-operator:0.62.0

infra/feast-operator/internal/controller/authz/authz.go

Lines changed: 1 addition & 0 deletions

@@ -331,6 +331,7 @@ func (authz *FeastAuthorization) getLabels() map[string]string {
 	return map[string]string{
 		services.NameLabelKey:        authz.Handler.FeatureStore.Name,
 		services.ServiceTypeLabelKey: string(services.AuthzFeastType),
+		services.ManagedByLabelKey:   services.ManagedByLabelValue,
 	}
 }

infra/feast-operator/internal/controller/services/namespace_registry.go

Lines changed: 8 additions & 22 deletions

@@ -20,7 +20,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
-	"reflect"

 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
@@ -181,30 +180,17 @@ func (feast *FeastServices) setNamespaceRegistryRoleBinding(rb *rbacv1.RoleBinding) error {
 			Namespace: rb.Namespace,
 		},
 	}
-	role.Rules = desiredRules
+	role.SetGroupVersionKind(rbacv1.SchemeGroupVersion.WithKind("Role"))

-	// Attempt to create; tolerate AlreadyExists so concurrent reconcilers don't fail.
-	if err := feast.Handler.Client.Create(feast.Handler.Context, role); err != nil && !apierrors.IsAlreadyExists(err) {
-		return fmt.Errorf("failed to create namespace registry Role: %w", err)
-	}
-
-	// Re-fetch the authoritative copy to compare rules and obtain the latest resourceVersion.
-	existingRole := &rbacv1.Role{}
-	if err := feast.Handler.Client.Get(feast.Handler.Context, types.NamespacedName{
-		Name:      roleName,
-		Namespace: rb.Namespace,
-	}, existingRole); err != nil {
-		return fmt.Errorf("failed to get namespace registry Role: %w", err)
-	}
-
-	if !reflect.DeepEqual(existingRole.Rules, desiredRules) {
-		existingRole.Rules = desiredRules
-		// On conflict the reconciler will re-queue automatically.
-		if err := feast.Handler.Client.Update(feast.Handler.Context, existingRole); err != nil {
-			return fmt.Errorf("failed to update namespace registry Role: %w", err)
-		}
+	if _, err := controllerutil.CreateOrUpdate(feast.Handler.Context, feast.Handler.Client, role, func() error {
+		role.Labels = feast.getLabels()
+		role.Rules = desiredRules
+		return nil
+	}); err != nil {
+		return fmt.Errorf("failed to reconcile namespace registry Role: %w", err)
 	}

+	rb.Labels = feast.getLabels()
 	rb.RoleRef = rbacv1.RoleRef{
 		APIGroup: "rbac.authorization.k8s.io",
 		Kind:     "Role",

infra/feast-operator/internal/controller/services/scaling.go

Lines changed: 2 additions & 3 deletions

@@ -231,7 +231,7 @@ func (feast *FeastServices) buildPDBApplyConfig() *pdbac.PodDisruptionBudgetApplyConfiguration {
 			WithBlockOwnerDeletion(true),
 		).
 		WithSpec(pdbac.PodDisruptionBudgetSpec().
-			WithSelector(metaac.LabelSelector().WithMatchLabels(feast.getLabels())),
+			WithSelector(metaac.LabelSelector().WithMatchLabels(feast.getSelectorLabels())),
 		)

 	if pdbConfig.MinAvailable != nil {
@@ -249,8 +249,7 @@ func (feast *FeastServices) updateScalingStatus(deploy *appsv1.Deployment) {
 	cr := feast.Handler.FeatureStore

 	cr.Status.Replicas = deploy.Status.ReadyReplicas
-	labels := feast.getLabels()
-	cr.Status.Selector = metav1.FormatLabelSelector(metav1.SetAsLabelSelector(labels))
+	cr.Status.Selector = metav1.FormatLabelSelector(metav1.SetAsLabelSelector(feast.getSelectorLabels()))

 	if !isScalingEnabled(cr) {
 		cr.Status.ScalingStatus = nil
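
The switch to getSelectorLabels matters for the scale subresource: status.Selector must be a selector string that matches exactly the pods the HPA should count, so it has to use the stable label subset rather than the full (now managed-by-bearing) label set. A minimal sketch of the conversion; the feast.dev/name key stands in for NameLabelKey and is an assumed value, for illustration only:

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
)

func main() {
	// SetAsLabelSelector wraps the label set in a *metav1.LabelSelector;
	// FormatLabelSelector renders it in the string form the HPA consumes.
	sel := metav1.SetAsLabelSelector(labels.Set{"feast.dev/name": "demo"})
	fmt.Println(metav1.FormatLabelSelector(sel)) // prints: feast.dev/name=demo
}
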

infra/feast-operator/internal/controller/services/services.go

Lines changed: 25 additions & 8 deletions

@@ -384,10 +384,13 @@ func (feast *FeastServices) createPVC(pvcCreate *feastdevv1.PvcCreate, feastType FeastServiceType) error {
 	}

 	// PVCs are immutable, so we only create... we don't update an existing one.
+	// Treat AlreadyExists as success: a pre-existing PVC without the managed-by label
+	// won't appear in the filtered cache (Client.Get returns NotFound), but Create
+	// will hit AlreadyExists on the API server — both cases mean the PVC is present.
 	err = feast.Handler.Client.Get(feast.Handler.Context, client.ObjectKeyFromObject(pvc), pvc)
 	if err != nil && apierrors.IsNotFound(err) {
 		err = feast.Handler.Client.Create(feast.Handler.Context, pvc)
-		if err != nil {
+		if err != nil && !apierrors.IsAlreadyExists(err) {
 			return err
 		}
 		logger.Info("Successfully created", "PersistentVolumeClaim", pvc.Name)
@@ -408,9 +411,10 @@ func (feast *FeastServices) setDeployment(deploy *appsv1.Deployment) error {
 	}

 	deploy.Labels = feast.getLabels()
+	selectorLabels := feast.getSelectorLabels()
 	deploy.Spec = appsv1.DeploymentSpec{
 		Replicas: replicas,
-		Selector: metav1.SetAsLabelSelector(deploy.GetLabels()),
+		Selector: metav1.SetAsLabelSelector(selectorLabels),
 		Strategy: feast.getDeploymentStrategy(),
 		Template: corev1.PodTemplateSpec{
 			ObjectMeta: metav1.ObjectMeta{
@@ -818,7 +822,7 @@ func (feast *FeastServices) setService(svc *corev1.Service, feastType FeastServiceType) error {
 	}

 	svc.Spec = corev1.ServiceSpec{
-		Selector: feast.getLabels(),
+		Selector: feast.getSelectorLabels(),
 		Type:     corev1.ServiceTypeClusterIP,
 		Ports: []corev1.ServicePort{
 			{
@@ -868,6 +872,7 @@ func (feast *FeastServices) setServiceAccount(sa *corev1.ServiceAccount) error {

 func (feast *FeastServices) createNewPVC(pvcCreate *feastdevv1.PvcCreate, feastType FeastServiceType) (*corev1.PersistentVolumeClaim, error) {
 	pvc := feast.initPVC(feastType)
+	pvc.Labels = feast.getFeastTypeLabels(feastType)

 	pvc.Spec = corev1.PersistentVolumeClaimSpec{
 		AccessModes: pvcCreate.AccessModes,
@@ -976,7 +981,7 @@ func (feast *FeastServices) applyTopologySpread(podSpec *corev1.PodSpec) {
 		MaxSkew:           1,
 		TopologyKey:       "topology.kubernetes.io/zone",
 		WhenUnsatisfiable: corev1.ScheduleAnyway,
-		LabelSelector:     metav1.SetAsLabelSelector(feast.getLabels()),
+		LabelSelector:     metav1.SetAsLabelSelector(feast.getSelectorLabels()),
 	}}
 }

@@ -999,7 +1004,7 @@ func (feast *FeastServices) applyAffinity(podSpec *corev1.PodSpec) {
 			Weight: 100,
 			PodAffinityTerm: corev1.PodAffinityTerm{
 				TopologyKey:   "kubernetes.io/hostname",
-				LabelSelector: metav1.SetAsLabelSelector(feast.getLabels()),
+				LabelSelector: metav1.SetAsLabelSelector(feast.getSelectorLabels()),
 			},
 		}},
 	},
@@ -1060,12 +1065,24 @@ func (feast *FeastServices) getFeastTypeLabels(feastType FeastServiceType) map[string]string {
 	return labels
 }

-func (feast *FeastServices) getLabels() map[string]string {
+// getSelectorLabels returns the minimal label set used for immutable selectors
+// (Deployment spec.selector, Service spec.selector, TopologySpreadConstraints, PodAffinity).
+// This must NOT change after initial resource creation.
+func (feast *FeastServices) getSelectorLabels() map[string]string {
 	return map[string]string{
 		NameLabelKey: feast.Handler.FeatureStore.Name,
 	}
 }

+// getLabels returns the full label set for mutable metadata (ObjectMeta.Labels).
+// Includes the managed-by label used by the informer cache filter.
+func (feast *FeastServices) getLabels() map[string]string {
+	return map[string]string{
+		NameLabelKey:      feast.Handler.FeatureStore.Name,
+		ManagedByLabelKey: ManagedByLabelValue,
+	}
+}
+
 func (feast *FeastServices) setServiceHostnames() error {
 	feast.Handler.FeatureStore.Status.ServiceHostnames = feastdevv1.ServiceHostnames{}
 	domain := svcDomain + ":"
@@ -1438,10 +1455,10 @@ func IsDeploymentAvailable(conditions []appsv1.DeploymentCondition) bool {
 // container that is in a failing state. Returns empty string if no failure found.
 func (feast *FeastServices) GetPodContainerFailureMessage(deploy appsv1.Deployment) string {
 	podList := corev1.PodList{}
-	labels := feast.getLabels()
+	selectorLabels := feast.getSelectorLabels()
 	if err := feast.Handler.Client.List(feast.Handler.Context, &podList,
 		client.InNamespace(deploy.Namespace),
-		client.MatchingLabels(labels),
+		client.MatchingLabels(selectorLabels),
 	); err != nil {
 		return ""
 	}
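
The PVC change above is an instance of a general pattern for immutable resources behind a label-filtered cache: a cached Get may report NotFound for a pre-existing, unlabeled object, so the follow-up Create has to tolerate AlreadyExists from the API server. A generic sketch of that pattern; ensureCreated is a hypothetical helper, not code from this commit:

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ensureCreated creates obj unless it is already visible, treating
// AlreadyExists as success because the filtered cache can lag behind
// (or never see) objects that lack the selector label.
func ensureCreated(ctx context.Context, c client.Client, obj client.Object) error {
	err := c.Get(ctx, client.ObjectKeyFromObject(obj), obj)
	if err == nil {
		return nil // present and visible in the cache
	}
	if !apierrors.IsNotFound(err) {
		return err
	}
	if err := c.Create(ctx, obj); err != nil && !apierrors.IsAlreadyExists(err) {
		return err
	}
	return nil
}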
