From 379f164d925b7cd3b01f69e2380e1c538cb0174c Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 29 Jan 2026 11:10:47 -0800 Subject: [PATCH] cleanup stale driver daemonsets Signed-off-by: Rahul Sharma --- internal/state/driver.go | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/internal/state/driver.go b/internal/state/driver.go index b0e6484de..5db3148bc 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -130,14 +130,14 @@ func (s *stateDriver) Sync(ctx context.Context, customResource interface{}, info return SyncStateError, fmt.Errorf("NVIDIADriver CR not provided as input to Sync()") } - err := s.cleanupStaleDriverDaemonsets(ctx, cr) + objs, err := s.getManifestObjects(ctx, cr, infoCatalog) if err != nil { - return SyncStateNotReady, fmt.Errorf("failed to cleanup stale driver DaemonSets: %w", err) + return SyncStateNotReady, fmt.Errorf("failed to create k8s objects from manifests: %v", err) } - objs, err := s.getManifestObjects(ctx, cr, infoCatalog) + err = s.cleanupStaleDriverDaemonsets(ctx, cr, objs) if err != nil { - return SyncStateNotReady, fmt.Errorf("failed to create k8s objects from manifests: %v", err) + return SyncStateNotReady, fmt.Errorf("failed to cleanup stale driver DaemonSets: %w", err) } // Create objects if they don't exist, Update objects if they do exist @@ -176,10 +176,18 @@ func (s *stateDriver) GetWatchSources(mgr ctrlManager) map[string]SyncingSource return wr } -func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error { +func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, desiredObjs []*unstructured.Unstructured) error { logger := log.FromContext(ctx) logger.V(consts.LogLevelInfo).Info("Cleaning up stale driver DaemonSets") + // Build a set of desired DaemonSet names from the manifest objects + desiredDaemonSetNames := make(map[string]bool) + for _, obj := range desiredObjs { + if obj.GetKind() == "DaemonSet" { + desiredDaemonSetNames[obj.GetName()] = true + } + } + // List all DaemonSets owned by the CR instance list := &appsv1.DaemonSetList{} err := s.client.List(ctx, list, client.MatchingFields{consts.NVIDIADriverControllerIndexKey: cr.Name}) @@ -189,6 +197,17 @@ func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvid for _, ds := range list.Items { ds := ds + // Delete DaemonSets that are not in the desired list. This handles the case where + // the CR's nodeSelector changes and certain node pools no longer match. + if !desiredDaemonSetNames[ds.Name] { + logger.V(consts.LogLevelInfo).Info("Deleting DaemonSet no longer managed by this CR", "Name", ds.Name) + err = s.client.Delete(ctx, &ds) + if err != nil { + return fmt.Errorf("error deleting DaemonSet '%s': %w", ds.Name, err) + } + continue + } + // We consider a DaemonSet to be stale when all three conditions are true: // // 1. The desired number of pods reported by the DaemonSet controller is 0