Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions internal/state/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,14 @@ func (s *stateDriver) Sync(ctx context.Context, customResource interface{}, info
return SyncStateError, fmt.Errorf("NVIDIADriver CR not provided as input to Sync()")
}

err := s.cleanupStaleDriverDaemonsets(ctx, cr)
objs, err := s.getManifestObjects(ctx, cr, infoCatalog)
if err != nil {
return SyncStateNotReady, fmt.Errorf("failed to cleanup stale driver DaemonSets: %w", err)
return SyncStateNotReady, fmt.Errorf("failed to create k8s objects from manifests: %v", err)
}

objs, err := s.getManifestObjects(ctx, cr, infoCatalog)
err = s.cleanupStaleDriverDaemonsets(ctx, cr, objs)
if err != nil {
return SyncStateNotReady, fmt.Errorf("failed to create k8s objects from manifests: %v", err)
return SyncStateNotReady, fmt.Errorf("failed to cleanup stale driver DaemonSets: %w", err)
}

// Create objects if they don't exist, Update objects if they do exist
Expand Down Expand Up @@ -176,10 +176,18 @@ func (s *stateDriver) GetWatchSources(mgr ctrlManager) map[string]SyncingSource
return wr
}

func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error {
func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, desiredObjs []*unstructured.Unstructured) error {
logger := log.FromContext(ctx)
logger.V(consts.LogLevelInfo).Info("Cleaning up stale driver DaemonSets")

// Build a set of desired DaemonSet names from the manifest objects
desiredDaemonSetNames := make(map[string]bool)
for _, obj := range desiredObjs {
if obj.GetKind() == "DaemonSet" {
desiredDaemonSetNames[obj.GetName()] = true
}
}

// List all DaemonSets owned by the CR instance
list := &appsv1.DaemonSetList{}
err := s.client.List(ctx, list, client.MatchingFields{consts.NVIDIADriverControllerIndexKey: cr.Name})
Expand All @@ -189,6 +197,17 @@ func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvid

for _, ds := range list.Items {
ds := ds
// Delete any DaemonSet owned by this CR whose name is not among the desired DaemonSets
// built from the manifest objects. This handles the case where the CR's nodeSelector
// changes and certain node pools no longer match.
if !desiredDaemonSetNames[ds.Name] {
logger.V(consts.LogLevelInfo).Info("Deleting DaemonSet no longer managed by this CR", "Name", ds.Name)
err = s.client.Delete(ctx, &ds)
if err != nil {
return fmt.Errorf("error deleting DaemonSet '%s': %w", ds.Name, err)
}
continue
}

// We consider a DaemonSet to be stale when all three conditions are true:
//
// 1. The desired number of pods reported by the DaemonSet controller is 0
Expand Down