-
Notifications
You must be signed in to change notification settings - Fork 516
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver #2518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,7 @@ import ( | |
| "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" | ||
| "github.com/NVIDIA/gpu-operator/internal/conditions" | ||
| "github.com/NVIDIA/gpu-operator/internal/consts" | ||
| nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver" | ||
| "github.com/NVIDIA/gpu-operator/internal/state" | ||
| "github.com/NVIDIA/gpu-operator/internal/validator" | ||
| ) | ||
|
|
@@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
| if err := r.Get(ctx, req.NamespacedName, instance); err != nil { | ||
| if apierrors.IsNotFound(err) { | ||
| // Request object not found, could have been deleted after reconcile request. | ||
| // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. | ||
| // Return and don't requeue | ||
| return reconcile.Result{}, nil | ||
| // Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels. | ||
| return r.cleanupNVIDIADriverOwnerLabels(ctx) | ||
| } | ||
| wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err) | ||
| logger.Error(err, "error getting NVIDIADriver object") | ||
|
|
@@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
| // Error reading the object - requeue the request. | ||
| return reconcile.Result{}, wrappedErr | ||
| } | ||
| if instance.HasDeletionTimestamp() { | ||
| logger.Info("NVIDIADriver delete requested; cleaning up owner labels") | ||
| return r.cleanupNVIDIADriverOwnerLabels(ctx) | ||
| } | ||
|
|
||
| // Get the singleton NVIDIA ClusterPolicy object in the cluster. | ||
| clusterPolicyList := &gpuv1.ClusterPolicyList{} | ||
|
|
@@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
| return reconcile.Result{}, nil | ||
| } | ||
|
|
||
| changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client) | ||
| if err != nil { | ||
| logger.Error(err, "failed to assign NVIDIADriver owners to nodes") | ||
| instance.Status.State = nvidiav1alpha1.NotReady | ||
| if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { | ||
| logger.Error(condErr, "failed to set condition") | ||
| } | ||
| return reconcile.Result{}, err | ||
| } | ||
| if changed { | ||
| return reconcile.Result{RequeueAfter: time.Second}, nil | ||
| } | ||
|
|
||
| if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) { | ||
| err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers") | ||
| logger.Error(err, "unsupported driver combination detected") | ||
|
|
@@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
| // Sync state and update status | ||
| managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog) | ||
|
|
||
| if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil { | ||
| logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods") | ||
| return reconcile.Result{}, err | ||
| } | ||
|
|
||
| // update CR status | ||
| if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil { | ||
| return ctrl.Result{}, err | ||
|
|
@@ -276,6 +298,63 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) [] | |
| return reconcileRequests | ||
| } | ||
|
|
||
| // enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the | ||
| // event and all current NVIDIADriver instances. The triggering object is | ||
| // included even for delete events so the NotFound reconcile path can clear | ||
| // stale node owner labels. | ||
| func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { | ||
| requests := r.enqueueAllNVIDIADrivers(ctx) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of doing multiple list-alls of NVIDIADriver CR, how about creating a struct field (a map/set) in the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will create a second source of truth beside the controller-runtime cache. We'll have to deal with startup correctness, delete correctness, concurrency, etc. The current r.List(ctx, list) is normally served from the informer cache, not a live API-server request, so it is much cheaper than it looks. It is also much safer because it reflects the cache’s current view after watch/list reconciliation. Are you seeing any performance issues with current approach? Just want to be clear on the issue you might be seeing with current approach.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While it may not be an issue in clusters with a few NVIDIADrivers. This becomes a problem in large clusters where it's also possible to have several NVIDIADriver CRs to loop through. In these scenarios, we are bound to hit scaling problems.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should move the assignOwners() method to its own controller, that way it can reconcile on its own when needed and add ownership label. Then, we don't need to redo those calculations again and again on each reconcile in nvidiadriver CR and it can focus just on driver enablement. I'll add ownership-reconciler in a follow up PR which will improve things a lot in large clusters with lots of nvidiadrivers. I'll open follow up PRs for this.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rahulait in addition to moving |
||
| if driver != nil { | ||
| requests = append(requests, reconcile.Request{ | ||
| NamespacedName: types.NamespacedName{ | ||
| Name: driver.GetName(), | ||
| Namespace: driver.GetNamespace(), | ||
| }, | ||
| }) | ||
| } | ||
| return dedupeReconcileRequests(requests) | ||
| } | ||
|
|
||
| func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request { | ||
| seen := map[types.NamespacedName]struct{}{} | ||
| deduped := make([]reconcile.Request, 0, len(requests)) | ||
| for _, request := range requests { | ||
| if _, ok := seen[request.NamespacedName]; ok { | ||
| continue | ||
| } | ||
| seen[request.NamespacedName] = struct{}{} | ||
| deduped = append(deduped, request) | ||
| } | ||
| return deduped | ||
| } | ||
|
|
||
| // cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event | ||
| // when ClusterPolicy is still configured for NVIDIADriver mode. This lets | ||
| // AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining | ||
| // NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver. | ||
| func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) (reconcile.Result, error) { | ||
| clusterPolicyList := &gpuv1.ClusterPolicyList{} | ||
| if err := r.List(ctx, clusterPolicyList); err != nil { | ||
| wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err) | ||
| log.FromContext(ctx).Error(wrappedErr, "failed to cleanup NVIDIADriver owner labels") | ||
| return reconcile.Result{}, wrappedErr | ||
| } | ||
|
|
||
| if len(clusterPolicyList.Items) == 0 { | ||
| return reconcile.Result{}, nil | ||
|
rahulait marked this conversation as resolved.
|
||
| } | ||
|
|
||
| if !clusterPolicyList.Items[0].Spec.Driver.UseNvidiaDriverCRDType() { | ||
| return reconcile.Result{}, nil | ||
| } | ||
|
|
||
| if _, err := nvidiadriverutil.AssignOwners(ctx, r.Client); err != nil { | ||
| log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels") | ||
| return reconcile.Result{}, err | ||
| } | ||
| return reconcile.Result{}, nil | ||
| } | ||
|
|
||
| // SetupWithManager sets up the controller with the Manager. | ||
| func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { | ||
| // Create state manager | ||
|
|
@@ -307,8 +386,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl. | |
|
|
||
| // Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR, | ||
| // enqueue a reconcile request for all NVIDIADriver instances. | ||
| nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { | ||
| return r.enqueueAllNVIDIADrivers(ctx) | ||
| nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { | ||
| return r.enqueueNVIDIADriverReconcilers(ctx, driver) | ||
| } | ||
|
|
||
| // Watch for changes to the primary resource NVIDIADriver | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.