Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion api/nvidia/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,19 @@ const (
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// NVIDIADriverSpec defines the desired state of NVIDIADriver
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
// The CEL validation allows non-default drivers to use nodeSelector, but requires
// default drivers to leave nodeSelector unset or empty.
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
type NVIDIADriverSpec struct {
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
// Important: Run "make" to regenerate code after modifying this file

// Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
// that do not match any non-default NVIDIADriver nodeSelector.
// +kubebuilder:default=false
Default bool `json:"default"`

// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
// +kubebuilder:default=gpu
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
Expand Down Expand Up @@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0

// NVIDIADriver is the Schema for the nvidiadrivers API
Expand All @@ -524,6 +533,31 @@ type NVIDIADriverList struct {
Items []NVIDIADriver `json:"items"`
}

// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
func (d *NVIDIADriver) IsDefault() bool {
return d != nil && d.Spec.Default
}

// HasDeletionTimestamp returns true when the NVIDIADriver is marked for deletion.
func (d *NVIDIADriver) HasDeletionTimestamp() bool {
return d != nil && !d.GetDeletionTimestamp().IsZero()
}

// ValidateNodeSelector rejects selectors that use operator-managed routing labels
// or scope the default fallback driver.
func (d *NVIDIADriver) ValidateNodeSelector() error {
if d == nil || d.Spec.NodeSelector == nil {
return nil
}
if d.IsDefault() && len(d.Spec.NodeSelector) > 0 {
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", d.Name)
}
if _, ok := d.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", d.Name, consts.NVIDIADriverOwnerLabel)
}
return nil
}

// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
if d.UsePrecompiled == nil {
Expand Down
14 changes: 14 additions & 0 deletions api/nvidia/v1alpha1/nvidiadriver_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,26 @@ import (
"testing"

"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
testDigest = "10d1df8034373061366d4fb17b364b3b28d766b54d5a0b700c1a5a75378cf125"
)

func TestNVIDIADriverHasDeletionTimestamp(t *testing.T) {
var nilDriver *NVIDIADriver
require.False(t, nilDriver.HasDeletionTimestamp())
require.False(t, (&NVIDIADriver{}).HasDeletionTimestamp())

deletionTimestamp := metav1.Now()
require.True(t, (&NVIDIADriver{
ObjectMeta: metav1.ObjectMeta{
DeletionTimestamp: &deletionTimestamp,
},
}).HasDeletionTimestamp())
}

func TestGetImagePath(t *testing.T) {
testCases := []struct {
description string
Expand Down
19 changes: 18 additions & 1 deletion bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ spec:
- jsonPath: .status.state
name: Status
type: string
- jsonPath: .spec.default
name: Default
type: boolean
- jsonPath: .metadata.creationTimestamp
name: Age
type: string
Expand All @@ -48,7 +51,10 @@ spec:
metadata:
type: object
spec:
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
description: |-
NVIDIADriverSpec defines the desired state of NVIDIADriver.
The CEL validation allows non-default drivers to use nodeSelector, but requires
default drivers to leave nodeSelector unset or empty.
properties:
annotations:
additionalProperties:
Expand All @@ -70,6 +76,12 @@ spec:
name:
type: string
type: object
default:
default: false
description: |-
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
that do not match any non-default NVIDIADriver nodeSelector.
type: boolean
driverType:
default: gpu
description: DriverType defines NVIDIA driver type
Expand Down Expand Up @@ -968,9 +980,14 @@ spec:
type: string
type: object
required:
- default
- driverType
- image
type: object
x-kubernetes-validations:
- message: default NVIDIADriver must not use nodeSelector
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
size(self.nodeSelector) == 0 : true'
status:
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
properties:
Expand Down
19 changes: 18 additions & 1 deletion config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ spec:
- jsonPath: .status.state
name: Status
type: string
- jsonPath: .spec.default
name: Default
type: boolean
- jsonPath: .metadata.creationTimestamp
name: Age
type: string
Expand All @@ -48,7 +51,10 @@ spec:
metadata:
type: object
spec:
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
description: |-
NVIDIADriverSpec defines the desired state of NVIDIADriver.
The CEL validation allows non-default drivers to use nodeSelector, but requires
default drivers to leave nodeSelector unset or empty.
properties:
annotations:
additionalProperties:
Expand All @@ -70,6 +76,12 @@ spec:
name:
type: string
type: object
default:
default: false
description: |-
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
that do not match any non-default NVIDIADriver nodeSelector.
type: boolean
driverType:
default: gpu
description: DriverType defines NVIDIA driver type
Expand Down Expand Up @@ -968,9 +980,14 @@ spec:
type: string
type: object
required:
- default
- driverType
- image
type: object
x-kubernetes-validations:
- message: default NVIDIADriver must not use nodeSelector
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
size(self.nodeSelector) == 0 : true'
status:
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
properties:
Expand Down
89 changes: 84 additions & 5 deletions controllers/nvidiadriver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
"github.com/NVIDIA/gpu-operator/internal/conditions"
"github.com/NVIDIA/gpu-operator/internal/consts"
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
"github.com/NVIDIA/gpu-operator/internal/state"
"github.com/NVIDIA/gpu-operator/internal/validator"
)
Expand Down Expand Up @@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
if apierrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
return reconcile.Result{}, nil
// Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
return r.cleanupNVIDIADriverOwnerLabels(ctx)
}
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
logger.Error(err, "error getting NVIDIADriver object")
Expand All @@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Error reading the object - requeue the request.
return reconcile.Result{}, wrappedErr
}
if instance.HasDeletionTimestamp() {
logger.Info("NVIDIADriver delete requested; cleaning up owner labels")
return r.cleanupNVIDIADriverOwnerLabels(ctx)
}

// Get the singleton NVIDIA ClusterPolicy object in the cluster.
clusterPolicyList := &gpuv1.ClusterPolicyList{}
Expand Down Expand Up @@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
return reconcile.Result{}, nil
}

changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
if err != nil {
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
instance.Status.State = nvidiav1alpha1.NotReady
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, err
}
if changed {
return reconcile.Result{RequeueAfter: time.Second}, nil
}
Comment thread
cdesiniotis marked this conversation as resolved.

if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
logger.Error(err, "unsupported driver combination detected")
Expand Down Expand Up @@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Sync state and update status
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil {
logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
return reconcile.Result{}, err
}

// update CR status
if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
return ctrl.Result{}, err
Expand Down Expand Up @@ -276,6 +298,63 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
return reconcileRequests
}

// enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
// event and all current NVIDIADriver instances. The triggering object is
// included even for delete events so the NotFound reconcile path can clear
// stale node owner labels.
func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
requests := r.enqueueAllNVIDIADrivers(ctx)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of doing multiple list-alls of NVIDIADriver CR, how about creating a struct field (a map/set) in the NVIDIADriverReconciler struct which holds the unique set of all NVIDIADriver CRs? We'd add a new NVIDIADriver CR to the set if it's processed for the first time. We can remove the NVIDIADriver CR from this set when it is marked for deletion.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will create a second source of truth beside the controller-runtime cache. We'll have to deal with startup correctness, delete correctness, concurrency, etc. The current r.List(ctx, list) is normally served from the informer cache, not a live API-server request, so it is much cheaper than it looks. It is also much safer because it reflects the cache’s current view after watch/list reconciliation. Are you seeing any performance issues with current approach? Just want to be clear on the issue you might be seeing with current approach.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it may not be an issue in clusters with a few NVIDIADrivers. This becomes a problem in large clusters where it's also possible to have several NVIDIADriver CRs to loop through. In these scenarios, we are bound to hit scaling problems.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should move the assignOwners() method to its own controller, that way it can reconcile on its own when needed and add ownership label. Then, we don't need to redo those calculations again and again on each reconcile in nvidiadriver CR and it can focus just on driver enablement. I'll add ownership-reconciler in a follow up PR which will improve things a lot in large clusters with lots of nvidiadrivers. I'll open follow up PRs for this.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rahulait in addition to moving assignOwners() to its own controller, what are your thoughts on moving the other label operations, like labelNodesWithOrphanDriverPods(), to this separate controller? Would that be possible / desirable? I am also thinking about whether it makes sense to move all of the node labeling that GPU Operator does (primarily in the clusterpolicy controller) to a dedicated controller.

if driver != nil {
requests = append(requests, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: driver.GetName(),
Namespace: driver.GetNamespace(),
},
})
}
return dedupeReconcileRequests(requests)
}

func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request {
seen := map[types.NamespacedName]struct{}{}
deduped := make([]reconcile.Request, 0, len(requests))
for _, request := range requests {
if _, ok := seen[request.NamespacedName]; ok {
continue
}
seen[request.NamespacedName] = struct{}{}
deduped = append(deduped, request)
}
return deduped
}

// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
// when ClusterPolicy is still configured for NVIDIADriver mode. This lets
// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) (reconcile.Result, error) {
clusterPolicyList := &gpuv1.ClusterPolicyList{}
if err := r.List(ctx, clusterPolicyList); err != nil {
wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err)
log.FromContext(ctx).Error(wrappedErr, "failed to cleanup NVIDIADriver owner labels")
return reconcile.Result{}, wrappedErr
}

if len(clusterPolicyList.Items) == 0 {
return reconcile.Result{}, nil
Comment thread
rahulait marked this conversation as resolved.
}

if !clusterPolicyList.Items[0].Spec.Driver.UseNvidiaDriverCRDType() {
return reconcile.Result{}, nil
}

if _, err := nvidiadriverutil.AssignOwners(ctx, r.Client); err != nil {
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
return reconcile.Result{}, err
}
return reconcile.Result{}, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
// Create state manager
Expand Down Expand Up @@ -307,8 +386,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.

// Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
// enqueue a reconcile request for all NVIDIADriver instances.
nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
return r.enqueueAllNVIDIADrivers(ctx)
nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
return r.enqueueNVIDIADriverReconcilers(ctx, driver)
}

// Watch for changes to the primary resource NVIDIADriver
Expand Down
Loading
Loading