diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml index 853cf6fc9..ea6a7b7fe 100644 --- a/assets/state-driver/0500_daemonset.yaml +++ b/assets/state-driver/0500_daemonset.yaml @@ -128,8 +128,6 @@ spec: - name: run-mellanox-drivers mountPath: /run/mellanox/drivers mountPropagation: HostToContainer - - name: sysfs-memory-online - mountPath: /sys/devices/system/memory/auto_online_blocks - name: firmware-search-path mountPath: /sys/module/firmware_class/parameters/path - name: nv-firmware @@ -320,9 +318,6 @@ spec: - name: firmware-search-path hostPath: path: /sys/module/firmware_class/parameters/path - - name: sysfs-memory-online - hostPath: - path: /sys/devices/system/memory/auto_online_blocks - name: nv-firmware hostPath: path: /run/nvidia/driver/lib/firmware diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e9767a20..05881ded4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1053,6 +1053,8 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C } } + applyMemoryHotplugAutoOnlineMount(&obj.Spec.Template.Spec, n.memoryHotplugAutoOnline) + // Compute driver configuration digest after all transformations are complete. // This digest enables fast-path driver installation by detecting when configuration // hasn't changed, avoiding unnecessary driver reinstalls and pod evictions. 
@@ -3569,6 +3571,80 @@ func applyLicensingConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec
 	podSpec.Volumes = append(podSpec.Volumes, licensingConfigVol)
 }
 
+// applyMemoryHotplugAutoOnlineMount reconciles the "sysfs-memory-online" hostPath
+// volume (backed by /sys/devices/system/memory/auto_online_blocks) on podSpec.
+//
+// When enabled is true, the volume is added to podSpec and mounted into the
+// "nvidia-driver-ctr" container if that container is present; entries that already
+// exist (matched by name) are left untouched. When enabled is false, every volume
+// mount and volume with that name is removed from all containers and from podSpec.
+func applyMemoryHotplugAutoOnlineMount(podSpec *corev1.PodSpec, enabled bool) {
+	const (
+		volumeName = "sysfs-memory-online"
+		mountPath  = "/sys/devices/system/memory/auto_online_blocks"
+	)
+
+	if !enabled {
+		// Filter in place: the kept slices reuse the original backing arrays.
+		for idx := range podSpec.Containers {
+			ctr := &podSpec.Containers[idx]
+			keptMounts := ctr.VolumeMounts[:0]
+			for _, vm := range ctr.VolumeMounts {
+				if vm.Name != volumeName {
+					keptMounts = append(keptMounts, vm)
+				}
+			}
+			ctr.VolumeMounts = keptMounts
+		}
+
+		keptVolumes := podSpec.Volumes[:0]
+		for _, vol := range podSpec.Volumes {
+			if vol.Name != volumeName {
+				keptVolumes = append(keptVolumes, vol)
+			}
+		}
+		podSpec.Volumes = keptVolumes
+		return
+	}
+
+	// Only the driver container receives the mount; a missing container is not an error here.
+	if ctr := findContainerByName(podSpec.Containers, "nvidia-driver-ctr"); ctr != nil {
+		if !hasVolumeMount(ctr.VolumeMounts, volumeName) {
+			mount := corev1.VolumeMount{Name: volumeName, MountPath: mountPath}
+			ctr.VolumeMounts = append(ctr.VolumeMounts, mount)
+		}
+	}
+
+	if hasVolume(podSpec.Volumes, volumeName) {
+		return
+	}
+	hostPath := &corev1.HostPathVolumeSource{Path: mountPath}
+	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
+		Name:         volumeName,
+		VolumeSource: corev1.VolumeSource{HostPath: hostPath},
+	})
+}
+
+// hasVolumeMount reports whether volumeMounts contains an entry named name.
+func hasVolumeMount(volumeMounts []corev1.VolumeMount, name string) bool {
+	for i := range volumeMounts {
+		if volumeMounts[i].Name == name {
+			return true
+		}
+	}
+	return false
+}
+
+// hasVolume reports whether volumes contains an entry named name.
+func hasVolume(volumes []corev1.Volume, name string) bool {
+	for i := range volumes {
+		if volumes[i].Name == name {
+			return true
+		}
+	}
+	return false
+}
+
 func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
 	podSpec := &obj.Spec.Template.Spec
driverContainer := findContainerByName(podSpec.Containers, "nvidia-driver-ctr") diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..c02078b4e 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -39,31 +39,32 @@ import ( ) const ( - commonGPULabelKey = "nvidia.com/gpu.present" - commonGPULabelValue = "true" - commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands" - commonOperandsLabelValue = "true" - migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager" - migManagerLabelValue = "true" - migCapableLabelKey = "nvidia.com/mig.capable" - migCapableLabelValue = "true" - migConfigLabelKey = "nvidia.com/mig.config" - migConfigDisabledValue = "all-disabled" - vgpuHostDriverLabelKey = "nvidia.com/vgpu.host-driver-version" - gpuProductLabelKey = "nvidia.com/gpu.product" - nfdLabelPrefix = "feature.node.kubernetes.io/" - nfdKernelLabelKey = "feature.node.kubernetes.io/kernel-version.full" - nfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" - nfdOSReleaseIDLabelKey = "feature.node.kubernetes.io/system-os_release.ID" - nfdOSVersionIDLabelKey = "feature.node.kubernetes.io/system-os_release.VERSION_ID" - ocpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos" - ocpDriverToolkitIdentificationLabel = "openshift.driver-toolkit" - appLabelKey = "app" - ocpDriverToolkitIdentificationValue = "true" - ocpNamespaceMonitoringLabelKey = "openshift.io/cluster-monitoring" - ocpNamespaceMonitoringLabelValue = "true" - precompiledIdentificationLabelKey = "nvidia.com/precompiled" - precompiledIdentificationLabelValue = "true" + commonGPULabelKey = "nvidia.com/gpu.present" + commonGPULabelValue = "true" + commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands" + commonOperandsLabelValue = "true" + migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager" + migManagerLabelValue = "true" + migCapableLabelKey = "nvidia.com/mig.capable" + migCapableLabelValue = "true" + 
migConfigLabelKey = "nvidia.com/mig.config" + migConfigDisabledValue = "all-disabled" + vgpuHostDriverLabelKey = "nvidia.com/vgpu.host-driver-version" + gpuProductLabelKey = "nvidia.com/gpu.product" + nfdLabelPrefix = "feature.node.kubernetes.io/" + nfdKernelLabelKey = "feature.node.kubernetes.io/kernel-version.full" + nfdKernelConfigMemoryHotplugLabelKey = "feature.node.kubernetes.io/kernel-config.MEMORY_HOTPLUG" + nfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" + nfdOSReleaseIDLabelKey = "feature.node.kubernetes.io/system-os_release.ID" + nfdOSVersionIDLabelKey = "feature.node.kubernetes.io/system-os_release.VERSION_ID" + ocpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos" + ocpDriverToolkitIdentificationLabel = "openshift.driver-toolkit" + appLabelKey = "app" + ocpDriverToolkitIdentificationValue = "true" + ocpNamespaceMonitoringLabelKey = "openshift.io/cluster-monitoring" + ocpNamespaceMonitoringLabelValue = "true" + precompiledIdentificationLabelKey = "nvidia.com/precompiled" + precompiledIdentificationLabelValue = "true" // see bundle/manifests/gpu-operator.clusterserviceversion.yaml // --> ClusterServiceVersion.metadata.annotations.operatorframework.io/suggested-namespace ocpSuggestedNamespace = "nvidia-gpu-operator" @@ -164,12 +165,13 @@ type ClusterPolicyController struct { openshift string ocpDriverToolkit OpenShiftDriverToolkit - runtime gpuv1.Runtime - gpuNodeOSTag string - gpuNodeOSRelease string - hasGPUNodes bool - hasNFDLabels bool - sandboxEnabled bool + runtime gpuv1.Runtime + gpuNodeOSTag string + gpuNodeOSRelease string + hasGPUNodes bool + hasNFDLabels bool + memoryHotplugAutoOnline bool + sandboxEnabled bool } func addState(n *ClusterPolicyController, path string) { @@ -529,6 +531,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { } clusterHasNFDLabels := false gpuNodesTotal := 0 + memoryHotplugAutoOnline := true for _, node := range list.Items { node := node 
updateLabels := false @@ -569,6 +572,10 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { } if hasCommonGPULabel(labels) { + if labels[nfdKernelConfigMemoryHotplugLabelKey] != "true" { + memoryHotplugAutoOnline = false + } + // If node has GPU, then add state labels as per the workload type n.logger.Info("Checking GPU state labels on the node", "NodeName", node.Name) if gpuWorkloadConfig.updateGPUStateLabels(labels) { @@ -616,6 +623,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { } } // end node loop + n.memoryHotplugAutoOnline = gpuNodesTotal > 0 && memoryHotplugAutoOnline n.logger.Info("Number of nodes with GPU label", "NodeCount", gpuNodesTotal) n.operatorMetrics.gpuNodesTotal.Set(float64(gpuNodesTotal)) return clusterHasNFDLabels, gpuNodesTotal, nil diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index a0ad8bdd4..0eb6c1ec5 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -584,6 +584,9 @@ node-feature-discovery: effect: NoSchedule config: sources: + kernel: + configOpts: + - "MEMORY_HOTPLUG" pci: deviceClassWhitelist: - "02" diff --git a/internal/state/driver.go b/internal/state/driver.go index 355910a8e..85d227368 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -89,15 +89,16 @@ type additionalConfigs struct { } type driverRenderData struct { - Driver *driverSpec - GDS *gdsDriverSpec - GPUDirectRDMA *nvidiav1alpha1.GPUDirectRDMASpec - GDRCopy *gdrcopyDriverSpec - Runtime *driverRuntimeSpec - Openshift *openshiftSpec - Precompiled *precompiledSpec - AdditionalConfigs *additionalConfigs - HostRoot string + Driver *driverSpec + GDS *gdsDriverSpec + GPUDirectRDMA *nvidiav1alpha1.GPUDirectRDMASpec + GDRCopy *gdrcopyDriverSpec + Runtime *driverRuntimeSpec + Openshift *openshiftSpec + Precompiled *precompiledSpec + AdditionalConfigs *additionalConfigs + HostRoot string + MemoryHotplugAutoOnline bool } // 
ConfigDigest computes a hash of all driver-install-relevant fields. @@ -301,6 +302,7 @@ func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1 return nil, fmt.Errorf("failed to construct driver spec: %w", err) } renderData.Driver = driverSpec + renderData.MemoryHotplugAutoOnline = nodePool.memoryHotplugAutoOnline if cr.Spec.UsePrecompiledDrivers() { renderData.Precompiled = &precompiledSpec{ diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 02b2736f2..622e35b0b 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -142,6 +142,28 @@ func TestDriverRenderMinimal(t *testing.T) { require.Equal(t, string(o), actual) } +func TestDriverRenderSkipsMemoryHotplugMountWhenUnsupported(t *testing.T) { + state, err := NewStateDriver(nil, "", nil, manifestDir) + require.Nil(t, err) + stateDriver, ok := state.(*stateDriver) + require.True(t, ok) + + renderData := getMinimalDriverRenderData() + renderData.MemoryHotplugAutoOnline = false + + objs, err := stateDriver.renderer.RenderObjects( + &render.TemplatingData{ + Data: renderData, + }) + require.Nil(t, err) + require.NotEmpty(t, objs) + + actual, err := getYAMLString(objs) + require.Nil(t, err) + require.NotContains(t, actual, "sysfs-memory-online") + require.NotContains(t, actual, "/sys/devices/system/memory/auto_online_blocks") +} + func TestDriverHostNetwork(t *testing.T) { const ( testName = "driver-hostnetwork" @@ -816,7 +838,8 @@ func getMinimalDriverRenderData() *driverRenderData { Runtime: &driverRuntimeSpec{ Namespace: "test-operator", }, - HostRoot: "", + HostRoot: "", + MemoryHotplugAutoOnline: true, } } diff --git a/internal/state/nodepool.go b/internal/state/nodepool.go index c9781cce0..d52a96e3d 100644 --- a/internal/state/nodepool.go +++ b/internal/state/nodepool.go @@ -29,20 +29,22 @@ import ( ) const ( - nfdKernelLabelKey = "feature.node.kubernetes.io/kernel-version.full" - nfdOSTreeVersionLabelKey = 
"feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" + nfdKernelLabelKey = "feature.node.kubernetes.io/kernel-version.full" + nfdKernelConfigMemoryHotplugLabelKey = "feature.node.kubernetes.io/kernel-config.MEMORY_HOTPLUG" + nfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" ) // TODO: move this code to it's own module? // TODO: add unit tests type nodePool struct { - name string - osRelease string - osVersion string - osTag string - rhcosVersion string - kernel string - nodeSelector map[string]string + name string + osRelease string + osVersion string + osTag string + rhcosVersion string + kernel string + nodeSelector map[string]string + memoryHotplugAutoOnline bool } // getNodePools partitions nodes into one or more node pools. The list of nodes to partition @@ -80,6 +82,7 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str nodePool := nodePool{} nodePool.nodeSelector = make(map[string]string) maps.Copy(nodePool.nodeSelector, nodeSelector) + nodePool.memoryHotplugAutoOnline = nodeLabels[nfdKernelConfigMemoryHotplugLabelKey] == "true" osID, ok := nodeLabels[nfdOSReleaseIDLabelKey] if !ok { @@ -126,10 +129,14 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str nodePool.name = rhcosVersion } - if _, exists := nodePoolMap[nodePool.name]; !exists { - logger.Info("Detected new node pool", "NodePool", nodePool) - nodePoolMap[nodePool.name] = nodePool + if existing, exists := nodePoolMap[nodePool.name]; exists { + existing.memoryHotplugAutoOnline = existing.memoryHotplugAutoOnline && nodePool.memoryHotplugAutoOnline + nodePoolMap[nodePool.name] = existing + continue } + + logger.Info("Detected new node pool", "NodePool", nodePool) + nodePoolMap[nodePool.name] = nodePool } var nodePools []nodePool diff --git a/internal/state/nodepool_test.go b/internal/state/nodepool_test.go index 6d175d7d2..f51ec2908 100644 --- a/internal/state/nodepool_test.go +++ 
b/internal/state/nodepool_test.go @@ -17,9 +17,14 @@ package state import ( + "context" "testing" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestGetOSTag(t *testing.T) { @@ -95,3 +100,44 @@ func TestGetOSTag(t *testing.T) { }) } } + +func TestGetNodePoolsMemoryHotplugAutoOnline(t *testing.T) { + nodeLabels := map[string]string{ + "nvidia.com/gpu.present": "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "22.04", + nfdKernelConfigMemoryHotplugLabelKey: "true", + } + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-0", Labels: nodeLabels}} + client := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(node).Build() + + pools, err := getNodePools(context.Background(), client, nil, false, false) + require.NoError(t, err) + require.Len(t, pools, 1) + require.True(t, pools[0].memoryHotplugAutoOnline) +} + +func TestGetNodePoolsDisablesMemoryHotplugAutoOnlineForMixedPool(t *testing.T) { + baseLabels := map[string]string{ + "nvidia.com/gpu.present": "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "22.04", + } + nodeWithMemoryHotplug := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-0", Labels: map[string]string{}}} + for key, value := range baseLabels { + nodeWithMemoryHotplug.Labels[key] = value + } + nodeWithMemoryHotplug.Labels[nfdKernelConfigMemoryHotplugLabelKey] = "true" + + nodeWithoutMemoryHotplug := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{}}} + for key, value := range baseLabels { + nodeWithoutMemoryHotplug.Labels[key] = value + } + + client := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(nodeWithMemoryHotplug, nodeWithoutMemoryHotplug).Build() + + pools, err := getNodePools(context.Background(), client, nil, false, false) + require.NoError(t, err) + 
require.Len(t, pools, 1) + require.False(t, pools[0].memoryHotplugAutoOnline) +} diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 5b9c6c62f..7b140d95e 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -325,8 +325,10 @@ spec: mountPropagation: HostToContainer - name: firmware-search-path mountPath: /sys/module/firmware_class/parameters/path + {{- if .MemoryHotplugAutoOnline }} - name: sysfs-memory-online mountPath: /sys/devices/system/memory/auto_online_blocks + {{- end }} - name: nv-firmware mountPath: /lib/firmware - name: driver-startup-probe-script @@ -687,9 +689,11 @@ spec: - name: firmware-search-path hostPath: path: /sys/module/firmware_class/parameters/path + {{- if .MemoryHotplugAutoOnline }} - name: sysfs-memory-online hostPath: path: /sys/devices/system/memory/auto_online_blocks + {{- end }} - name: nv-firmware hostPath: path: /run/nvidia/driver/lib/firmware