-
Notifications
You must be signed in to change notification settings - Fork 196
[FEA] Multi-node Out of Core Streaming KMeans API #2066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
66d7fd3
07707af
efc270f
0a09e6f
99a5730
a077406
d659875
ec2e8b7
03a6473
42a8d9d
86af2fa
d4e4e2c
0819af5
e0f079c
c2f7390
b9c3102
e3956c1
986d78a
7197b71
84ab315
47d4b94
a8e1d26
384d054
455b286
5462809
6ba759c
e76eaac
afbefdf
e62a63c
e4f08bf
6e4a8f0
4a8a85c
bbf2a9f
410092c
c515c1e
e8e63ab
30c457c
ab96623
269f23c
80a22ca
ac06b05
855624a
0a6748d
7055272
0569340
8cac63a
f6df4ae
9fc74b1
dec3dc4
0d030a2
b1c034e
a482495
8ecfdc1
1e1525e
ec22e07
d2e410d
b791c38
a05a006
73293cf
880c7b9
e2035ec
e28c200
55bbdad
9a9b8ee
a800b27
3db8582
c048352
2f968f8
affe85a
c6dea64
7dfab3e
7a383da
ce6c4b5
5a06a44
419619a
2d716ae
066092b
bbdd66d
12d682c
9e5e55c
28cda6a
bfb5290
add9db1
6c08a7b
acbcd5a
af606bc
41c66b8
f664c2c
5430f42
b2ab5bd
bbdf521
10e6def
2040145
1828462
5c5b8c8
05da5f3
90435c1
db41338
6c2c03d
7f6d664
f8270e2
bbf0302
a14a6bc
7b54a42
d86b8b4
6e11f67
9f5b6e5
aaef638
920a460
548d7db
9f3a486
c93f248
51fbf6c
d327569
b5e66a3
a636188
d3cafed
1b547f4
72cfd43
4d25e95
81155e6
85522aa
00336b5
6585866
aa6f28e
178a7e7
713bc7c
c576d8f
a401a0e
8102596
00d0adb
caefd53
8f6f83d
d88a991
1b57b74
7bac418
9851017
f572877
edaa7e7
588bb6a
ad180ed
72cc34b
ed50703
28f6036
a811c56
95f334c
d176314
1db9e02
089e970
6cc895c
4abe6f2
ebf188a
785e4a3
9a526c8
f08e581
51efb42
3e3cac7
7ffae6d
447e136
50d0359
b1e7521
f8a3503
272a9d5
1f9fd9e
ed705c6
c8135e5
cf5e831
8b6eab2
100f7d6
46d18d1
db23a72
1e7c119
116a6cf
ae284be
d15cb28
653aac1
92588ba
ef60e3c
8831789
3f800a1
32be863
d0fa3a9
0b969c4
040e82b
d65287d
3321926
de57f46
1a6bbb5
e627af4
fd19b43
993b813
bd3e1de
40c77ca
079c216
7a38f62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
|
|
||
| #include <cuvs/core/export.hpp> | ||
| #include <optional> | ||
| #include <vector> | ||
|
|
||
| namespace CUVS_EXPORT cuvs { | ||
| namespace cluster { | ||
|
|
@@ -94,7 +95,15 @@ struct params : base_params { | |
| int n_init = 1; | ||
|
|
||
| /** | ||
| * Oversampling factor for use in the k-means|| algorithm | ||
| * Oversampling factor for use in the k-means|| algorithm. | ||
| * | ||
| * In the single-GPU path the value `0` is overloaded as an algorithm switch | ||
| * that selects the classic sequential k-means++ instead of the scalable | ||
| * variant. Any value `> 0` is used as-is. | ||
| * | ||
| * In the multi-GPU path (@ref cuvs::cluster::kmeans::mg::fit with device- | ||
| * resident inputs) any value `< 1.0` (including `0`) is internally clamped to `1.0`. | ||
| * Values `>= 1.0` are passed through unchanged. | ||
| */ | ||
| double oversampling_factor = 2.0; | ||
|
|
||
|
|
@@ -137,11 +146,13 @@ struct params : base_params { | |
| /** | ||
| * Number of samples to process per GPU batch when fitting with host data. | ||
| * When set to 0, defaults to n_samples (process all at once). | ||
| * Only used by the batched (host-data) code path and ignored by device-data | ||
| * overloads. | ||
| * Only used by the batched (host-data) code path and ignored by | ||
| * device-data overloads. | ||
| * | ||
| * In multi-GPU mode, this is a per-rank batch size. Each rank processes up to | ||
| * this many local samples per batch, clamped to that rank's local sample count. | ||
| * In multi-GPU mode this is a per-rank batch size: each rank processes up | ||
| * to this many local samples per batch, clamped to that rank's local sample | ||
| * count. When MG is invoked with device partitions, the runtime ignores | ||
| * `streaming_batch_size` and processes each partition in full. | ||
| * Default: 0 (process all data at once). | ||
| */ | ||
| int64_t streaming_batch_size = 0; | ||
|
|
@@ -179,26 +190,22 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 }; | |
|
|
||
| /** | ||
| * @brief Find clusters with k-means algorithm using batched processing of host data. | ||
| * Single-GPU only. | ||
| * | ||
| * Multi-GPU migration (breaking change in cuVS 26.08): earlier releases | ||
| * silently dispatched this single-GPU overload to a multi-GPU implementation | ||
| * when the supplied RAFT handle had RAFT comms or an SNMG clique attached. That | ||
| * implicit dispatch has been removed: this overload is now strictly | ||
| * single-GPU. If `handle` carries communications/clique state it is ignored and the call falls back | ||
| * to the single-GPU path. To run on multiple GPUs, call `cuvs::cluster::kmeans::mg::fit` | ||
| * explicitly. | ||
| * | ||
| * TODO: Evaluate replacing the extent type with int64_t. Reference issue: | ||
| * https://github.com/rapidsai/cuvs/issues/1961 | ||
| * | ||
| * This overload supports out-of-core computation where the dataset resides | ||
| * on the host. Data is processed in GPU-sized batches, streaming from host to device. | ||
| * The batch size is controlled by params.streaming_batch_size. In multi-GPU mode, | ||
| * this is a per-rank batch size. | ||
| * | ||
| * Multi-GPU dispatch is selected automatically based on the handle state: | ||
| * - If `raft::resource::is_multi_gpu(handle)` (cuVS SNMG): the full dataset X | ||
| * is split across GPUs internally with an OpenMP parallel region and NCCL. | ||
| * - If `raft::resource::comms_initialized(handle)` (Dask/Ray/MPI): X is treated as | ||
| * this worker's partition, and RAFT communicators are used for collectives. | ||
| * - Otherwise: single-GPU batched k-means. | ||
| * | ||
| * With `params.init == InitMethod::KMeansPlusPlus` in multi-GPU mode, the | ||
| * effective initialization sample must fit in GPU memory on every rank because | ||
| * it is materialized on every device. Rank 0 must also have enough GPU memory | ||
| * for the seeding workspace before centroids are broadcast. | ||
| * on the host. Data is processed in batches, streaming from host to | ||
| * device. The batch size is controlled by `params.streaming_batch_size`. | ||
| * | ||
| * @code{.cpp} | ||
| * #include <raft/core/resources.hpp> | ||
|
|
@@ -229,8 +236,7 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 }; | |
| * raft::make_host_scalar_view(&n_iter)); | ||
| * @endcode | ||
| * | ||
| * @param[in] handle The raft handle. When a multi-GPU resource is | ||
| * attached, multi-GPU dispatch is used automatically. | ||
| * @param[in] handle The raft handle. | ||
| * @param[in] params Parameters for KMeans model. Batch size is read from | ||
| * params.streaming_batch_size. | ||
| * @param[in] X Training instances on HOST memory. The data must | ||
|
|
@@ -1607,6 +1613,234 @@ void cluster_cost( | |
| * @} | ||
| */ | ||
|
|
||
| #ifdef CUVS_BUILD_MG_ALGOS | ||
| namespace mg { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One compatibility question: the public header now declares cuvs::cluster::kmeans::mg::fit unconditionally, while cpp/CMakeLists.txt only compiles kmeans_fit_mg_float.cu and kmeans_fit_mg_double.cu under BUILD_MG_ALGOS. If a non-MG build installs this header, callers can compile against mg::fit but fail at link time. Would it be better to guard those declarations or provide stubs with a clear error?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair enough! I'll add the guarding macro in the header itself.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am also thinking about removing the ::mg namespace and rather naming the API |
||
| /** | ||
| * @defgroup kmeans_mg Multi-GPU / out-of-core k-means fit | ||
| * @{ | ||
| * | ||
| * @brief Explicit multi-GPU k-means entry points. | ||
| * | ||
| * All multi-GPU k-means APIs live in this namespace | ||
| * (`cuvs::cluster::kmeans::mg`). To run k-means on multiple GPUs a `handle` that | ||
| * carries either an SNMG clique (`raft::resource::is_multi_gpu(handle)`) or | ||
| * initialized RAFT comms (`raft::resource::comms_initialized(handle)`) must be used. | ||
| * | ||
| * Migration from earlier releases (breaking change in cuVS 26.08): before | ||
| * this release, the single-GPU `cuvs::cluster::kmeans::fit` overloads would | ||
| * silently dispatch to the multi-GPU backend when the supplied `handle` | ||
| * carried RAFT comms or an SNMG clique. That implicit dispatch has been | ||
| * removed: the single-GPU `fit` is now strictly single-GPU. Existing | ||
| * multi-GPU call sites must be updated to invoke | ||
| * `cuvs::cluster::kmeans::mg::fit` directly. Two flavors of the multi-GPU | ||
| * API are provided here: | ||
| * - A single mdspan per rank (drop-in replacement for the old single-GPU | ||
| * signature; cuVS wraps it into a one-element vector internally). | ||
| * - A `std::vector` of mdspan partitions per rank (multiple partitions per | ||
| * rank, typical for Dask/Ray and out-of-core host-data flows). | ||
| * | ||
| * @code{.cpp} | ||
| * // Before (cuVS <= 26.06): implicit multi-GPU dispatch via single-GPU API. | ||
| * // raft::resources handle; // attached to NCCL comms or SNMG clique | ||
| * // cuvs::cluster::kmeans::fit(handle, params, local_X, std::nullopt, | ||
| * // centroids, inertia, n_iter); | ||
| * | ||
| * // After (cuVS >= 26.08): explicit multi-GPU API. | ||
| * raft::resources handle; // NCCL comms or SNMG clique attached | ||
| * cuvs::cluster::kmeans::mg::fit(handle, params, local_X, std::nullopt, | ||
| * centroids, inertia, n_iter); | ||
| * @endcode | ||
| */ | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit with one or more local data | ||
| * partitions per rank. | ||
| * | ||
| * Each rank supplies its local training data as a vector of partitions. For | ||
| * host-resident partitions the implementation streams each partition through | ||
| * Lloyd iterations using `params.streaming_batch_size` (per rank). For | ||
| * device-resident partitions `streaming_batch_size` is ignored and each local | ||
| * partition is processed in full. | ||
| * | ||
| * The active backend is selected by the resources attached to | ||
| * `handle`: | ||
| * - When `raft::resource::is_multi_gpu(handle)` is true (SNMG clique), the | ||
| * call must be issued from inside an OpenMP region with one thread per | ||
| * rank in the clique. | ||
| * - Otherwise, multi-process NCCL comms must be initialized on the handle | ||
| * (`raft::resource::comms_initialized(handle)`); each process supplies its | ||
| * own local partitions. | ||
| * | ||
| * @param[in] handle The raft handle. Must have NCCL comms or | ||
| * a SNMG clique initialized. | ||
| * @param[in] params K-means parameters. For host-resident | ||
| * partitions the per-rank streaming batch | ||
| * size is read from | ||
| * `params.streaming_batch_size`; it is | ||
| * ignored for device-resident partitions. | ||
| * @param[in] X_parts Per-partition local data on this rank. | ||
| * Each entry is [n_rows_i x n_features]. | ||
| * @param[in] sample_weight_parts Optional per-partition row weights with | ||
| * one vector per data partition. | ||
| * @param[inout] centroids Device matrix [n_clusters x n_features]. | ||
| * On entry, used as the initial centers | ||
| * when `params.init == InitMethod::Array`. | ||
| * On return, holds the converged | ||
| * centroids. | ||
| * @param[out] inertia Host scalar receiving the final | ||
| * clustering cost. | ||
| * @param[out] n_iter Host scalar receiving the iteration | ||
| * count at which the run terminated. | ||
| */ | ||
| void fit( | ||
| raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::device_matrix_view<const float, int>>& X_parts, | ||
| const std::optional<std::vector<raft::device_vector_view<const float, int>>>& sample_weight_parts, | ||
| raft::device_matrix_view<float, int> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::device_matrix_view<const float, int64_t>>& X_parts, | ||
| const std::optional<std::vector<raft::device_vector_view<const float, int64_t>>>& | ||
| sample_weight_parts, | ||
| raft::device_matrix_view<float, int64_t> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::device_matrix_view<const double, int>>& X_parts, | ||
| const std::optional<std::vector<raft::device_vector_view<const double, int>>>& | ||
| sample_weight_parts, | ||
| raft::device_matrix_view<double, int> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::device_matrix_view<const double, int64_t>>& X_parts, | ||
| const std::optional<std::vector<raft::device_vector_view<const double, int64_t>>>& | ||
| sample_weight_parts, | ||
| raft::device_matrix_view<double, int64_t> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU / out-of-core k-means fit. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::host_matrix_view<const float, int64_t>>& X_parts, | ||
| const std::optional<std::vector<raft::host_vector_view<const float, int64_t>>>& | ||
| sample_weight_parts, | ||
| raft::device_matrix_view<float, int64_t> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU / out-of-core k-means fit. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| const std::vector<raft::host_matrix_view<const double, int64_t>>& X_parts, | ||
| const std::optional<std::vector<raft::host_vector_view<const double, int64_t>>>& | ||
| sample_weight_parts, | ||
| raft::device_matrix_view<double, int64_t> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit, single mdspan per rank. | ||
| * | ||
| * Convenience overload for the common case where each rank has exactly one | ||
| * local partition. The mdspan is wrapped in a one-element vector and routed | ||
| * through the vector-of-partitions overload above. See that overload's | ||
| * documentation for backend selection and handle requirements. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::device_matrix_view<const float, int> X, | ||
| std::optional<raft::device_vector_view<const float, int>> sample_weight, | ||
| raft::device_matrix_view<float, int> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit, single mdspan per rank. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::device_matrix_view<const float, int64_t> X, | ||
| std::optional<raft::device_vector_view<const float, int64_t>> sample_weight, | ||
| raft::device_matrix_view<float, int64_t> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit, single mdspan per rank. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::device_matrix_view<const double, int> X, | ||
| std::optional<raft::device_vector_view<const double, int>> sample_weight, | ||
| raft::device_matrix_view<double, int> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU k-means fit, single mdspan per rank. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::device_matrix_view<const double, int64_t> X, | ||
| std::optional<raft::device_vector_view<const double, int64_t>> sample_weight, | ||
| raft::device_matrix_view<double, int64_t> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU / out-of-core k-means fit, single mdspan per rank. | ||
| * | ||
| * Dispatches to the SNMG-clique (batched per-rank) backend when the handle | ||
| * carries an SNMG clique, and to the NCCL multi-process backend otherwise. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::host_matrix_view<const float, int64_t> X, | ||
| std::optional<raft::host_vector_view<const float, int64_t>> sample_weight, | ||
| raft::device_matrix_view<float, int64_t> centroids, | ||
| raft::host_scalar_view<float> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @brief Multi-GPU / out-of-core k-means fit, single mdspan per rank. | ||
| */ | ||
| void fit(raft::resources const& handle, | ||
| const cuvs::cluster::kmeans::params& params, | ||
| raft::host_matrix_view<const double, int64_t> X, | ||
| std::optional<raft::host_vector_view<const double, int64_t>> sample_weight, | ||
| raft::device_matrix_view<double, int64_t> centroids, | ||
| raft::host_scalar_view<double> inertia, | ||
| raft::host_scalar_view<int64_t> n_iter); | ||
|
|
||
| /** | ||
| * @} | ||
| */ | ||
| } // namespace mg | ||
| #endif | ||
|
|
||
| namespace helpers { | ||
| /** | ||
| * @defgroup kmeans_helpers k-means API helpers | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.