Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion c/src/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include <cuvs/cluster/kmeans.hpp>
#include <cuvs/core/c_api.h>

#include <raft/core/copy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/mdspan.hpp>
#include <raft/core/resource/cuda_stream.hpp>

#include "../core/exceptions.hpp"
#include "../core/interop.hpp"

Expand Down Expand Up @@ -213,10 +218,14 @@ void _cluster_cost(cuvsResources_t res,
if (cuvs::core::is_dlpack_device_compatible(X)) {
using mdspan_type = raft::device_matrix_view<const T, IdxT, raft::row_major>;

auto d_cost = raft::make_device_scalar<T>(*res_ptr, T{0});
cuvs::cluster::kmeans::cluster_cost(*res_ptr,
cuvs::core::from_dlpack<mdspan_type>(X_tensor),
cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
raft::make_host_scalar_view<T>(&cost_temp));
d_cost.view());
raft::copy(
*res_ptr, raft::make_host_scalar_view<T>(&cost_temp), raft::make_const_mdspan(d_cost.view()));
raft::resource::sync_stream(*res_ptr);
} else {
RAFT_FAIL("X dataset must be accessible on device memory");
}
Expand Down
32 changes: 23 additions & 9 deletions cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1532,14 +1532,16 @@ void transform(raft::resources const& handle,
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
*
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
* When provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int> X,
raft::device_matrix_view<const float, int> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt);
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const float, int>> X_norm = std::nullopt);
Comment on lines 1538 to +1544

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗄️ Data Integrity & Integration | 🔴 Critical | 🏗️ Heavy lift

CRITICAL: cluster_cost is now source-breaking in the public C++ API.

Switching cost from raft::host_scalar_view to raft::device_scalar_view in cpp/include/cuvs/ removes the old call surface entirely, so existing downstream callers stop compiling with no deprecation path or migration note.

Consider keeping the old host-scalar overloads as deprecated shims for at least one release and documenting the migration. As per coding guidelines, “API changes require deprecation warnings.” As per path instructions for cpp/include/cuvs/**/*, “Breaking changes require deprecation warnings and migration guide updates.”

Also applies to: 1562-1568, 1586-1592, 1610-1616

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/include/cuvs/cluster/kmeans.hpp` around lines 1538 - 1544, `cluster_cost`
in the public kmeans API is source-breaking because `cost` was changed from a
host scalar to a device scalar, so restore compatibility by keeping the existing
host-scalar overloads as deprecated shims alongside the new
`raft::device_scalar_view` overload in `cluster_cost` and the related overloads
at the referenced locations. Update the overload set in `cuvs::cluster::kmeans`
so downstream callers can still compile, emit deprecation warnings from the old
signatures, and add the required migration/deprecation note in the public API
docs for the affected functions.

Sources: Coding guidelines, Path instructions


/**
* @brief Compute cluster cost
Expand All @@ -1554,13 +1556,17 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int> X,
raft::device_matrix_view<const double, int> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt);
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const double, int>> X_norm = std::nullopt);

/**
* @brief Compute (optionally weighted) cluster cost
Expand All @@ -1575,13 +1581,17 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int64_t> X,
raft::device_matrix_view<const float, int64_t> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt);
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const float, int64_t>> X_norm = std::nullopt);

/**
* @brief Compute (optionally weighted) cluster cost
Expand All @@ -1596,13 +1606,17 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int64_t> X,
raft::device_matrix_view<const double, int64_t> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt);
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const double, int64_t>> X_norm = std::nullopt);
/**
* @}
*/
Expand Down
38 changes: 30 additions & 8 deletions cpp/src/cluster/detail/kmeans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,11 @@ void kmeans_fit(
auto centroids_const = raft::make_device_matrix_view<const DataT, IndexT>(
cur_centroids_ptr, n_clusters, n_features);

iter_inertia = DataT{0};
auto d_iter_inertia = raft::make_device_scalar<DataT>(handle, DataT{0});
auto d_batch_cost = raft::make_device_scalar<DataT>(handle, DataT{0});
DataT* p_acc = d_iter_inertia.data_handle();
DataT* p_batch = d_batch_cost.data_handle();

data_batches.reset();
using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
std::optional<wt_iter_t> wt_it;
Expand All @@ -956,15 +960,33 @@ void kmeans_fit(
cur_batch_weights(static_cast<IndexT>(data_batch.offset()), wt_data, cur_batch_size);
}

DataT batch_cost = DataT{0};
cuvs::cluster::kmeans::cluster_cost(handle,
batch_data_view,
centroids_const,
raft::make_host_scalar_view(&batch_cost),
batch_sw);
std::optional<raft::device_vector_view<const DataT, IndexT>> batch_xnorm = std::nullopt;
if (need_compute_norms) {
if constexpr (data_on_device) {
batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
L2NormBatch.data_handle() + data_batch.offset(), cur_batch_size);
} else if (norms_cached) {
raft::copy(L2NormBatch.data_handle(),
h_norm_cache.data_handle() + data_batch.offset(),
cur_batch_size,
stream);
batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
L2NormBatch.data_handle(), cur_batch_size);
}
Comment on lines +963 to +975

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

HIGH: Guard the uncached host-norm path for zero-iteration fits.

Issue: For host data, this final inertia path always copies from h_norm_cache; if max_iter == 0, the training loop never populated it, so inertia is computed from uninitialized norms.
Why: This returns incorrect final inertia for a valid-looking parameter combination because max_iter is not rejected earlier.

Suggested fix
         if (need_compute_norms) {
           if constexpr (data_on_device) {
             batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
               L2NormBatch.data_handle() + data_batch.offset(), cur_batch_size);
           } else {
-            raft::copy(L2NormBatch.data_handle(),
-                       h_norm_cache.data_handle() + data_batch.offset(),
-                       cur_batch_size,
-                       stream);
+            if (norms_cached) {
+              raft::copy(L2NormBatch.data_handle(),
+                         h_norm_cache.data_handle() + data_batch.offset(),
+                         cur_batch_size,
+                         stream);
+            } else {
+              compute_batch_norms(data_batch.data(), cur_batch_size);
+            }
             batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
               L2NormBatch.data_handle(), cur_batch_size);
           }
         }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/src/cluster/detail/kmeans.cuh` around lines 963 - 975, The host data path
(in the else block after checking data_on_device) unconditionally copies from
h_norm_cache without verifying it was populated. When max_iter equals zero, the
training loop never initializes h_norm_cache, causing the raft::copy operation
to read uninitialized memory and compute incorrect final inertia. Add a guard
condition in the else block to check if max_iter is zero, and handle this case
separately (either by skipping the norm computation or computing norms
on-the-fly) before attempting to copy from the unpopulated h_norm_cache.

}

cuvs::cluster::kmeans::cluster_cost(
handle, batch_data_view, centroids_const, d_batch_cost.view(), batch_sw, batch_xnorm);

iter_inertia += batch_cost;
raft::linalg::map_offset(handle,
raft::make_device_vector_view<DataT, int>(p_acc, 1),
[p_acc, p_batch] __device__(int) { return *p_acc + *p_batch; });
}

raft::copy(handle,
raft::make_host_scalar_view<DataT>(&iter_inertia),
raft::make_const_mdspan(d_iter_inertia.view()));
raft::resource::sync_stream(handle);
}

if (iter_inertia < inertia[0]) {
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1139,8 +1139,12 @@ void build_hierarchical(const raft::resources& handle,
reinterpret_cast<const MathT*>(dataset), n_rows, dim);
auto centroids_view =
raft::make_device_matrix_view<const MathT, IdxT>(cluster_centers, n_clusters, dim);
cuvs::cluster::kmeans::cluster_cost(
handle, X_view, centroids_view, raft::make_host_scalar_view<MathT>(inertia));
auto d_inertia = raft::make_device_scalar<MathT>(handle, MathT{0});
cuvs::cluster::kmeans::cluster_cost(handle, X_view, centroids_view, d_inertia.view());
raft::copy(handle,
raft::make_host_scalar_view<MathT>(inertia),
raft::make_const_mdspan(d_inertia.view()));
raft::resource::sync_stream(handle, stream);
Comment on lines +1142 to +1147

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Performance & Scalability | 🟠 Major | ⚡ Quick win

HIGH: final hierarchical inertia still drops the cached dataset norms.

build_hierarchical() has already materialized dataset_norm above, but this cluster_cost() call does not pass it through, so the final inertia path pays for another full row-norm allocation/recompute over n_rows. That gives back much of the win this PR is targeting on the hierarchical path.

Have you considered forwarding dataset_norm here for the L2-based cases that precompute it?

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/src/cluster/detail/kmeans_balanced.cuh` around lines 1142 - 1147, The
final inertia computation in build_hierarchical is still recomputing row norms
instead of reusing the cached dataset_norm. Update the cluster_cost call in this
hierarchical path to forward dataset_norm for the L2 cases that already
precompute it, and keep the existing fallback behavior for other metrics or
paths. Use the build_hierarchical and cluster_cost symbols to locate the call
site and thread the cached norm through the final inertia calculation.

} else {
RAFT_LOG_WARN("Inertia is not computed for non float/double types");
}
Expand Down
49 changes: 17 additions & 32 deletions cpp/src/cluster/kmeans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -382,14 +382,17 @@ void min_cluster_distance(raft::resources const& handle,
* @param[in] centroids Cluster centroids [n_clusters x n_features]
* @param[out] cost Sum of squared distances to nearest centroid (device)
* @param[in] sample_weight Optional per-sample weights [n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
* When provided, the internal norm computation is skipped.
*/
template <typename DataT, typename IndexT>
void cluster_cost(
raft::resources const& handle,
raft::device_matrix_view<const DataT, IndexT> X,
raft::device_matrix_view<const DataT, IndexT> centroids,
raft::device_scalar_view<DataT> cost,
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const DataT, IndexT>> X_norm = std::nullopt)
{
auto stream = raft::resource::get_cuda_stream(handle);
auto n_clusters = centroids.extent(0);
Expand All @@ -398,8 +401,18 @@ void cluster_cost(

rmm::device_uvector<char> workspace(n_samples * sizeof(IndexT), stream);

auto x_norms = raft::make_device_vector<DataT>(handle, n_samples);
raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(handle, X, x_norms.view());
std::optional<raft::device_vector<DataT, IndexT>> x_norms_buf;
DataT* x_norms_ptr;
if (X_norm.has_value()) {
RAFT_EXPECTS(X_norm->extent(0) == n_samples, "X_norm size !=n_samples");
x_norms_ptr = const_cast<DataT*>(X_norm->data_handle());
} else {
x_norms_buf.emplace(raft::make_device_vector<DataT, IndexT>(handle, n_samples));
raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
handle, X, x_norms_buf->view());
x_norms_ptr = x_norms_buf->data_handle();
}
auto x_norms_view = raft::make_device_vector_view<DataT, IndexT>(x_norms_ptr, n_samples);
Comment thread
coderabbitai[bot] marked this conversation as resolved.

auto min_cluster_distance = raft::make_device_vector<DataT>(handle, n_samples);
rmm::device_uvector<DataT> l2_norm_or_distance_buffer(0, stream);
Expand All @@ -412,7 +425,7 @@ void cluster_cost(
raft::make_device_matrix_view<DataT, IndexT>(
const_cast<DataT*>(centroids.data_handle()), n_clusters, n_features),
min_cluster_distance.view(),
x_norms.view(),
x_norms_view,
l2_norm_or_distance_buffer,
metric,
n_samples,
Expand All @@ -431,34 +444,6 @@ void cluster_cost(
handle, min_cluster_distance.view(), workspace, cost, raft::add_op{});
}

/**
* @brief Compute (optionally weighted) cluster cost (inertia) — host-scalar output.
*
* Convenience wrapper that copies the result to host and synchronizes.
*
* @tparam DataT float or double
* @tparam IndexT Index type
*
* @param[in] handle The raft handle
* @param[in] X Input data [n_samples x n_features]
* @param[in] centroids Cluster centroids [n_clusters x n_features]
* @param[out] cost Sum of squared distances to nearest centroid (host)
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
template <typename DataT, typename IndexT>
void cluster_cost(
raft::resources const& handle,
raft::device_matrix_view<const DataT, IndexT> X,
raft::device_matrix_view<const DataT, IndexT> centroids,
raft::host_scalar_view<DataT> cost,
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
{
auto device_cost = raft::make_device_scalar<DataT>(handle, DataT(0));
cuvs::cluster::kmeans::cluster_cost(handle, X, centroids, device_cost.view(), sample_weight);
raft::copy(handle, cost, raft::make_const_mdspan(device_cost.view()));
raft::resource::sync_stream(handle);
}

/**
* @brief Calculates a <key, value> pair for every sample in input 'X' where key is an
* index of one of the 'centroids' (index of the nearest centroid) and 'value'
Expand Down
32 changes: 20 additions & 12 deletions cpp/src/cluster/kmeans_cluster_cost.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,36 +11,44 @@ namespace cuvs::cluster::kmeans {
void cluster_cost(const raft::resources& handle,
raft::device_matrix_view<const float, int> X,
raft::device_matrix_view<const float, int> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight)
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight,
std::optional<raft::device_vector_view<const float, int>> X_norm)
{
cuvs::cluster::kmeans::cluster_cost<float, int>(handle, X, centroids, cost, sample_weight);
cuvs::cluster::kmeans::cluster_cost<float, int>(
handle, X, centroids, cost, sample_weight, X_norm);
}

void cluster_cost(const raft::resources& handle,
raft::device_matrix_view<const double, int> X,
raft::device_matrix_view<const double, int> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight)
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight,
std::optional<raft::device_vector_view<const double, int>> X_norm)
{
cuvs::cluster::kmeans::cluster_cost<double, int>(handle, X, centroids, cost, sample_weight);
cuvs::cluster::kmeans::cluster_cost<double, int>(
handle, X, centroids, cost, sample_weight, X_norm);
}

void cluster_cost(const raft::resources& handle,
raft::device_matrix_view<const float, int64_t> X,
raft::device_matrix_view<const float, int64_t> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight)
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
std::optional<raft::device_vector_view<const float, int64_t>> X_norm)
{
cuvs::cluster::kmeans::cluster_cost<float, int64_t>(handle, X, centroids, cost, sample_weight);
cuvs::cluster::kmeans::cluster_cost<float, int64_t>(
handle, X, centroids, cost, sample_weight, X_norm);
}

void cluster_cost(const raft::resources& handle,
raft::device_matrix_view<const double, int64_t> X,
raft::device_matrix_view<const double, int64_t> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight)
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
std::optional<raft::device_vector_view<const double, int64_t>> X_norm)
{
cuvs::cluster::kmeans::cluster_cost<double, int64_t>(handle, X, centroids, cost, sample_weight);
cuvs::cluster::kmeans::cluster_cost<double, int64_t>(
handle, X, centroids, cost, sample_weight, X_norm);
}
} // namespace cuvs::cluster::kmeans
Loading
Loading