diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml index 16a02c61b0..c66fa09993 100644 --- a/rust/cuvs/Cargo.toml +++ b/rust/cuvs/Cargo.toml @@ -14,10 +14,12 @@ doc-only = ["cuvs-sys/doc-only"] [dependencies] cuvs-sys = { workspace = true } -ndarray = "0.15" +thiserror = "2" +tinyvec = { version = "1", features = ["alloc"] } [dev-dependencies] -ndarray-rand = "0.14" +ndarray = "0.17" +ndarray-rand = "0.16" [package.metadata.docs.rs] features = ["doc-only"] diff --git a/rust/cuvs/examples/cagra.rs b/rust/cuvs/examples/cagra.rs index 2f0ee4e071..f9c2697f10 100644 --- a/rust/cuvs/examples/cagra.rs +++ b/rust/cuvs/examples/cagra.rs @@ -3,62 +3,220 @@ * SPDX-License-Identifier: Apache-2.0 */ +//! CAGRA example with a user-provided GPU tensor. +//! +//! This demonstrates how to feed your own device memory into cuVS by +//! implementing the public [`IntoDlTensor`]/[`IntoDlTensorMut`] traits. The +//! [`CudaTensor`] type manages device memory directly through the CUDA runtime +//! (`cudaMalloc`/`cudaFree`) and copies to/from host arrays with `cudaMemcpyAsync` +//! on the cuVS stream, reusing the resources handle's `get_cuda_stream`/ +//! `sync_stream` for stream access and synchronization. +//! +//! A real application would likely rely on a helper crate such as `cudarc` +//! and its `CudaSlice`. + +use std::ffi::c_void; +use std::marker::PhantomData; +use std::os::raw::c_int; + +use cuvs::Resources; use cuvs::cagra::{Index, IndexParams, SearchParams}; -use cuvs::{ManagedTensor, Resources, Result}; +use cuvs::dlpack::{ + DLDevice, DLDeviceType, DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor, + IntoDlTensorMut, +}; use ndarray::s; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; -/// Example showing how to index and search data with CAGRA -fn cagra_example() -> Result<()> { +type ExampleResult = std::result::Result>; + +// --------------------------------------------------------------------------- +// Minimal CUDA runtime FFI +// --------------------------------------------------------------------------- + +#[allow(non_camel_case_types)] +type cudaError_t = c_int; +const CUDA_SUCCESS: cudaError_t = 0; +const CUDA_MEMCPY_HOST_TO_DEVICE: c_int = 1; +const CUDA_MEMCPY_DEVICE_TO_HOST: c_int = 2; + +#[link(name = "cudart")] +unsafe extern "C" { + fn cudaMalloc(ptr: *mut *mut c_void, size: usize) -> cudaError_t; + fn cudaFree(ptr: *mut c_void) -> cudaError_t; + fn cudaMemcpyAsync( + dst: *mut c_void, + src: *const c_void, + count: usize, + kind: c_int, + stream: cuvs_sys::cudaStream_t, + ) -> cudaError_t; +} + +fn check_cuda(status: cudaError_t) -> ExampleResult<()> { + if status == CUDA_SUCCESS { + Ok(()) + } else { + Err(format!("CUDA runtime error: {status}").into()) + } +} + +// --------------------------------------------------------------------------- +// A custom device tensor backed by the CUDA runtime +// --------------------------------------------------------------------------- + +struct CudaTensor { + data: *mut c_void, + shape: Vec, + bytes: usize, + _marker: PhantomData, +} + +impl CudaTensor { + /// Allocate an uninitialized device buffer (used for search outputs). + fn alloc(shape: &[usize]) -> ExampleResult { + let bytes = shape.iter().product::() * std::mem::size_of::(); + let mut data: *mut c_void = std::ptr::null_mut(); + check_cuda(unsafe { cudaMalloc(&mut data, bytes) })?; + Ok(Self { + data, + shape: shape.iter().map(|&d| d as i64).collect(), + bytes, + _marker: PhantomData, + }) + } + + /// Copy a contiguous host array onto the device. + fn from_host(res: &Resources, host: &ndarray::ArrayRef) -> ExampleResult + where + D: ndarray::Dimension, + { + if !host.is_standard_layout() { + return Err("host array must be contiguous (row-major)".into()); + } + let tensor = Self::alloc(host.shape())?; + + let stream = res.get_cuda_stream()?; + check_cuda(unsafe { + cudaMemcpyAsync( + tensor.data, + host.as_ptr() as *const c_void, + tensor.bytes, + CUDA_MEMCPY_HOST_TO_DEVICE, + stream, + ) + })?; + res.sync_stream()?; + + Ok(tensor) + } + + /// Copy the device buffer back into a contiguous host array. + fn to_host(&self, res: &Resources, host: &mut ndarray::ArrayRef) -> ExampleResult<()> + where + D: ndarray::Dimension, + { + if !host.is_standard_layout() { + return Err("host array must be contiguous (row-major)".into()); + } + + let stream = res.get_cuda_stream()?; + check_cuda(unsafe { + cudaMemcpyAsync( + host.as_mut_ptr() as *mut c_void, + self.data, + self.bytes, + CUDA_MEMCPY_DEVICE_TO_HOST, + stream, + ) + })?; + res.sync_stream()?; + + Ok(()) + } +} + +impl Drop for CudaTensor { + fn drop(&mut self) { + if !self.data.is_null() { + unsafe { cudaFree(self.data) }; + } + } +} + +impl<'a, T: DType> IntoDlTensor<'a> for &'a CudaTensor { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + unsafe { + DLTensorView::from_raw_parts( + self.data, + DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 }, + &self.shape, + None, + T::dl_dtype(), + ) + } + } +} + +impl<'a, T: DType> IntoDlTensorMut<'a> for &'a mut CudaTensor { + fn into_dl_tensor_mut(self) -> std::result::Result, DLPackError> { + unsafe { + DLTensorViewMut::from_raw_parts( + self.data, + DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 }, + &self.shape, + None, + T::dl_dtype(), + ) + } + } +} + +/// Example showing how to index and search data with CAGRA. +fn cagra_example() -> ExampleResult<()> { let res = Resources::new()?; - // Create a new random dataset to index + // Create a new random dataset to index and copy it to the device. let n_datapoints = 65536; let n_features = 512; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset_host = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); + let dataset = CudaTensor::from_host(&res, &dataset_host)?; - // build the cagra index + // Build the CAGRA index. let build_params = IndexParams::new()?; let index = Index::build(&res, &build_params, &dataset)?; - println!("Indexed {}x{} datapoints into cagra index", n_datapoints, n_features); + println!("Indexed {n_datapoints}x{n_features} datapoints into cagra index"); - // use the first 4 points from the dataset as queries : will test that we get them back - // as their own nearest neighbor + // Use the first 4 points as queries; each should be its own nearest neighbor. let n_queries = 4; - let queries = dataset.slice(s![0..n_queries, ..]); - let k = 10; + let queries_host = dataset_host.slice(s![0..n_queries, ..]).to_owned(); + let queries = CudaTensor::from_host(&res, &queries_host)?; - // CAGRA search API requires queries and outputs to be on device memory - // copy query data over, and allocate new device memory for the distances/ neighbors - // outputs - let queries = ManagedTensor::from(&queries).to_device(&res)?; - let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?; - - let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res)?; + let mut neighbors = CudaTensor::::alloc(&[n_queries, k])?; + let mut distances = CudaTensor::::alloc(&[n_queries, k])?; let search_params = SearchParams::new()?; + index.search(&res, &search_params, &queries, &mut neighbors, &mut distances)?; - index.search(&res, &search_params, &queries, &neighbors, &distances)?; - - // Copy back to host memory - distances.to_host(&res, &mut distances_host)?; + // Copy the results back to the host. + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); neighbors.to_host(&res, &mut neighbors_host)?; + distances.to_host(&res, &mut distances_host)?; - // nearest neighbors should be themselves, since queries are from the - // dataset - println!("Neighbors {:?}", neighbors_host); - println!("Distances {:?}", distances_host); + println!("Neighbors {neighbors_host:?}"); + println!("Distances {distances_host:?}"); Ok(()) } fn main() { if let Err(e) = cagra_example() { - println!("Failed to run CAGRA: {:?}", e); + println!("Failed to run CAGRA: {e:?}"); } } diff --git a/rust/cuvs/src/brute_force.rs b/rust/cuvs/src/brute_force.rs index 413e8b0fb1..2b513c3627 100644 --- a/rust/cuvs/src/brute_force.rs +++ b/rust/cuvs/src/brute_force.rs @@ -2,94 +2,102 @@ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -//! Brute Force KNN +//! Brute-force (exact) k-NN. +//! +//! Build an [`Index`] over a dataset, then [`search`](Index::search) it with +//! device-resident queries and output buffers. Tensors are passed through the +//! [`IntoDlTensor`] / +//! [`IntoDlTensorMut`] traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs` +//! for the same build/search workflow. use std::io::{Write, stderr}; +use std::marker::PhantomData; use crate::distance_type::DistanceType; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Result, check_cuvs}; use crate::resources::Resources; /// Brute Force KNN Index #[derive(Debug)] -pub struct Index { +pub struct Index<'d> { inner: ffi::cuvsBruteForceIndex_t, // cuVS brute_force::index stores a non-owning view into the dataset. - // Keep the Rust tensor alive for as long as the C++ index may read it. - _dataset: Option, + // Keep the Rust borrow alive for as long as the C++ index may read it. + _dataset: PhantomData<&'d ()>, } -impl Index { - /// Builds a new Brute Force KNN Index from the dataset for efficient search. +impl<'d> Index<'d> { + /// Builds a brute-force index over `dataset` for exact k-NN search. /// - /// # Arguments - /// - /// * `res` - Resources to use - /// * `metric` - DistanceType to use for building the index - /// * `metric_arg` - Optional value of `p` for Minkowski distances - /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + /// `metric` selects the distance and `metric_arg` is the optional `p` for + /// Minkowski distances (defaults to 2). `dataset` is a row-major matrix on + /// the host or device implementing [`IntoDlTensor`]; the + /// C++ index keeps a non-owning view of it, so the returned [`Index`] borrows + /// it for `'d` and cannot outlive it. + pub fn build( res: &Resources, metric: DistanceType, metric_arg: Option, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); - let mut index = Index::new()?; + dataset: impl IntoDlTensor<'d>, + ) -> Result> { + let dataset = dataset.into_dl_tensor()?; + let index = Index::new()?; unsafe { check_cuvs(ffi::cuvsBruteForceBuild( res.0, - dataset.as_ptr(), + dataset.to_c().as_mut_ptr(), metric, metric_arg.unwrap_or(2.0), index.inner, ))?; } - index._dataset = Some(dataset); Ok(index) } /// Creates a new empty index - pub fn new() -> Result { + pub fn new() -> Result> { unsafe { let mut index = std::mem::MaybeUninit::::uninit(); check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?; - Ok(Index { inner: index.assume_init(), _dataset: None }) + Ok(Index { inner: index.assume_init(), _dataset: PhantomData }) } } - /// Perform a Nearest Neighbors search on the Index - /// - /// # Arguments + /// Searches the index for the `k` nearest neighbors of each query. /// - /// * `res` - Resources to use - /// * `queries` - A matrix in device memory to query for - /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors - /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors - pub fn search( + /// `queries`, `neighbors`, and `distances` must reside in device memory and + /// implement [`IntoDlTensor`] / + /// [`IntoDlTensorMut`]. `neighbors` receives the + /// neighbor indices and `distances` their distances; both are written in + /// place. + pub fn search<'a>( &self, res: &Resources, - queries: &ManagedTensor, - neighbors: &ManagedTensor, - distances: &ManagedTensor, + queries: impl IntoDlTensor<'a>, + neighbors: impl IntoDlTensorMut<'a>, + distances: impl IntoDlTensorMut<'a>, ) -> Result<()> { + let queries = queries.into_dl_tensor()?; + let neighbors = neighbors.into_dl_tensor_mut()?; + let distances = distances.into_dl_tensor_mut()?; unsafe { let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER }; check_cuvs(ffi::cuvsBruteForceSearch( res.0, self.inner, - queries.as_ptr(), - neighbors.as_ptr(), - distances.as_ptr(), + queries.to_c().as_mut_ptr(), + neighbors.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), prefilter, )) } } } -impl Drop for Index { +impl Drop for Index<'_> { fn drop(&mut self) { if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.inner) }) { write!(stderr(), "failed to call bruteForceIndexDestroy {:?}", e) @@ -101,6 +109,7 @@ impl Drop for Index { #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray::s; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -111,39 +120,41 @@ mod tests { // Create a new random dataset to index let n_datapoints = 16; let n_features = 8; - let dataset_host = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset_host = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset = ManagedTensor::from(&dataset_host).to_device(&res).unwrap(); + let dataset = DeviceTensor::from_host(&res, &dataset_host).unwrap(); println!("dataset {:#?}", dataset_host); // build the brute force index let index = - Index::build(&res, metric, None, dataset).expect("failed to create brute force index"); + Index::build(&res, metric, None, &dataset).expect("failed to create brute force index"); res.sync_stream().unwrap(); // use the first 4 points from the dataset as queries : will test that we get them back // as their own nearest neighbor let n_queries = 4; - let queries = dataset_host.slice(s![0..n_queries, ..]); + let queries = dataset_host.slice(s![0..n_queries, ..]).to_owned(); let k = 4; println!("queries! {:#?}", queries); - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); - index.search(&res, &queries, &neighbors, &distances).unwrap(); + index.search(&res, &queries, &mut neighbors, &mut distances).unwrap(); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); res.sync_stream().unwrap(); println!("distances {:#?}", distances_host); diff --git a/rust/cuvs/src/cagra/index.rs b/rust/cuvs/src/cagra/index.rs index d69a4d5033..290cfaef63 100644 --- a/rust/cuvs/src/cagra/index.rs +++ b/rust/cuvs/src/cagra/index.rs @@ -5,16 +5,26 @@ use std::ffi::CString; use std::io::{Write, stderr}; +use std::marker::PhantomData; use std::path::Path; use crate::cagra::{IndexParams, SearchParams}; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Error, Result, check_cuvs}; use crate::resources::Resources; -/// CAGRA ANN Index +/// A CAGRA approximate nearest neighbor index. +/// +/// The lifetime `'d` ties this index to the underlying dataset, +/// passed at construction time. The C library may store a non-owning view +/// of properly aligned device-resident data, so the dataset must outlive +/// the index. When an index is deserialized from disk, the data is +/// self-contained and its lifetime is `'static`. #[derive(Debug)] -pub struct Index(ffi::cuvsCagraIndex_t); +pub struct Index<'d> { + handle: ffi::cuvsCagraIndex_t, + _dataset: PhantomData<&'d ()>, +} /// Convert a filesystem path into a `CString` suitable for the cuVS C API, /// returning `Error::InvalidArgument` instead of panicking for paths that are @@ -27,63 +37,68 @@ fn path_to_cstring(path: &Path) -> Result { .map_err(|e| Error::InvalidArgument(format!("path contains an interior NUL byte: {e}"))) } -impl Index { - /// Builds a new Index from the dataset for efficient search. - /// - /// # Arguments +impl<'d> Index<'d> { + /// Builds a CAGRA index over `dataset` for efficient search. /// - /// * `res` - Resources to use - /// * `params` - Parameters for building the index - /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + /// `dataset` is a row-major matrix on the host or device implementing + /// [`IntoDlTensor`](crate::IntoDlTensor). The C++ index keeps a non-owning + /// view of it, so the returned [`Index`] borrows `dataset` for `'d` and + /// cannot outlive it. + pub fn build( res: &Resources, params: &IndexParams, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); + dataset: impl IntoDlTensor<'d>, + ) -> Result> { + let dataset = dataset.into_dl_tensor()?; let index = Index::new()?; unsafe { - check_cuvs(ffi::cuvsCagraBuild(res.0, params.0, dataset.as_ptr(), index.0))?; + check_cuvs(ffi::cuvsCagraBuild( + res.0, + params.0, + dataset.to_c().as_mut_ptr(), + index.handle, + ))?; } Ok(index) } /// Creates a new empty index - pub fn new() -> Result { + pub fn new() -> Result> { unsafe { let mut index = std::mem::MaybeUninit::::uninit(); check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?; - Ok(Index(index.assume_init())) + Ok(Index { handle: index.assume_init(), _dataset: PhantomData }) } } - /// Perform a Approximate Nearest Neighbors search on the Index - /// - /// # Arguments + /// Searches the index for the `k` nearest neighbors of each query. /// - /// * `res` - Resources to use - /// * `params` - Parameters to use in searching the index - /// * `queries` - A matrix in device memory to query for - /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors - /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors - pub fn search( + /// `queries`, `neighbors`, and `distances` must reside in device memory and + /// implement [`IntoDlTensor`](crate::IntoDlTensor) / + /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` (shape + /// `n_queries × k`) receives the neighbor indices and `distances` their + /// distances; both are written in place. + pub fn search<'a>( &self, res: &Resources, params: &SearchParams, - queries: &ManagedTensor, - neighbors: &ManagedTensor, - distances: &ManagedTensor, + queries: impl IntoDlTensor<'a>, + neighbors: impl IntoDlTensorMut<'a>, + distances: impl IntoDlTensorMut<'a>, ) -> Result<()> { + let queries = queries.into_dl_tensor()?; + let neighbors = neighbors.into_dl_tensor_mut()?; + let distances = distances.into_dl_tensor_mut()?; unsafe { let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER }; check_cuvs(ffi::cuvsCagraSearch( res.0, params.0, - self.0, - queries.as_ptr(), - neighbors.as_ptr(), - distances.as_ptr(), + self.handle, + queries.to_c().as_mut_ptr(), + neighbors.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), prefilter, )) } @@ -91,41 +106,43 @@ impl Index { /// Perform a filtered Approximate Nearest Neighbors search on the Index /// - /// Like [`search`](Self::search), but accepts a bitset filter to exclude - /// vectors during graph traversal. Filtered vectors are never visited, - /// giving better recall than post-filtering. - /// - /// # Arguments + /// Like [`search`](Self::search), but applies a bitset filter to exclude + /// vectors during graph traversal. Filtered vectors are never visited, which + /// gives better recall than post-filtering. /// - /// * `res` - Resources to use - /// * `params` - Parameters to use in searching the index - /// * `queries` - A matrix in device memory to query for - /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors - /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors - /// * `bitset` - A 1-D `uint32` device tensor with `ceil(n_rows / 32)` elements. - /// Each bit corresponds to a dataset row: bit 1 = include, bit 0 = exclude. - pub fn search_with_filter( + /// `queries`, `neighbors`, and `distances` are as in [`search`](Self::search). + /// `bitset` is a 1-D `uint32` device tensor of `ceil(n_rows / 32)` elements, + /// where each bit maps to a dataset row (1 = include, 0 = exclude). + pub fn search_with_filter<'a>( &self, res: &Resources, params: &SearchParams, - queries: &ManagedTensor, - neighbors: &ManagedTensor, - distances: &ManagedTensor, - bitset: &ManagedTensor, + queries: impl IntoDlTensor<'a>, + neighbors: impl IntoDlTensorMut<'a>, + distances: impl IntoDlTensorMut<'a>, + bitset: impl IntoDlTensor<'a>, ) -> Result<()> { + let queries = queries.into_dl_tensor()?; + let neighbors = neighbors.into_dl_tensor_mut()?; + let distances = distances.into_dl_tensor_mut()?; + let bitset = bitset.into_dl_tensor()?; + // The bitset pointer is cast to `usize` and stored in `prefilter`, then read + // by the search call, so its `ManagedTensorRef` must outlive both. + // Hence we keep it bound instead of chaining `to_c().as_mut_ptr()`. + let mut bitset_c = bitset.to_c(); unsafe { let prefilter = ffi::cuvsFilter { - addr: bitset.as_ptr() as usize, + addr: bitset_c.as_mut_ptr() as usize, type_: ffi::cuvsFilterType::BITSET, }; check_cuvs(ffi::cuvsCagraSearch( res.0, params.0, - self.0, - queries.as_ptr(), - neighbors.as_ptr(), - distances.as_ptr(), + self.handle, + queries.to_c().as_mut_ptr(), + neighbors.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), prefilter, )) } @@ -140,6 +157,29 @@ impl Index { /// * `res` - Resources to use /// * `filename` - The file path for saving the index /// * `include_dataset` - Whether to write out the dataset to the file + /// + /// # Example: + /// ```no_run + /// use cuvs::cagra::{Index, IndexParams}; + /// use cuvs::{Resources, Result}; + /// + /// fn serialize_example() -> Result<()> { + /// let res = Resources::new()?; + /// + /// // Build an index (using some dataset) + /// let build_params = IndexParams::new()?; + /// // let index = Index::build(&res, &build_params, &dataset)?; + /// + /// // Save the index to disk (including the dataset) + /// // index.serialize(&res, "/path/to/index.bin", true)?; + /// + /// // Later, load the index from disk + /// let loaded_index = Index::deserialize(&res, "/path/to/index.bin")?; + /// + /// // The loaded index can be used for search just like the original + /// Ok(()) + /// } + /// ``` pub fn serialize>( &self, res: &Resources, @@ -148,7 +188,12 @@ impl Index { ) -> Result<()> { let c_filename = path_to_cstring(filename.as_ref())?; unsafe { - check_cuvs(ffi::cuvsCagraSerialize(res.0, c_filename.as_ptr(), self.0, include_dataset)) + check_cuvs(ffi::cuvsCagraSerialize( + res.0, + c_filename.as_ptr(), + self.handle, + include_dataset, + )) } } @@ -165,7 +210,9 @@ impl Index { /// * `filename` - The file path for saving the index pub fn serialize_to_hnswlib>(&self, res: &Resources, filename: P) -> Result<()> { let c_filename = path_to_cstring(filename.as_ref())?; - unsafe { check_cuvs(ffi::cuvsCagraSerializeToHnswlib(res.0, c_filename.as_ptr(), self.0)) } + unsafe { + check_cuvs(ffi::cuvsCagraSerializeToHnswlib(res.0, c_filename.as_ptr(), self.handle)) + } } /// Load a CAGRA index from file. @@ -176,19 +223,19 @@ impl Index { /// /// * `res` - Resources to use /// * `filename` - The path of the file that stores the index - pub fn deserialize>(res: &Resources, filename: P) -> Result { + pub fn deserialize>(res: &Resources, filename: P) -> Result> { let c_filename = path_to_cstring(filename.as_ref())?; let index = Index::new()?; unsafe { - check_cuvs(ffi::cuvsCagraDeserialize(res.0, c_filename.as_ptr(), index.0))?; + check_cuvs(ffi::cuvsCagraDeserialize(res.0, c_filename.as_ptr(), index.handle))?; } Ok(index) } } -impl Drop for Index { +impl Drop for Index<'_> { fn drop(&mut self) { - if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.0) }) { + if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.handle) }) { write!(stderr(), "failed to call cagraIndexDestroy {:?}", e) .expect("failed to write to stderr"); } @@ -198,6 +245,7 @@ impl Drop for Index { #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray::s; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -205,41 +253,32 @@ mod tests { const N_DATAPOINTS: usize = 256; const N_FEATURES: usize = 16; - /// Build a small random dataset and a CAGRA index over it. - fn build_test_index( - res: &Resources, - build_params: &IndexParams, - ) -> (ndarray::Array2, Index) { - let dataset = - ndarray::Array::::random((N_DATAPOINTS, N_FEATURES), Uniform::new(0., 1.0)); - let index = Index::build(res, build_params, &dataset).expect("failed to build cagra index"); - (dataset, index) - } - /// Search the first `n_queries` rows of `dataset` against `index` and /// assert each query finds itself as the top-1 neighbor. CAGRA search /// requires queries and outputs to live in device memory. fn search_and_verify_self_neighbors( res: &Resources, - index: &Index, + index: &Index<'_>, dataset: &ndarray::Array2, n_queries: usize, k: usize, ) { let queries = dataset.slice(s![0..n_queries, ..]); - let queries = ManagedTensor::from(&queries).to_device(res).unwrap(); + let queries = DeviceTensor::from_host(res, &queries.to_owned()).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(res).unwrap(); + let mut distances = DeviceTensor::::zeros(res, &[n_queries, k]).unwrap(); let search_params = SearchParams::new().unwrap(); - index.search(res, &search_params, &queries, &neighbors, &distances).expect("search failed"); + index + .search(res, &search_params, &queries, &mut neighbors, &mut distances) + .expect("search failed"); - distances.to_host(res, &mut distances_host).unwrap(); - neighbors.to_host(res, &mut neighbors_host).unwrap(); + distances.copy_to_host(res, &mut distances_host).unwrap(); + neighbors.copy_to_host(res, &mut neighbors_host).unwrap(); for i in 0..n_queries { assert_eq!( @@ -252,7 +291,12 @@ mod tests { fn test_cagra(build_params: IndexParams) { let res = Resources::new().unwrap(); - let (dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); search_and_verify_self_neighbors(&res, &index, &dataset, 4, 10); } @@ -278,11 +322,13 @@ mod tests { let n_datapoints = 256; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); let index = - Index::build(&res, &build_params, &dataset).expect("failed to create cagra index"); + Index::build(&res, &build_params, &*dataset).expect("failed to create cagra index"); // Build a bitset that includes only even-indexed rows let n_words = (n_datapoints + 31) / 32; @@ -292,26 +338,32 @@ mod tests { bitset_host[i / 32] |= 1u32 << (i % 32); } } - let bitset = ManagedTensor::from(&bitset_host).to_device(&res).unwrap(); + let bitset = DeviceTensor::from_host(&res, &bitset_host).unwrap(); // Query with the first 4 even-indexed rows let n_queries = 4; - let queries = dataset.slice(s![0..n_queries * 2;2, ..]); // rows 0, 2, 4, 6 - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = dataset.slice(s![0..n_queries * 2;2, ..]).to_owned(); // rows 0, 2, 4, 6 + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let k = 10; let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); - let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let search_params = SearchParams::new().unwrap(); index - .search_with_filter(&res, &search_params, &queries, &neighbors, &distances, &bitset) + .search_with_filter( + &res, + &search_params, + &queries, + &mut neighbors, + &mut distances, + &bitset, + ) .unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); // All returned neighbors must be even-indexed (odd rows are filtered out). for q in 0..n_queries { @@ -335,7 +387,12 @@ mod tests { fn test_cagra_multiple_searches() { let res = Resources::new().unwrap(); let build_params = IndexParams::new().unwrap(); - let (dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); for _ in 0..3 { search_and_verify_self_neighbors(&res, &index, &dataset, 4, 5); @@ -346,7 +403,12 @@ mod tests { fn test_cagra_serialize_deserialize() { let res = Resources::new().unwrap(); let build_params = IndexParams::new().unwrap(); - let (dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); let filepath = std::env::temp_dir().join("test_cagra_index.bin"); index.serialize(&res, &filepath, true).expect("failed to serialize cagra index"); @@ -371,7 +433,12 @@ mod tests { fn test_cagra_serialize_without_dataset() { let res = Resources::new().unwrap(); let build_params = IndexParams::new().unwrap(); - let (_dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); let filepath = std::env::temp_dir().join("test_cagra_index_no_dataset.bin"); index @@ -387,7 +454,12 @@ mod tests { fn test_cagra_serialize_to_hnswlib() { let res = Resources::new().unwrap(); let build_params = IndexParams::new().unwrap(); - let (_dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); let filepath = std::env::temp_dir().join("test_cagra_index_hnsw.bin"); index @@ -409,7 +481,12 @@ mod tests { fn test_cagra_serialize_rejects_interior_nul() { let res = Resources::new().unwrap(); let build_params = IndexParams::new().unwrap(); - let (_dataset, index) = build_test_index(&res, &build_params); + let dataset = ndarray::Array::::random( + (N_DATAPOINTS, N_FEATURES), + Uniform::new(0., 1.0).unwrap(), + ); + let index = + Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index"); // `PathBuf::from` on Unix preserves arbitrary bytes, so we can embed a // NUL byte in the path and confirm the helper rejects it. diff --git a/rust/cuvs/src/cagra/mod.rs b/rust/cuvs/src/cagra/mod.rs index 9043b17386..a50d132840 100644 --- a/rust/cuvs/src/cagra/mod.rs +++ b/rust/cuvs/src/cagra/mod.rs @@ -3,91 +3,15 @@ * SPDX-License-Identifier: Apache-2.0 */ -//! CAGRA is a graph-based nearest neighbors implementation with state-of-the art -//! query performance for both small- and large-batch sized search. -//! -//! Example: -//! ``` -//! -//! use cuvs::cagra::{Index, IndexParams, SearchParams}; -//! use cuvs::{ManagedTensor, Resources, Result}; -//! -//! use ndarray::s; -//! use ndarray_rand::rand_distr::Uniform; -//! use ndarray_rand::RandomExt; -//! -//! fn cagra_example() -> Result<()> { -//! let res = Resources::new()?; -//! -//! // Create a new random dataset to index -//! let n_datapoints = 65536; -//! let n_features = 512; -//! let dataset = -//! ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); -//! -//! // build the cagra index -//! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; -//! println!( -//! "Indexed {}x{} datapoints into cagra index", -//! n_datapoints, n_features -//! ); -//! -//! // use the first 4 points from the dataset as queries : will test that we get them back -//! // as their own nearest neighbor -//! let n_queries = 4; -//! let queries = dataset.slice(s![0..n_queries, ..]); -//! -//! let k = 10; -//! -//! // CAGRA search API requires queries and outputs to be on device memory -//! // copy query data over, and allocate new device memory for the distances/ neighbors -//! // outputs -//! let queries = ManagedTensor::from(&queries).to_device(&res)?; -//! let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); -//! let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?; -//! -//! let mut distances_host = ndarray::Array::::zeros((n_queries, k)); -//! let distances = ManagedTensor::from(&distances_host).to_device(&res)?; -//! -//! let search_params = SearchParams::new()?; -//! -//! index.search(&res, &search_params, &queries, &neighbors, &distances)?; -//! -//! // Copy back to host memory -//! distances.to_host(&res, &mut distances_host)?; -//! neighbors.to_host(&res, &mut neighbors_host)?; -//! -//! // nearest neighbors should be themselves, since queries are from the -//! // dataset -//! println!("Neighbors {:?}", neighbors_host); -//! println!("Distances {:?}", distances_host); -//! Ok(()) -//! } -//! ``` -//! -//! Serialization example: -//! ```no_run -//! use cuvs::cagra::{Index, IndexParams}; -//! use cuvs::{Resources, Result}; -//! -//! fn serialize_example() -> Result<()> { -//! let res = Resources::new()?; -//! -//! // Build an index (using some dataset) -//! let build_params = IndexParams::new()?; -//! // let index = Index::build(&res, &build_params, &dataset)?; -//! -//! // Save the index to disk (including the dataset) -//! // index.serialize(&res, "/path/to/index.bin", true)?; -//! -//! // Later, load the index from disk -//! let loaded_index = Index::deserialize(&res, "/path/to/index.bin")?; -//! -//! // The loaded index can be used for search just like the original -//! Ok(()) -//! } -//! ``` +//! CAGRA: a graph-based approximate nearest neighbors algorithm with +//! state-of-the-art query throughput for both small and large batch sizes. +//! +//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with +//! device-resident queries and output buffers. Tensors are passed through the +//! [`IntoDlTensor`](crate::IntoDlTensor) / +//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs` +//! for a complete, runnable example. mod index; mod index_params; diff --git a/rust/cuvs/src/cluster/kmeans/mod.rs b/rust/cuvs/src/cluster/kmeans/mod.rs index 5015f49f45..cf1834e9a2 100644 --- a/rust/cuvs/src/cluster/kmeans/mod.rs +++ b/rust/cuvs/src/cluster/kmeans/mod.rs @@ -3,77 +3,53 @@ * SPDX-License-Identifier: Apache-2.0 */ -//! Kmeans clustering API's +//! K-means clustering. //! -//! Example: -//! ``` -//! -//! use cuvs::cluster::kmeans; -//! use cuvs::{ManagedTensor, Resources, Result}; -//! -//! use ndarray_rand::rand_distr::Uniform; -//! use ndarray_rand::RandomExt; -//! -//! fn kmeans_example() -> Result<()> { -//! let res = Resources::new()?; -//! -//! // Create a new random dataset to index -//! let n_datapoints = 65536; -//! let n_features = 512; -//! let n_clusters = 8; -//! let dataset = -//! ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); -//! let dataset = ManagedTensor::from(&dataset).to_device(&res)?; -//! -//! let centroids_host = ndarray::Array::::zeros((n_clusters, n_features)); -//! let mut centroids = ManagedTensor::from(¢roids_host).to_device(&res)?; -//! -//! // find the centroids with the kmeans index -//! let kmeans_params = kmeans::Params::new()?.set_n_clusters(n_clusters as i32); -//! let (inertia, n_iter) = kmeans::fit(&res, &kmeans_params, &dataset, &None, &mut centroids)?; -//! Ok(()) -//! } -//! ``` +//! [`fit`] computes cluster centroids for a dataset, [`predict`] assigns points +//! to clusters, and [`cluster_cost`] reports the inertia. All inputs and outputs +//! reside in device memory and are passed through the +//! [`IntoDlTensor`] / +//! [`IntoDlTensorMut`] traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model. mod params; pub use params::Params; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Result, check_cuvs}; use crate::resources::Resources; -/// Find clusters with the k-means algorithm +/// Fits k-means centroids to `x`, returning `(inertia, n_iterations)`. /// -/// # Arguments -/// -/// * `res` - Resources to use -/// * `params` - Parameters to use to fit KMeans model -/// * `x` - A matrix in device memory - shape (m, k) -/// * `sample_weight` - Optional device matrix shape (n_clusters, 1) -/// * `centroids` - Output device matrix, that has the centroids for each cluster -/// shape (n_clusters, k) -pub fn fit( +/// `x` (shape `m × k`) is the input matrix and `centroids` (shape +/// `n_clusters × k`) receives the fitted centroids; `sample_weight` is an +/// optional per-sample weight. All reside in device memory and implement +/// [`IntoDlTensor`] / +/// [`IntoDlTensorMut`]. +pub fn fit<'a>( res: &Resources, params: &Params, - x: &ManagedTensor, - sample_weight: &Option, - centroids: &mut ManagedTensor, + x: impl IntoDlTensor<'a>, + sample_weight: Option>, + centroids: impl IntoDlTensorMut<'a>, ) -> Result<(f64, i32)> { + let x = x.into_dl_tensor()?; + let sample_weight = sample_weight.map(|w| w.into_dl_tensor()).transpose()?; + let centroids = centroids.into_dl_tensor_mut()?; let mut inertia: f64 = 0.0; let mut niter: i32 = 0; + let mut sample_weight_c = sample_weight.as_ref().map(|w| w.to_c()); + let sample_weight_ptr = + sample_weight_c.as_mut().map(|w| w.as_mut_ptr()).unwrap_or(std::ptr::null_mut()); unsafe { - let sample_weight_dlpack = match sample_weight { - Some(tensor) => tensor.as_ptr(), - None => std::ptr::null_mut(), - }; check_cuvs(ffi::cuvsKMeansFit( res.0, params.0, - x.as_ptr(), - sample_weight_dlpack, - centroids.as_ptr(), + x.to_c().as_mut_ptr(), + sample_weight_ptr, + centroids.to_c().as_mut_ptr(), &mut inertia as *mut f64, &mut niter as *mut i32, ))?; @@ -81,40 +57,40 @@ pub fn fit( Ok((inertia, niter)) } -/// Predict clusters with the k-means algorithm -/// -/// # Arguments +/// Assigns each row of `x` to its nearest centroid, writing cluster labels into +/// `labels` and returning the inertia. /// -/// * `res` - Resources to use -/// * `params` - Parameters to use to fit KMeans model -/// * `x` - Input matrix in device memory - shape (m, k) -/// * `sample_weight` - Optional device matrix shape (n_clusters, 1) -/// * `centroids` - Centroids calculated by fit in device memory, shape (n_clusters, k) -/// * `labels` - preallocated CUDA array interface matrix shape (m, 1) to hold the output labels -/// * `normalize_weight` - whether or not to normalize the weights -pub fn predict( +/// `x` (shape `m × k`), `centroids` (shape `n_clusters × k`), the optional +/// `sample_weight`, and `labels` (shape `m × 1`) reside in device memory and +/// implement [`IntoDlTensor`] / +/// [`IntoDlTensorMut`]. `normalize_weight` selects +/// whether the sample weights are normalized. +pub fn predict<'a>( res: &Resources, params: &Params, - x: &ManagedTensor, - sample_weight: &Option, - centroids: &ManagedTensor, - labels: &mut ManagedTensor, + x: impl IntoDlTensor<'a>, + sample_weight: Option>, + centroids: impl IntoDlTensor<'a>, + labels: impl IntoDlTensorMut<'a>, normalize_weight: bool, ) -> Result { + let x = x.into_dl_tensor()?; + let sample_weight = sample_weight.map(|w| w.into_dl_tensor()).transpose()?; + let centroids = centroids.into_dl_tensor()?; + let labels = labels.into_dl_tensor_mut()?; let mut inertia: f64 = 0.0; + let mut sample_weight_c = sample_weight.as_ref().map(|w| w.to_c()); + let sample_weight_ptr = + sample_weight_c.as_mut().map(|w| w.as_mut_ptr()).unwrap_or(std::ptr::null_mut()); unsafe { - let sample_weight_dlpack = match sample_weight { - Some(tensor) => tensor.as_ptr(), - None => std::ptr::null_mut(), - }; check_cuvs(ffi::cuvsKMeansPredict( res.0, params.0, - x.as_ptr(), - sample_weight_dlpack, - centroids.as_ptr(), - labels.as_ptr(), + x.to_c().as_mut_ptr(), + sample_weight_ptr, + centroids.to_c().as_mut_ptr(), + labels.to_c().as_mut_ptr(), normalize_weight, &mut inertia as *mut f64, ))?; @@ -122,20 +98,24 @@ pub fn predict( Ok(inertia) } -/// Compute cluster cost given an input matrix and existing centroids -/// # Arguments +/// Computes the k-means cost (inertia) of `x` against existing `centroids`. /// -/// * `res` - Resources to use -/// * `x` - Input matrix in device memory - shape (m, k) -/// * `centroids` - Centroids calculated by fit in device memory, shape (n_clusters, k) -pub fn cluster_cost(res: &Resources, x: &ManagedTensor, centroids: &ManagedTensor) -> Result { +/// `x` (shape `m × k`) and `centroids` (shape `n_clusters × k`) reside in device +/// memory and implement [`IntoDlTensor`]. +pub fn cluster_cost<'a>( + res: &Resources, + x: impl IntoDlTensor<'a>, + centroids: impl IntoDlTensor<'a>, +) -> Result { + let x = x.into_dl_tensor()?; + let centroids = centroids.into_dl_tensor()?; let mut inertia: f64 = 0.0; unsafe { check_cuvs(ffi::cuvsKMeansClusterCost( res.0, - x.as_ptr(), - centroids.as_ptr(), + x.to_c().as_mut_ptr(), + centroids.to_c().as_mut_ptr(), &mut inertia as *mut f64, ))?; } @@ -145,6 +125,7 @@ pub fn cluster_cost(res: &Resources, x: &ManagedTensor, centroids: &ManagedTenso #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -157,12 +138,14 @@ mod tests { // Create a new random dataset to index let n_datapoints = 256; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); - let dataset = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_host = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); + let dataset = DeviceTensor::from_host(&res, &dataset_host).unwrap(); let centroids_host = ndarray::Array::::zeros((n_clusters, n_features)); - let mut centroids = ManagedTensor::from(¢roids_host).to_device(&res).unwrap(); + let mut centroids = DeviceTensor::from_host(&res, ¢roids_host).unwrap(); let params = Params::new().unwrap().set_n_clusters(n_clusters as i32); @@ -170,18 +153,28 @@ mod tests { let original_inertia = cluster_cost(&res, &dataset, ¢roids).unwrap(); // fit the centroids, make sure that inertia has gone down - let (inertia, n_iter) = fit(&res, ¶ms, &dataset, &None, &mut centroids).unwrap(); + let (inertia, n_iter) = + fit(&res, ¶ms, &dataset, None::<&DeviceTensor<'_, f32>>, &mut centroids).unwrap(); assert!(inertia < original_inertia); assert!(n_iter >= 1); let mut labels_host = ndarray::Array::::zeros((n_clusters,)); - let mut labels = ManagedTensor::from(&labels_host).to_device(&res).unwrap(); + let mut labels = DeviceTensor::::zeros(&res, &[n_clusters]).unwrap(); // make sure the prediction for each centroid is the centroid itself - predict(&res, ¶ms, ¢roids, &None, ¢roids, &mut labels, false).unwrap(); - - labels.to_host(&res, &mut labels_host).unwrap(); + predict( + &res, + ¶ms, + ¢roids, + None::<&DeviceTensor<'_, f32>>, + ¢roids, + &mut labels, + false, + ) + .unwrap(); + + labels.copy_to_host(&res, &mut labels_host).unwrap(); assert_eq!(labels_host[[0,]], 0); assert_eq!(labels_host[[1,]], 1); assert_eq!(labels_host[[2,]], 2); diff --git a/rust/cuvs/src/distance/mod.rs b/rust/cuvs/src/distance/mod.rs index 36a5850905..70eca4a753 100644 --- a/rust/cuvs/src/distance/mod.rs +++ b/rust/cuvs/src/distance/mod.rs @@ -3,35 +3,43 @@ * SPDX-License-Identifier: Apache-2.0 */ +//! Pairwise distance computation. +//! +//! [`pairwise_distance`] computes all pairwise distances between two device +//! matrices. Inputs and output are passed through the +//! [`IntoDlTensor`] / +//! [`IntoDlTensorMut`] traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model. + use crate::distance_type::DistanceType; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Result, check_cuvs}; use crate::resources::Resources; -/// Compute pairwise distances between X and Y -/// -/// # Arguments +/// Computes all pairwise distances between the rows of `x` (shape `m × k`) and +/// `y` (shape `n × k`), writing the `m × n` result into `distances`. /// -/// * `res` - Resources to use -/// * `x` - A matrix in device memory - shape (m, k) -/// * `y` - A matrix in device memory - shape (n, k) -/// * `distances` - A matrix in device memory that receives the output distances - shape (m, n) -/// * `metric` - DistanceType to use for building the index -/// * `metric_arg` - Optional value of `p` for Minkowski distances -pub fn pairwise_distance( +/// `x`, `y`, and `distances` reside in device memory and implement +/// [`IntoDlTensor`] / +/// [`IntoDlTensorMut`]. `metric` selects the distance; +/// `metric_arg` is the optional `p` for Minkowski distances (defaults to 2). +pub fn pairwise_distance<'a>( res: &Resources, - x: &ManagedTensor, - y: &ManagedTensor, - distances: &ManagedTensor, + x: impl IntoDlTensor<'a>, + y: impl IntoDlTensor<'a>, + distances: impl IntoDlTensorMut<'a>, metric: DistanceType, metric_arg: Option, ) -> Result<()> { + let x = x.into_dl_tensor()?; + let y = y.into_dl_tensor()?; + let distances = distances.into_dl_tensor_mut()?; unsafe { check_cuvs(ffi::cuvsPairwiseDistance( res.0, - x.as_ptr(), - y.as_ptr(), - distances.as_ptr(), + x.to_c().as_mut_ptr(), + y.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), metric, metric_arg.unwrap_or(2.0), )) @@ -41,6 +49,7 @@ pub fn pairwise_distance( #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -51,25 +60,28 @@ mod tests { // Create a new random dataset to index let n_datapoints = 256; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_datapoints, n_datapoints)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = + DeviceTensor::::zeros(&res, &[n_datapoints, n_datapoints]).unwrap(); pairwise_distance( &res, &dataset_device, &dataset_device, - &distances, + &mut distances, DistanceType::L2Expanded, None, ) .unwrap(); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); // Self distance should be 0 assert_eq!(distances_host[[0, 0]], 0.0); diff --git a/rust/cuvs/src/dlpack.rs b/rust/cuvs/src/dlpack.rs index 1687f88d17..310e8d28de 100644 --- a/rust/cuvs/src/dlpack.rs +++ b/rust/cuvs/src/dlpack.rs @@ -3,151 +3,338 @@ * SPDX-License-Identifier: Apache-2.0 */ -use std::convert::From; +//! DLPack tensor interop. +//! +//! cuVS exchanges tensors with the C library through the DLPack ABI. This crate +//! never owns tensor storage: every entry point accepts a value that converts +//! into a borrowed view through the [`IntoDlTensor`] / [`IntoDlTensorMut`] +//! traits: +//! +//! * [`DLTensorView`] — a read-only view, for inputs the C API only reads +//! (datasets, queries). +//! * [`DLTensorViewMut`] — a writable view, for outputs the C API writes +//! (neighbors, distances). +//! +//! A view is non-owning. Its lifetime is tied to the Rust value that owns the +//! underlying buffer, and it materializes a stack-local [`DLManagedTensor`] only +//! for the duration of each FFI call. +//! +//! The crate ships no public tensor type. To hand your own GPU (or host) buffer +//! to cuVS, implement [`IntoDlTensor`] / [`IntoDlTensorMut`] for it on top of +//! [`DLTensorView::from_raw_parts`]. Most algorithms require search inputs and +//! outputs to live in device memory. See `examples/cagra.rs` for a complete, +//! runnable adapter built on the raw CUDA runtime. -use crate::error::{Result, check_cuvs}; -use crate::resources::Resources; +use std::fmt; +use std::marker::PhantomData; +use std::ops::Deref; -/// ManagedTensor is a wrapper around a dlpack DLManagedTensor object. -/// This lets you pass matrices in device or host memory into cuvs. -#[derive(Debug)] -pub struct ManagedTensor(ffi::DLManagedTensor); +use tinyvec::TinyVec; -pub trait IntoDtype { - fn ffi_dtype() -> ffi::DLDataType; -} +pub use ffi::{DLDataType, DLDataTypeCode, DLDevice, DLDeviceType, DLManagedTensor, DLTensor}; -impl ManagedTensor { - pub fn as_ptr(&self) -> *mut ffi::DLManagedTensor { - &self.0 as *const _ as *mut _ - } +/// Number of dimensions kept inline before [`TensorDims`] spills to the heap. +/// 1-D and 2-D tensors are the norm; IVF-PQ codebooks need 3-D. +const INLINE_DIMS: usize = 3; - /// Creates a new ManagedTensor on the current GPU device, and copies - /// the data into it. - pub fn to_device(&self, res: &Resources) -> Result { - unsafe { - let bytes = dl_tensor_bytes(&self.0.dl_tensor); - let mut device_data: *mut std::ffi::c_void = std::ptr::null_mut(); +pub(crate) type TensorDims = TinyVec<[i64; INLINE_DIMS]>; - // allocate storage, copy over - check_cuvs(ffi::cuvsRMMAlloc(res.0, &mut device_data as *mut _, bytes))?; +/// Conversion into a read-only [`DLTensorView`] for tensor inputs. +/// +/// Implement this for your own tensor type by calling +/// [`DLTensorView::from_raw_parts`] inside a small `unsafe` block, upholding its +/// safety contract. +/// +/// # Examples +/// +/// A minimal adapter for a row-major matrix in device memory: +/// +/// ``` +/// use cuvs::dlpack::{DLDevice, DLDeviceType, DLPackError, DLTensorView, DType, IntoDlTensor}; +/// +/// struct GpuMatrix { +/// ptr: *mut T, +/// rows: usize, +/// cols: usize, +/// } +/// +/// impl<'a, T: DType> IntoDlTensor<'a> for &'a GpuMatrix { +/// fn into_dl_tensor(self) -> Result, DLPackError> { +/// let shape = [self.rows as i64, self.cols as i64]; +/// // SAFETY: `ptr` points to `rows * cols` initialized elements of `T` +/// // in device 0's memory, valid for `'a`, and is row-major contiguous. +/// unsafe { +/// DLTensorView::from_raw_parts( +/// self.ptr.cast(), +/// DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 }, +/// &shape, +/// None, +/// T::dl_dtype(), +/// ) +/// } +/// } +/// } +/// ``` +pub trait IntoDlTensor<'a> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError>; +} - let mut ret = ManagedTensor(self.0); - ret.0.dl_tensor.data = device_data; - ret.0.deleter = Some(rmm_free_tensor); - ret.0.dl_tensor.device.device_type = ffi::DLDeviceType::kDLCUDA; +/// A public conversion trait for writable tensor inputs. +/// +/// In addition to the [`DLTensorView::from_raw_parts`] invariants, writable +/// adapters must guarantee exclusive access to the data region for `'a`. +pub trait IntoDlTensorMut<'a> { + fn into_dl_tensor_mut(self) -> std::result::Result, DLPackError>; +} - check_cuvs(ffi::cuvsMatrixCopy(res.0, self.as_ptr(), ret.as_ptr()))?; +/// Maps a Rust element type to a DLPack [`DLDataType`]. +pub trait DType { + fn dl_dtype() -> ffi::DLDataType; +} - Ok(ret) +macro_rules! impl_dtype { + ($ty:ty, $code:expr, $bits:expr) => { + impl DType for $ty { + fn dl_dtype() -> ffi::DLDataType { + ffi::DLDataType { code: $code as u8, bits: $bits, lanes: 1 } + } } + }; +} + +impl_dtype!(f32, ffi::DLDataTypeCode::kDLFloat, 32); +impl_dtype!(f64, ffi::DLDataTypeCode::kDLFloat, 64); +impl_dtype!(i32, ffi::DLDataTypeCode::kDLInt, 32); +impl_dtype!(i64, ffi::DLDataTypeCode::kDLInt, 64); +impl_dtype!(u32, ffi::DLDataTypeCode::kDLUInt, 32); +impl_dtype!(u64, ffi::DLDataTypeCode::kDLUInt, 64); +impl_dtype!(u8, ffi::DLDataTypeCode::kDLUInt, 8); +impl_dtype!(i8, ffi::DLDataTypeCode::kDLInt, 8); +impl_dtype!(u16, ffi::DLDataTypeCode::kDLUInt, 16); +impl_dtype!(i16, ffi::DLDataTypeCode::kDLInt, 16); + +/// Error when converting an external tensor to a DLPack view. +#[derive(Debug, Clone, thiserror::Error)] +#[non_exhaustive] +pub enum DLPackError { + /// The tensor resides on a device not supported by cuVS. + #[error("unsupported tensor device: {0}")] + UnsupportedDevice(String), + /// The tensor dtype is not supported by the current adapter. + #[error("unsupported tensor dtype: {0}")] + UnsupportedDType(String), + /// A strides slice did not match the tensor rank. + #[error("strides length {strides} does not match tensor rank {ndim}")] + StridesLenMismatch { ndim: usize, strides: usize }, + /// The source tensor reported invalid DLPack metadata. + #[error("invalid DLPack metadata: {0}")] + InvalidMetadata(&'static str), +} + +/// A wrapper around [`ffi::DLManagedTensor`] with a lifetime attached. +pub(crate) struct ManagedTensorRef<'a> { + pub(crate) inner: ffi::DLManagedTensor, + _borrow: PhantomData<&'a ()>, +} + +impl ManagedTensorRef<'_> { + /// Returns a pointer to the inner [`ffi::DLManagedTensor`] for an FFI call. + /// + /// The pointer is valid only while this `ManagedTensorRef` is alive. This + /// covers calling `view.to_c().as_mut_ptr()` directly as an FFI argument, + /// because the temporary would live to the end of that statement. + pub(crate) fn as_mut_ptr(&mut self) -> *mut ffi::DLManagedTensor { + &mut self.inner } +} - /// Copies data from device memory into host memory - pub fn to_host< - T: IntoDtype, - S: ndarray::RawData + ndarray::RawDataMut, - D: ndarray::Dimension, - >( - &self, - res: &Resources, - arr: &mut ndarray::ArrayBase, - ) -> Result<()> { - unsafe { - let mut dst = self.0; - dst.dl_tensor.data = arr.as_mut_ptr() as *mut std::ffi::c_void; - dst.dl_tensor.device.device_type = ffi::DLDeviceType::kDLCPU; - dst.deleter = None; - - check_cuvs(ffi::cuvsMatrixCopy(res.0, self.as_ptr(), &mut dst))?; - Ok(()) +/// A non-owning, read-only DLPack tensor view. +#[must_use] +pub struct DLTensorView<'a> { + data: *mut std::ffi::c_void, + device: ffi::DLDevice, + dtype: ffi::DLDataType, + shape: TensorDims, + strides: Option, + _marker: PhantomData<&'a ()>, +} + +impl<'a> DLTensorView<'a> { + /// Construct a DLPack view from raw tensor metadata. + /// + /// # Safety + /// + /// The caller must guarantee that: + /// - `data` points to initialized storage matching `shape`, `strides`, and + /// `dtype`, residing on the device described by `device`; + /// - that storage remains valid for the lifetime `'a`; + /// - the C API consumes the resulting [`DLManagedTensor`] (including its + /// `shape`/`strides` pointers) only for the duration of the FFI call and + /// does not retain it afterward — cuVS upholds this. + pub unsafe fn from_raw_parts( + data: *mut std::ffi::c_void, + device: ffi::DLDevice, + shape: &[i64], + strides: Option<&[i64]>, + dtype: ffi::DLDataType, + ) -> std::result::Result { + if let Some(s) = strides + && s.len() != shape.len() + { + return Err(DLPackError::StridesLenMismatch { ndim: shape.len(), strides: s.len() }); } + Ok(Self { + data, + device, + dtype, + shape: shape.iter().copied().collect(), + strides: strides.map(|s| s.iter().copied().collect()), + _marker: PhantomData, + }) } -} -/// Figures out how many bytes are in a DLTensor -fn dl_tensor_bytes(tensor: &ffi::DLTensor) -> usize { - let mut bytes: usize = 1; - for dim in 0..tensor.ndim { - bytes *= unsafe { (*tensor.shape.add(dim as usize)) as usize }; - } - bytes *= (tensor.dtype.bits / 8) as usize; - bytes -} - -unsafe extern "C" fn rmm_free_tensor(self_: *mut ffi::DLManagedTensor) { - unsafe { - let bytes = dl_tensor_bytes(&(*self_).dl_tensor); - let res = Resources::new().unwrap(); - let _ = ffi::cuvsRMMFree(res.0, (*self_).dl_tensor.data as *mut _, bytes); - } -} - -/// Create a non-owning view of a Tensor from a ndarray -impl, D: ndarray::Dimension> - From<&ndarray::ArrayBase> for ManagedTensor -{ - fn from(arr: &ndarray::ArrayBase) -> Self { - // There is a draft PR out right now for creating dlpack directly from ndarray - // right now, but until its merged we have to implement ourselves - //https://github.com/rust-ndarray/ndarray/pull/1306/files - unsafe { - let mut ret = std::mem::MaybeUninit::::uninit(); - let tensor = ret.as_mut_ptr(); - (*tensor).data = arr.as_ptr() as *mut std::os::raw::c_void; - (*tensor).device = - ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 }; - (*tensor).byte_offset = 0; - (*tensor).strides = std::ptr::null_mut(); // TODO: error if not rowmajor - (*tensor).ndim = arr.ndim() as i32; - (*tensor).shape = arr.shape().as_ptr() as *mut _; - (*tensor).dtype = T::ffi_dtype(); - ManagedTensor(ffi::DLManagedTensor { - dl_tensor: ret.assume_init(), + /// Build a [`DLManagedTensor`] for an FFI call. + /// + /// DLPack stores `shape` and `strides` as mutable pointers, so this method + /// casts pointers derived from `&self` to `*mut` for C ABI compatibility. + /// The callee must treat them as `const`. + pub(crate) fn to_c(&self) -> ManagedTensorRef<'_> { + ManagedTensorRef { + inner: ffi::DLManagedTensor { + dl_tensor: ffi::DLTensor { + data: self.data, + device: self.device, + ndim: self.shape.len() as i32, + dtype: self.dtype, + shape: self.shape.as_ptr() as *mut _, + strides: self + .strides + .as_ref() + .map_or(std::ptr::null_mut(), |s| s.as_ptr() as *mut _), + byte_offset: 0, + }, manager_ctx: std::ptr::null_mut(), deleter: None, - }) + }, + _borrow: PhantomData, } } + + /// Number of dimensions. + pub fn ndim(&self) -> usize { + self.shape.len() + } + + /// Shape of the tensor. + pub fn shape(&self) -> &[i64] { + &self.shape + } + + /// Strides, if non-contiguous. `None` means row-major contiguous. + pub fn strides(&self) -> Option<&[i64]> { + self.strides.as_deref() + } + + /// Element data type. + pub fn dtype(&self) -> ffi::DLDataType { + self.dtype + } + + /// Device where the data resides. + pub fn device(&self) -> ffi::DLDevice { + self.device + } } -impl Drop for ManagedTensor { - fn drop(&mut self) { - unsafe { - if let Some(deleter) = self.0.deleter { - deleter(&mut self.0 as *mut _); - } - } +impl fmt::Debug for DLTensorView<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DLTensorView") + .field("shape", &self.shape.as_slice()) + .field("strides", &self.strides.as_deref()) + .finish() + } +} + +/// A non-owning, writable DLPack tensor view. +#[must_use] +pub struct DLTensorViewMut<'a> { + base: DLTensorView<'a>, + _unique: PhantomData<&'a mut ()>, +} + +impl<'a> DLTensorViewMut<'a> { + /// Construct a writable DLPack view from raw tensor metadata. + /// + /// # Safety + /// + /// In addition to the [`DLTensorView::from_raw_parts`] invariants, the + /// caller must guarantee the storage is exclusively writable for `'a`. + pub unsafe fn from_raw_parts( + data: *mut std::ffi::c_void, + device: ffi::DLDevice, + shape: &[i64], + strides: Option<&[i64]>, + dtype: ffi::DLDataType, + ) -> std::result::Result { + Ok(Self { + base: unsafe { DLTensorView::from_raw_parts(data, device, shape, strides, dtype)? }, + _unique: PhantomData, + }) + } +} + +impl<'a> Deref for DLTensorViewMut<'a> { + type Target = DLTensorView<'a>; + + fn deref(&self) -> &Self::Target { + &self.base + } +} + +impl fmt::Debug for DLTensorViewMut<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DLTensorViewMut") + .field("shape", &self.base.shape.as_slice()) + .field("strides", &self.base.strides.as_deref()) + .finish() } } -impl IntoDtype for f32 { - fn ffi_dtype() -> ffi::DLDataType { - ffi::DLDataType { code: ffi::DLDataTypeCode::kDLFloat as _, bits: 32, lanes: 1 } +impl<'a> IntoDlTensor<'a> for DLTensorView<'a> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + Ok(self) } } -impl IntoDtype for f64 { - fn ffi_dtype() -> ffi::DLDataType { - ffi::DLDataType { code: ffi::DLDataTypeCode::kDLFloat as _, bits: 64, lanes: 1 } +impl<'a> IntoDlTensor<'a> for DLTensorViewMut<'a> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + Ok(self.base) } } -impl IntoDtype for i32 { - fn ffi_dtype() -> ffi::DLDataType { - ffi::DLDataType { code: ffi::DLDataTypeCode::kDLInt as _, bits: 32, lanes: 1 } +impl<'a> IntoDlTensorMut<'a> for DLTensorViewMut<'a> { + fn into_dl_tensor_mut(self) -> std::result::Result, DLPackError> { + Ok(self) } } -impl IntoDtype for i64 { - fn ffi_dtype() -> ffi::DLDataType { - ffi::DLDataType { code: ffi::DLDataTypeCode::kDLInt as _, bits: 64, lanes: 1 } +impl<'a> IntoDlTensor<'a> for &DLTensorView<'a> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + Ok(DLTensorView { + data: self.data, + device: self.device, + dtype: self.dtype, + shape: self.shape.clone(), + strides: self.strides.clone(), + _marker: PhantomData, + }) } } -impl IntoDtype for u32 { - fn ffi_dtype() -> ffi::DLDataType { - ffi::DLDataType { code: ffi::DLDataTypeCode::kDLUInt as _, bits: 32, lanes: 1 } +// The result is deliberately bounded by the borrow `'b`, NOT by the data lifetime `'a`. +impl<'a, 'b> IntoDlTensor<'b> for &'b DLTensorViewMut<'a> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + self.deref().into_dl_tensor() } } @@ -155,20 +342,76 @@ impl IntoDtype for u32 { mod tests { use super::*; + fn cpu() -> DLDevice { + DLDevice { device_type: DLDeviceType::kDLCPU, device_id: 0 } + } + #[test] - fn test_from_ndarray() { - let arr = ndarray::Array::::zeros((8, 4)); + fn to_c_translates_contiguous_metadata() { + let data = [0.0f32; 6]; + let view = unsafe { + DLTensorView::from_raw_parts( + data.as_ptr() as *mut _, + cpu(), + &[2, 3], + None, + f32::dl_dtype(), + ) + } + .unwrap(); - let tensor = unsafe { (*(ManagedTensor::from(&arr).as_ptr())).dl_tensor }; + let managed = view.to_c(); + let t = &managed.inner.dl_tensor; - assert_eq!(tensor.ndim, 2); + assert_eq!(t.ndim, 2); + assert_eq!(t.data as *const f32, data.as_ptr()); + assert_eq!(t.dtype.code, DLDataTypeCode::kDLFloat as u8); + assert_eq!(t.dtype.bits, 32); + assert_eq!(t.dtype.lanes, 1); + assert_eq!(t.byte_offset, 0); + assert!(managed.inner.manager_ctx.is_null()); + assert!(managed.inner.deleter.is_none()); - // make sure we can get the shape ok - assert_eq!(unsafe { *tensor.shape }, 8); - assert_eq!(unsafe { *tensor.shape.add(1) }, 4); + assert_eq!(unsafe { std::slice::from_raw_parts(t.shape, 2) }, &[2, 3]); + assert!(t.strides.is_null()); + } + + #[test] + fn to_c_preserves_explicit_strides() { + let data = [0.0f32; 6]; + let view = unsafe { + DLTensorView::from_raw_parts( + data.as_ptr() as *mut _, + cpu(), + &[2, 3], + Some(&[1, 2]), + f32::dl_dtype(), + ) + } + .unwrap(); + + let managed = view.to_c(); + let t = &managed.inner.dl_tensor; + + assert_eq!(unsafe { std::slice::from_raw_parts(t.shape, 2) }, &[2, 3]); + assert!(!t.strides.is_null()); + assert_eq!(unsafe { std::slice::from_raw_parts(t.strides, 2) }, &[1, 2]); + } + + #[test] + fn from_raw_parts_rejects_mismatched_strides_len() { + let data = [0.0f32; 6]; + let err = unsafe { + DLTensorView::from_raw_parts( + data.as_ptr() as *mut _, + cpu(), + &[2, 3], + Some(&[1]), + f32::dl_dtype(), + ) + } + .unwrap_err(); - let arr = ndarray::Array::::zeros((8,)); - let tensor = unsafe { (*(ManagedTensor::from(&arr).as_ptr())).dl_tensor }; - assert_eq!(tensor.ndim, 1); + assert!(matches!(err, DLPackError::StridesLenMismatch { ndim: 2, strides: 1 })); } } diff --git a/rust/cuvs/src/error.rs b/rust/cuvs/src/error.rs index f9d5879a7b..f2942f5603 100644 --- a/rust/cuvs/src/error.rs +++ b/rust/cuvs/src/error.rs @@ -14,6 +14,8 @@ pub struct CuvsError { #[derive(Debug, Clone)] pub enum Error { CuvsError(CuvsError), + /// Tensor conversion into DLPack metadata failed. + DLPack(crate::dlpack::DLPackError), /// The caller passed an argument that could not be forwarded to the C API /// (e.g. a filename containing an interior NUL byte or invalid UTF-8). InvalidArgument(String), @@ -28,6 +30,7 @@ impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Error::CuvsError(cuvs_error) => write!(f, "cuvsError={:?}", cuvs_error), + Error::DLPack(err) => write!(f, "DLPack error: {}", err), Error::InvalidArgument(msg) => write!(f, "invalid argument: {}", msg), } } @@ -55,3 +58,9 @@ pub fn check_cuvs(err: ffi::cuvsError_t) -> Result<()> { } } } + +impl From for Error { + fn from(err: crate::dlpack::DLPackError) -> Self { + Self::DLPack(err) + } +} diff --git a/rust/cuvs/src/ivf_flat/index.rs b/rust/cuvs/src/ivf_flat/index.rs index a602d64c05..cdf806a149 100644 --- a/rust/cuvs/src/ivf_flat/index.rs +++ b/rust/cuvs/src/ivf_flat/index.rs @@ -5,7 +5,7 @@ use std::io::{Write, stderr}; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Result, check_cuvs}; use crate::ivf_flat::{IndexParams, SearchParams}; use crate::resources::Resources; @@ -15,22 +15,29 @@ use crate::resources::Resources; pub struct Index(ffi::cuvsIvfFlatIndex_t); impl Index { - /// Builds a new Index from the dataset for efficient search. + /// Builds an IVF-Flat index over `dataset` for efficient search. /// - /// # Arguments + /// `dataset` is a row-major matrix on the host or device implementing + /// [`IntoDlTensor`](crate::IntoDlTensor). It is copied into the index, so the + /// caller may free it once this call returns (hence `Index` carries no + /// lifetime). /// - /// * `res` - Resources to use - /// * `params` - Parameters for building the index - /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + /// Supported dataset/query dtypes in the current C-backed implementation are + /// `f32`, `f16`, `i8`, and `u8`. + pub fn build<'a>( res: &Resources, params: &IndexParams, - dataset: T, + dataset: impl IntoDlTensor<'a>, ) -> Result { - let dataset: ManagedTensor = dataset.into(); + let dataset = dataset.into_dl_tensor()?; let index = Index::new()?; unsafe { - check_cuvs(ffi::cuvsIvfFlatBuild(res.0, params.0, dataset.as_ptr(), index.0))?; + check_cuvs(ffi::cuvsIvfFlatBuild( + res.0, + params.0, + dataset.to_c().as_mut_ptr(), + index.0, + ))?; } Ok(index) } @@ -44,23 +51,24 @@ impl Index { } } - /// Perform a Approximate Nearest Neighbors search on the Index - /// - /// # Arguments + /// Searches the index for the `k` nearest neighbors of each query. /// - /// * `res` - Resources to use - /// * `params` - Parameters to use in searching the index - /// * `queries` - A matrix in device memory to query for - /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors - /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors - pub fn search( + /// `queries`, `neighbors`, and `distances` must reside in device memory and + /// implement [`IntoDlTensor`](crate::IntoDlTensor) / + /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` receives the + /// neighbor indices and `distances` their distances; both are written in + /// place. + pub fn search<'a>( &self, res: &Resources, params: &SearchParams, - queries: &ManagedTensor, - neighbors: &ManagedTensor, - distances: &ManagedTensor, + queries: impl IntoDlTensor<'a>, + neighbors: impl IntoDlTensorMut<'a>, + distances: impl IntoDlTensorMut<'a>, ) -> Result<()> { + let queries = queries.into_dl_tensor()?; + let neighbors = neighbors.into_dl_tensor_mut()?; + let distances = distances.into_dl_tensor_mut()?; unsafe { let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER }; @@ -68,9 +76,9 @@ impl Index { res.0, params.0, self.0, - queries.as_ptr(), - neighbors.as_ptr(), - distances.as_ptr(), + queries.to_c().as_mut_ptr(), + neighbors.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), prefilter, )) } @@ -89,6 +97,7 @@ impl Drop for Index { #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray::s; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -102,39 +111,41 @@ mod tests { // Create a new random dataset to index let n_datapoints = 1024; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); // build the ivf-flat index - let index = Index::build(&res, &build_params, dataset_device) + let index = Index::build(&res, &build_params, &dataset_device) .expect("failed to create ivf-flat index"); // use the first 4 points from the dataset as queries : will test that we get them back // as their own nearest neighbor let n_queries = 4; - let queries = dataset.slice(s![0..n_queries, ..]); + let queries = dataset.slice(s![0..n_queries, ..]).to_owned(); let k = 10; // IvfFlat search API requires queries and outputs to be on device memory // copy query data over, and allocate new device memory for the distances/ neighbors // outputs - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let search_params = SearchParams::new().unwrap(); - index.search(&res, &search_params, &queries, &neighbors, &distances).unwrap(); + index.search(&res, &search_params, &queries, &mut neighbors, &mut distances).unwrap(); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); // nearest neighbors should be themselves, since queries are from the // dataset @@ -154,13 +165,15 @@ mod tests { // Create a random dataset let n_datapoints = 1024; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); // Build the index once - let index = Index::build(&res, &build_params, dataset_device) + let index = Index::build(&res, &build_params, &dataset_device) .expect("failed to create ivf-flat index"); let search_params = SearchParams::new().unwrap(); @@ -169,23 +182,23 @@ mod tests { // Perform multiple searches on the same index for search_iter in 0..3 { let n_queries = 4; - let queries = dataset.slice(s![0..n_queries, ..]); - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = dataset.slice(s![0..n_queries, ..]).to_owned(); + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); // This should work on every iteration because search() takes &self index - .search(&res, &search_params, &queries, &neighbors, &distances) + .search(&res, &search_params, &queries, &mut neighbors, &mut distances) .expect(&format!("search iteration {} failed", search_iter)); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); // Verify results are consistent assert_eq!( diff --git a/rust/cuvs/src/ivf_flat/mod.rs b/rust/cuvs/src/ivf_flat/mod.rs index 7417116965..b3fdcc0ed4 100644 --- a/rust/cuvs/src/ivf_flat/mod.rs +++ b/rust/cuvs/src/ivf_flat/mod.rs @@ -1,71 +1,18 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -//! The IVF-Flat method is an ANN algorithm. It uses an inverted file index (IVF) with -//! unmodified (that is, flat) vectors. This algorithm provides simple knobs to reduce -//! the overall search space and to trade-off accuracy for speed. -//! -//! Example: -//! ``` -//! -//! use cuvs::ivf_flat::{Index, IndexParams, SearchParams}; -//! use cuvs::{ManagedTensor, Resources, Result}; -//! -//! use ndarray::s; -//! use ndarray_rand::rand_distr::Uniform; -//! use ndarray_rand::RandomExt; -//! -//! fn ivf_flat_example() -> Result<()> { -//! let res = Resources::new()?; -//! -//! // Create a new random dataset to index -//! let n_datapoints = 65536; -//! let n_features = 512; -//! let dataset = -//! ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); -//! -//! // build the ivf-flat index -//! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; -//! println!( -//! "Indexed {}x{} datapoints into ivf-flat index", -//! n_datapoints, n_features -//! ); -//! -//! // use the first 4 points from the dataset as queries : will test that we get them back -//! // as their own nearest neighbor -//! let n_queries = 4; -//! let queries = dataset.slice(s![0..n_queries, ..]); -//! -//! let k = 10; -//! -//! // Ivf-Flat search API requires queries and outputs to be on device memory -//! // copy query data over, and allocate new device memory for the distances/ neighbors -//! // outputs -//! let queries = ManagedTensor::from(&queries).to_device(&res)?; -//! let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); -//! let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?; -//! -//! let mut distances_host = ndarray::Array::::zeros((n_queries, k)); -//! let distances = ManagedTensor::from(&distances_host).to_device(&res)?; -//! -//! let search_params = SearchParams::new()?; -//! -//! index.search(&res, &search_params, &queries, &neighbors, &distances)?; -//! -//! // Copy back to host memory -//! distances.to_host(&res, &mut distances_host)?; -//! neighbors.to_host(&res, &mut neighbors_host)?; -//! -//! // nearest neighbors should be themselves, since queries are from the -//! // dataset -//! println!("Neighbors {:?}", neighbors_host); -//! println!("Distances {:?}", distances_host); -//! Ok(()) -//! } -//! ``` +//! IVF-Flat: an inverted-file index over uncompressed ("flat") vectors. It +//! partitions the dataset into `n_lists` clusters and, at query time, scans only +//! the `n_probes` closest clusters — a simple knob to trade recall for speed. +//! +//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with +//! device-resident queries and output buffers. Tensors are passed through the +//! [`IntoDlTensor`](crate::IntoDlTensor) / +//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs` +//! for the same build/search workflow. mod index; mod index_params; diff --git a/rust/cuvs/src/ivf_pq/index.rs b/rust/cuvs/src/ivf_pq/index.rs index 492fefa0f1..e2ce741a33 100644 --- a/rust/cuvs/src/ivf_pq/index.rs +++ b/rust/cuvs/src/ivf_pq/index.rs @@ -5,7 +5,7 @@ use std::io::{Write, stderr}; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{IntoDlTensor, IntoDlTensorMut}; use crate::error::{Result, check_cuvs}; use crate::ivf_pq::{IndexParams, SearchParams}; use crate::resources::Resources; @@ -15,22 +15,24 @@ use crate::resources::Resources; pub struct Index(ffi::cuvsIvfPqIndex_t); impl Index { - /// Builds a new Index from the dataset for efficient search. + /// Builds an IVF-PQ index over `dataset` for efficient search. /// - /// # Arguments + /// `dataset` is a row-major matrix on the host or device implementing + /// [`IntoDlTensor`](crate::IntoDlTensor). It is copied (and quantized) into + /// the index, so the caller may free it once this call returns (hence + /// `Index` carries no lifetime). /// - /// * `res` - Resources to use - /// * `params` - Parameters for building the index - /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + /// Supported dataset/query dtypes in the current C-backed implementation are + /// `f32`, `f16`, `i8`, and `u8`. + pub fn build<'a>( res: &Resources, params: &IndexParams, - dataset: T, + dataset: impl IntoDlTensor<'a>, ) -> Result { - let dataset: ManagedTensor = dataset.into(); + let dataset = dataset.into_dl_tensor()?; let index = Index::new()?; unsafe { - check_cuvs(ffi::cuvsIvfPqBuild(res.0, params.0, dataset.as_ptr(), index.0))?; + check_cuvs(ffi::cuvsIvfPqBuild(res.0, params.0, dataset.to_c().as_mut_ptr(), index.0))?; } Ok(index) } @@ -44,31 +46,32 @@ impl Index { } } - /// Perform a Approximate Nearest Neighbors search on the Index + /// Searches the index for the `k` nearest neighbors of each query. /// - /// # Arguments - /// - /// * `res` - Resources to use - /// * `params` - Parameters to use in searching the index - /// * `queries` - A matrix in device memory to query for - /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors - /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors - pub fn search( + /// `queries`, `neighbors`, and `distances` must reside in device memory and + /// implement [`IntoDlTensor`](crate::IntoDlTensor) / + /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` receives the + /// neighbor indices and `distances` their distances; both are written in + /// place. + pub fn search<'a>( &self, res: &Resources, params: &SearchParams, - queries: &ManagedTensor, - neighbors: &ManagedTensor, - distances: &ManagedTensor, + queries: impl IntoDlTensor<'a>, + neighbors: impl IntoDlTensorMut<'a>, + distances: impl IntoDlTensorMut<'a>, ) -> Result<()> { + let queries = queries.into_dl_tensor()?; + let neighbors = neighbors.into_dl_tensor_mut()?; + let distances = distances.into_dl_tensor_mut()?; unsafe { check_cuvs(ffi::cuvsIvfPqSearch( res.0, params.0, self.0, - queries.as_ptr(), - neighbors.as_ptr(), - distances.as_ptr(), + queries.to_c().as_mut_ptr(), + neighbors.to_c().as_mut_ptr(), + distances.to_c().as_mut_ptr(), )) } } @@ -86,6 +89,7 @@ impl Drop for Index { #[cfg(test)] mod tests { use super::*; + use crate::test_utils::DeviceTensor; use ndarray::s; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -99,39 +103,41 @@ mod tests { // Create a new random dataset to index let n_datapoints = 1024; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); // build the ivf-pq index - let index = Index::build(&res, &build_params, dataset_device) + let index = Index::build(&res, &build_params, &dataset_device) .expect("failed to create ivf-pq index"); // use the first 4 points from the dataset as queries : will test that we get them back // as their own nearest neighbor let n_queries = 4; - let queries = dataset.slice(s![0..n_queries, ..]); + let queries = dataset.slice(s![0..n_queries, ..]).to_owned(); let k = 10; // Ivf-Pq search API requires queries and outputs to be on device memory // copy query data over, and allocate new device memory for the distances/ neighbors // outputs - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let search_params = SearchParams::new().unwrap(); - index.search(&res, &search_params, &queries, &neighbors, &distances).unwrap(); + index.search(&res, &search_params, &queries, &mut neighbors, &mut distances).unwrap(); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); // nearest neighbors should be themselves, since queries are from the // dataset @@ -151,13 +157,15 @@ mod tests { // Create a random dataset let n_datapoints = 1024; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); // Build the index once - let index = Index::build(&res, &build_params, dataset_device) + let index = Index::build(&res, &build_params, &dataset_device) .expect("failed to create ivf-pq index"); let search_params = SearchParams::new().unwrap(); @@ -166,23 +174,23 @@ mod tests { // Perform multiple searches on the same index for search_iter in 0..3 { let n_queries = 4; - let queries = dataset.slice(s![0..n_queries, ..]); - let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let queries = dataset.slice(s![0..n_queries, ..]).to_owned(); + let queries = DeviceTensor::from_host(&res, &queries).unwrap(); let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); - let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap(); + let mut neighbors = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); let mut distances_host = ndarray::Array::::zeros((n_queries, k)); - let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap(); + let mut distances = DeviceTensor::::zeros(&res, &[n_queries, k]).unwrap(); // This should work on every iteration because search() takes &self index - .search(&res, &search_params, &queries, &neighbors, &distances) + .search(&res, &search_params, &queries, &mut neighbors, &mut distances) .expect(&format!("search iteration {} failed", search_iter)); // Copy back to host memory - distances.to_host(&res, &mut distances_host).unwrap(); - neighbors.to_host(&res, &mut neighbors_host).unwrap(); + distances.copy_to_host(&res, &mut distances_host).unwrap(); + neighbors.copy_to_host(&res, &mut neighbors_host).unwrap(); // Verify results are consistent assert_eq!( diff --git a/rust/cuvs/src/ivf_pq/mod.rs b/rust/cuvs/src/ivf_pq/mod.rs index c4676cd1aa..dce6c9810a 100644 --- a/rust/cuvs/src/ivf_pq/mod.rs +++ b/rust/cuvs/src/ivf_pq/mod.rs @@ -1,68 +1,18 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -//! Inverted File Product Quantization -//! -//! Example: -//! ``` -//! -//! use cuvs::ivf_pq::{Index, IndexParams, SearchParams}; -//! use cuvs::{ManagedTensor, Resources, Result}; -//! -//! use ndarray::s; -//! use ndarray_rand::rand_distr::Uniform; -//! use ndarray_rand::RandomExt; -//! -//! fn ivf_pq_example() -> Result<()> { -//! let res = Resources::new()?; -//! -//! // Create a new random dataset to index -//! let n_datapoints = 65536; -//! let n_features = 512; -//! let dataset = -//! ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); -//! -//! // build the ivf-pq index -//! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; -//! println!( -//! "Indexed {}x{} datapoints into ivf-pq index", -//! n_datapoints, n_features -//! ); -//! -//! // use the first 4 points from the dataset as queries : will test that we get them back -//! // as their own nearest neighbor -//! let n_queries = 4; -//! let queries = dataset.slice(s![0..n_queries, ..]); -//! -//! let k = 10; -//! -//! // Ivf-Pq search API requires queries and outputs to be on device memory -//! // copy query data over, and allocate new device memory for the distances/ neighbors -//! // outputs -//! let queries = ManagedTensor::from(&queries).to_device(&res)?; -//! let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); -//! let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?; -//! -//! let mut distances_host = ndarray::Array::::zeros((n_queries, k)); -//! let distances = ManagedTensor::from(&distances_host).to_device(&res)?; -//! -//! let search_params = SearchParams::new()?; -//! -//! index.search(&res, &search_params, &queries, &neighbors, &distances)?; -//! -//! // Copy back to host memory -//! distances.to_host(&res, &mut distances_host)?; -//! neighbors.to_host(&res, &mut neighbors_host)?; -//! -//! // nearest neighbors should be themselves, since queries are from the -//! // dataset -//! println!("Neighbors {:?}", neighbors_host); -//! println!("Distances {:?}", distances_host); -//! Ok(()) -//! } -//! ``` +//! IVF-PQ: an inverted-file index that product-quantizes the vectors. Like +//! IVF-Flat it partitions the dataset into `n_lists` clusters and scans the +//! `n_probes` closest at query time, but compresses each vector into `pq_dim` +//! codes of `pq_bits` bits — much smaller, slightly less accurate. +//! +//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with +//! device-resident queries and output buffers. Tensors are passed through the +//! [`IntoDlTensor`](crate::IntoDlTensor) / +//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the +//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs` +//! for the same build/search workflow. mod index; mod index_params; diff --git a/rust/cuvs/src/lib.rs b/rust/cuvs/src/lib.rs index 519519440b..dea2329336 100644 --- a/rust/cuvs/src/lib.rs +++ b/rust/cuvs/src/lib.rs @@ -14,13 +14,17 @@ pub mod cagra; pub mod cluster; pub mod distance; pub mod distance_type; -mod dlpack; +pub mod dlpack; mod error; pub mod ivf_flat; pub mod ivf_pq; mod resources; +#[cfg(test)] +pub(crate) mod test_utils; pub mod vamana; -pub use dlpack::ManagedTensor; +pub use dlpack::{ + DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor, IntoDlTensorMut, +}; pub use error::{Error, Result}; pub use resources::Resources; diff --git a/rust/cuvs/src/resources.rs b/rust/cuvs/src/resources.rs index 70f128abb7..f6e9c40626 100644 --- a/rust/cuvs/src/resources.rs +++ b/rust/cuvs/src/resources.rs @@ -23,7 +23,12 @@ impl Resources { } /// Sets the current cuda stream - pub fn set_cuda_stream(&self, stream: ffi::cudaStream_t) -> Result<()> { + /// + /// # Safety + /// + /// `stream` must be a valid CUDA stream that remains valid for as long as it + /// is used by this resources handle. + pub unsafe fn set_cuda_stream(&self, stream: ffi::cudaStream_t) -> Result<()> { unsafe { check_cuvs(ffi::cuvsStreamSet(self.0, stream)) } } diff --git a/rust/cuvs/src/test_utils.rs b/rust/cuvs/src/test_utils.rs new file mode 100644 index 0000000000..192fba39f8 --- /dev/null +++ b/rust/cuvs/src/test_utils.rs @@ -0,0 +1,236 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +//! Test-only tensor adapters. +//! +//! [`DeviceTensor`] is an RMM-backed device matrix, and the `ndarray` host +//! adapters below implement [`IntoDlTensor`]/[`IntoDlTensorMut`] for plain host +//! arrays. We use `ndarray` only as a dev-dependency to assist with unit tests. + +use std::marker::PhantomData; + +use crate::dlpack::{ + DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor, IntoDlTensorMut, +}; +use crate::error::{Result, check_cuvs}; +use crate::ffi; +use crate::resources::Resources; + +pub(crate) struct DeviceTensor<'res, T: DType> { + data: *mut std::ffi::c_void, + shape: Vec, + capacity_bytes: usize, + resources: &'res Resources, + _marker: PhantomData, +} + +impl<'res, T: DType> DeviceTensor<'res, T> { + pub(crate) fn zeros(res: &'res Resources, shape: &[usize]) -> Result { + let capacity_bytes = shape.iter().product::() * std::mem::size_of::(); + let mut data: *mut std::ffi::c_void = std::ptr::null_mut(); + unsafe { + check_cuvs(ffi::cuvsRMMAlloc(res.0, &mut data, capacity_bytes))?; + } + + Ok(Self { + data, + shape: shape.iter().map(|&dim| dim as i64).collect(), + capacity_bytes, + resources: res, + _marker: PhantomData, + }) + } + + pub(crate) fn from_host(res: &'res Resources, host: &ndarray::ArrayRef) -> Result + where + D: ndarray::Dimension, + { + let shape: Vec = host.shape().to_vec(); + let mut device = Self::zeros(res, &shape)?; + + let host_shape: Vec = host.shape().iter().map(|&dim| dim as i64).collect(); + let host_strides: Option> = if host.is_standard_layout() { + None + } else { + Some(host.strides().iter().map(|&stride| stride as i64).collect()) + }; + let host = unsafe { + DLTensorView::from_raw_parts( + host.as_ptr() as *mut _, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 }, + &host_shape, + host_strides.as_deref(), + T::dl_dtype(), + )? + }; + let device_view = (&mut device).into_dl_tensor_mut()?; + unsafe { + check_cuvs(ffi::cuvsMatrixCopy( + res.0, + host.to_c().as_mut_ptr(), + device_view.to_c().as_mut_ptr(), + ))?; + } + + Ok(device) + } + + pub(crate) fn copy_to_host( + &self, + res: &Resources, + host: &mut ndarray::ArrayRef, + ) -> Result<()> + where + D: ndarray::Dimension, + { + let host_shape: Vec = host.shape().iter().map(|&dim| dim as i64).collect(); + let host_strides: Option> = if host.is_standard_layout() { + None + } else { + Some(host.strides().iter().map(|&stride| stride as i64).collect()) + }; + let host = unsafe { + DLTensorViewMut::from_raw_parts( + host.as_mut_ptr() as *mut _, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 }, + &host_shape, + host_strides.as_deref(), + T::dl_dtype(), + )? + }; + let device = self.into_dl_tensor()?; + unsafe { + check_cuvs(ffi::cuvsMatrixCopy( + res.0, + device.to_c().as_mut_ptr(), + host.to_c().as_mut_ptr(), + ))?; + } + + Ok(()) + } +} + +impl Drop for DeviceTensor<'_, T> { + fn drop(&mut self) { + if !self.data.is_null() { + let _ = unsafe { ffi::cuvsRMMFree(self.resources.0, self.data, self.capacity_bytes) }; + } + } +} + +impl<'a, 'res, T: DType> IntoDlTensor<'a> for &'a DeviceTensor<'res, T> { + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + unsafe { + DLTensorView::from_raw_parts( + self.data, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCUDA, device_id: 0 }, + &self.shape, + None, + T::dl_dtype(), + ) + } + } +} + +impl<'a, 'res, T: DType> IntoDlTensorMut<'a> for &'a mut DeviceTensor<'res, T> { + fn into_dl_tensor_mut(self) -> std::result::Result, DLPackError> { + unsafe { + DLTensorViewMut::from_raw_parts( + self.data, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCUDA, device_id: 0 }, + &self.shape, + None, + T::dl_dtype(), + ) + } + } +} + +// --------------------------------------------------------------------------- +// ndarray host adapters (test-only) +// --------------------------------------------------------------------------- + +fn array_layout(arr: &ndarray::ArrayRef) -> (Vec, Option>) +where + D: ndarray::Dimension, +{ + let shape = arr.shape().iter().map(|&d| d as i64).collect(); + let strides = if arr.is_standard_layout() { + None + } else { + Some(arr.strides().iter().map(|&s| s as i64).collect()) + }; + (shape, strides) +} + +impl<'a, A, D> IntoDlTensor<'a> for &'a ndarray::ArrayRef +where + A: DType, + D: ndarray::Dimension, +{ + fn into_dl_tensor(self) -> std::result::Result, DLPackError> { + let (shape, strides) = array_layout(self); + unsafe { + DLTensorView::from_raw_parts( + self.as_ptr() as *mut _, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 }, + &shape, + strides.as_deref(), + A::dl_dtype(), + ) + } + } +} + +impl<'a, A, D> IntoDlTensorMut<'a> for &'a mut ndarray::ArrayRef +where + A: DType, + D: ndarray::Dimension, +{ + fn into_dl_tensor_mut(self) -> std::result::Result, DLPackError> { + let (shape, strides) = array_layout(self); + unsafe { + DLTensorViewMut::from_raw_parts( + self.as_mut_ptr() as *mut _, + ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 }, + &shape, + strides.as_deref(), + A::dl_dtype(), + ) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // The ndarray adapter must report a contiguous array as contiguous (no + // strides) with the correct shape, device, and dtype. + #[test] + fn ndarray_contiguous_view_omits_strides() { + let arr = ndarray::Array2::::zeros((10, 20)); + let view = (&arr).into_dl_tensor().unwrap(); + + assert_eq!(view.shape(), &[10, 20]); + assert!(view.strides().is_none()); + assert_eq!(view.device().device_type, ffi::DLDeviceType::kDLCPU); + assert_eq!(view.dtype().code, ffi::DLDataTypeCode::kDLFloat as u8); + assert_eq!(view.dtype().bits, 32); + } + + // A non-standard layout must surface explicit strides so cuVS reads the + // data in the right order. + #[test] + fn ndarray_transposed_view_reports_strides() { + let arr = ndarray::Array2::::zeros((10, 20)); + let transposed = arr.t(); + let view = (&transposed).into_dl_tensor().unwrap(); + + assert_eq!(view.shape(), &[20, 10]); + assert_eq!(view.strides(), Some(&[1i64, 20][..])); + } +} diff --git a/rust/cuvs/src/vamana/index.rs b/rust/cuvs/src/vamana/index.rs index 485f8ac008..975fb79006 100644 --- a/rust/cuvs/src/vamana/index.rs +++ b/rust/cuvs/src/vamana/index.rs @@ -6,7 +6,7 @@ use std::ffi::CString; use std::io::{Write, stderr}; -use crate::dlpack::ManagedTensor; +use crate::dlpack::IntoDlTensor; use crate::error::{Result, check_cuvs}; use crate::resources::Resources; use crate::vamana::IndexParams; @@ -24,21 +24,25 @@ impl Index { /// all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied /// to improve graph quality. The index_params struct controls the degree of the final graph. /// - /// - /// # Arguments - /// - /// * `res` - Resources to use - /// * `params` - Parameters for building the index - /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + /// `dataset` is a row-major matrix on the host or device implementing + /// [`IntoDlTensor`](crate::IntoDlTensor); it is copied into the index. + pub fn build<'a>( res: &Resources, params: &IndexParams, - dataset: T, + dataset: impl IntoDlTensor<'a>, ) -> Result { - let dataset: ManagedTensor = dataset.into(); + let dataset = dataset.into_dl_tensor()?; let index = Index::new()?; + // `cuvsVamanaBuild` copies the dataset into the index, so the index does not + // retain a view into `dataset`; the borrow only needs to be valid for the + // duration of this call. That is why `Index` carries no lifetime. unsafe { - check_cuvs(ffi::cuvsVamanaBuild(res.0, params.0, dataset.as_ptr(), index.0))?; + check_cuvs(ffi::cuvsVamanaBuild( + res.0, + params.0, + dataset.to_c().as_mut_ptr(), + index.0, + ))?; } Ok(index) } @@ -88,7 +92,7 @@ impl Drop for Index { #[cfg(test)] mod tests { use super::*; - + use crate::test_utils::DeviceTensor; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -101,13 +105,15 @@ mod tests { // Create a new random dataset to index let n_datapoints = 1024; let n_features = 16; - let dataset = - ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + let dataset = ndarray::Array::::random( + (n_datapoints, n_features), + Uniform::new(0., 1.0).unwrap(), + ); - let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap(); // build the vamana index - let _index = Index::build(&res, &build_params, dataset_device) + let _index = Index::build(&res, &build_params, &dataset_device) .expect("failed to create vamana index"); } } diff --git a/rust/cuvs/src/vamana/mod.rs b/rust/cuvs/src/vamana/mod.rs index a3ae4ee9ff..4dd2434407 100644 --- a/rust/cuvs/src/vamana/mod.rs +++ b/rust/cuvs/src/vamana/mod.rs @@ -2,7 +2,11 @@ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -//! Vamana +//! Vamana: builds a DiskANN-style Vamana graph over a dataset. +//! +//! Build an [`Index`] from a dataset (then typically serialize it). The dataset +//! is passed through the [`IntoDlTensor`](crate::IntoDlTensor) trait; see the +//! [`dlpack`](crate::dlpack) module for the tensor model. mod index; mod index_params;