Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions rust/cuvs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ doc-only = ["cuvs-sys/doc-only"]

[dependencies]
cuvs-sys = { workspace = true }
ndarray = "0.15"
thiserror = "2"
tinyvec = { version = "1", features = ["alloc"] }

[dev-dependencies]
ndarray-rand = "0.14"
ndarray = "0.17"
ndarray-rand = "0.16"

[package.metadata.docs.rs]
features = ["doc-only"]
218 changes: 188 additions & 30 deletions rust/cuvs/examples/cagra.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,220 @@
* SPDX-License-Identifier: Apache-2.0
*/

//! CAGRA example with a user-provided GPU tensor.
//!
//! This demonstrates how to feed your own device memory into cuVS by
//! implementing the public [`IntoDlTensor`]/[`IntoDlTensorMut`] traits. The
//! [`CudaTensor`] type manages device memory directly through the CUDA runtime
//! (`cudaMalloc`/`cudaFree`) and copies to/from host arrays with `cudaMemcpyAsync`
//! on the cuVS stream, reusing the resources handle's `get_cuda_stream`/
//! `sync_stream` for stream access and synchronization.
//!
//! A real application would likely rely on a helper crate such as `cudarc`
//! and its `CudaSlice`.

use std::ffi::c_void;
use std::marker::PhantomData;
use std::os::raw::c_int;

use cuvs::Resources;
use cuvs::cagra::{Index, IndexParams, SearchParams};
use cuvs::{ManagedTensor, Resources, Result};
use cuvs::dlpack::{
DLDevice, DLDeviceType, DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor,
IntoDlTensorMut,
};

use ndarray::s;
use ndarray_rand::RandomExt;
use ndarray_rand::rand_distr::Uniform;

/// Example showing how to index and search data with CAGRA
fn cagra_example() -> Result<()> {
type ExampleResult<T> = std::result::Result<T, Box<dyn std::error::Error>>;

// ---------------------------------------------------------------------------
// Minimal CUDA runtime FFI
// ---------------------------------------------------------------------------

#[allow(non_camel_case_types)]
type cudaError_t = c_int;
const CUDA_SUCCESS: cudaError_t = 0;
const CUDA_MEMCPY_HOST_TO_DEVICE: c_int = 1;
const CUDA_MEMCPY_DEVICE_TO_HOST: c_int = 2;

#[link(name = "cudart")]
unsafe extern "C" {
fn cudaMalloc(ptr: *mut *mut c_void, size: usize) -> cudaError_t;
fn cudaFree(ptr: *mut c_void) -> cudaError_t;
fn cudaMemcpyAsync(
dst: *mut c_void,
src: *const c_void,
count: usize,
kind: c_int,
stream: cuvs_sys::cudaStream_t,
) -> cudaError_t;
}

fn check_cuda(status: cudaError_t) -> ExampleResult<()> {
if status == CUDA_SUCCESS {
Ok(())
} else {
Err(format!("CUDA runtime error: {status}").into())
}
}

// ---------------------------------------------------------------------------
// A custom device tensor backed by the CUDA runtime
// ---------------------------------------------------------------------------

struct CudaTensor<T: DType> {
data: *mut c_void,
shape: Vec<i64>,
bytes: usize,
_marker: PhantomData<T>,
}

impl<T: DType> CudaTensor<T> {
/// Allocate an uninitialized device buffer (used for search outputs).
fn alloc(shape: &[usize]) -> ExampleResult<Self> {
let bytes = shape.iter().product::<usize>() * std::mem::size_of::<T>();
let mut data: *mut c_void = std::ptr::null_mut();
check_cuda(unsafe { cudaMalloc(&mut data, bytes) })?;
Ok(Self {
data,
shape: shape.iter().map(|&d| d as i64).collect(),
bytes,
_marker: PhantomData,
})
}

/// Copy a contiguous host array onto the device.
fn from_host<D>(res: &Resources, host: &ndarray::ArrayRef<T, D>) -> ExampleResult<Self>
where
D: ndarray::Dimension,
{
if !host.is_standard_layout() {
return Err("host array must be contiguous (row-major)".into());
}
let tensor = Self::alloc(host.shape())?;

let stream = res.get_cuda_stream()?;
check_cuda(unsafe {
cudaMemcpyAsync(
tensor.data,
host.as_ptr() as *const c_void,
tensor.bytes,
CUDA_MEMCPY_HOST_TO_DEVICE,
stream,
)
})?;
res.sync_stream()?;

Ok(tensor)
}

/// Copy the device buffer back into a contiguous host array.
fn to_host<D>(&self, res: &Resources, host: &mut ndarray::ArrayRef<T, D>) -> ExampleResult<()>
where
D: ndarray::Dimension,
{
if !host.is_standard_layout() {
return Err("host array must be contiguous (row-major)".into());
}

let stream = res.get_cuda_stream()?;
check_cuda(unsafe {
cudaMemcpyAsync(
host.as_mut_ptr() as *mut c_void,
self.data,
self.bytes,
CUDA_MEMCPY_DEVICE_TO_HOST,
stream,
)
})?;
res.sync_stream()?;

Ok(())
}
}

impl<T: DType> Drop for CudaTensor<T> {
fn drop(&mut self) {
if !self.data.is_null() {
unsafe { cudaFree(self.data) };
}
}
}

impl<'a, T: DType> IntoDlTensor<'a> for &'a CudaTensor<T> {
fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
unsafe {
DLTensorView::from_raw_parts(
self.data,
DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
&self.shape,
None,
T::dl_dtype(),
)
}
}
}

impl<'a, T: DType> IntoDlTensorMut<'a> for &'a mut CudaTensor<T> {
fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
unsafe {
DLTensorViewMut::from_raw_parts(
self.data,
DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
&self.shape,
None,
T::dl_dtype(),
)
}
}
}

/// Example showing how to index and search data with CAGRA.
fn cagra_example() -> ExampleResult<()> {
let res = Resources::new()?;

// Create a new random dataset to index
// Create a new random dataset to index and copy it to the device.
let n_datapoints = 65536;
let n_features = 512;
let dataset =
ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
let dataset_host = ndarray::Array::<f32, _>::random(
(n_datapoints, n_features),
Uniform::new(0., 1.0).unwrap(),
);
let dataset = CudaTensor::from_host(&res, &dataset_host)?;

// build the cagra index
// Build the CAGRA index.
let build_params = IndexParams::new()?;
let index = Index::build(&res, &build_params, &dataset)?;
println!("Indexed {}x{} datapoints into cagra index", n_datapoints, n_features);
println!("Indexed {n_datapoints}x{n_features} datapoints into cagra index");

// use the first 4 points from the dataset as queries : will test that we get them back
// as their own nearest neighbor
// Use the first 4 points as queries; each should be its own nearest neighbor.
let n_queries = 4;
let queries = dataset.slice(s![0..n_queries, ..]);

let k = 10;
let queries_host = dataset_host.slice(s![0..n_queries, ..]).to_owned();
let queries = CudaTensor::from_host(&res, &queries_host)?;

// CAGRA search API requires queries and outputs to be on device memory
// copy query data over, and allocate new device memory for the distances/ neighbors
// outputs
let queries = ManagedTensor::from(&queries).to_device(&res)?;
let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;

let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
let mut neighbors = CudaTensor::<u32>::alloc(&[n_queries, k])?;
let mut distances = CudaTensor::<f32>::alloc(&[n_queries, k])?;

let search_params = SearchParams::new()?;
index.search(&res, &search_params, &queries, &mut neighbors, &mut distances)?;

index.search(&res, &search_params, &queries, &neighbors, &distances)?;

// Copy back to host memory
distances.to_host(&res, &mut distances_host)?;
// Copy the results back to the host.
let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
neighbors.to_host(&res, &mut neighbors_host)?;
distances.to_host(&res, &mut distances_host)?;

// nearest neighbors should be themselves, since queries are from the
// dataset
println!("Neighbors {:?}", neighbors_host);
println!("Distances {:?}", distances_host);
println!("Neighbors {neighbors_host:?}");
println!("Distances {distances_host:?}");
Ok(())
}

fn main() {
if let Err(e) = cagra_example() {
println!("Failed to run CAGRA: {:?}", e);
println!("Failed to run CAGRA: {e:?}");
}
}
Loading
Loading