NVIDIA · yan-zaretskiy · Jun 12, 2026 · Jun 15, 2026 · Jun 22, 2026
diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml
@@ -14,10 +14,12 @@ doc-only = ["cuvs-sys/doc-only"]
 
 [dependencies]
 cuvs-sys = { workspace = true }
-ndarray = "0.15"
+thiserror = "2"
+tinyvec = { version = "1", features = ["alloc"] }
 
 [dev-dependencies]
-ndarray-rand = "0.14"
+ndarray = "0.17"
+ndarray-rand = "0.16"
 
 [package.metadata.docs.rs]
 features = ["doc-only"]
diff --git a/rust/cuvs/examples/cagra.rs b/rust/cuvs/examples/cagra.rs
@@ -3,62 +3,220 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+//! CAGRA example with a user-provided GPU tensor.
+//!
+//! This demonstrates how to feed your own device memory into cuVS by
+//! implementing the public [`IntoDlTensor`]/[`IntoDlTensorMut`] traits. The
+//! [`CudaTensor`] type manages device memory directly through the CUDA runtime
+//! (`cudaMalloc`/`cudaFree`) and copies to/from host arrays with `cudaMemcpyAsync`
+//! on the cuVS stream, reusing the resources handle's `get_cuda_stream`/
+//! `sync_stream` for stream access and synchronization.
+//!
+//! A real application would likely rely on a helper crate such as `cudarc`
+//! and its `CudaSlice`.
+
+use std::ffi::c_void;
+use std::marker::PhantomData;
+use std::os::raw::c_int;
+
+use cuvs::Resources;
 use cuvs::cagra::{Index, IndexParams, SearchParams};
-use cuvs::{ManagedTensor, Resources, Result};
+use cuvs::dlpack::{
+    DLDevice, DLDeviceType, DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor,
+    IntoDlTensorMut,
+};
 
 use ndarray::s;
 use ndarray_rand::RandomExt;
 use ndarray_rand::rand_distr::Uniform;
 
-/// Example showing how to index and search data with CAGRA
-fn cagra_example() -> Result<()> {
+type ExampleResult<T> = std::result::Result<T, Box<dyn std::error::Error>>;
+
+// ---------------------------------------------------------------------------
+// Minimal CUDA runtime FFI
+// ---------------------------------------------------------------------------
+
+#[allow(non_camel_case_types)]
+type cudaError_t = c_int;
+const CUDA_SUCCESS: cudaError_t = 0;
+const CUDA_MEMCPY_HOST_TO_DEVICE: c_int = 1;
+const CUDA_MEMCPY_DEVICE_TO_HOST: c_int = 2;
+
+#[link(name = "cudart")]
+unsafe extern "C" {
+    fn cudaMalloc(ptr: *mut *mut c_void, size: usize) -> cudaError_t;
+    fn cudaFree(ptr: *mut c_void) -> cudaError_t;
+    fn cudaMemcpyAsync(
+        dst: *mut c_void,
+        src: *const c_void,
+        count: usize,
+        kind: c_int,
+        stream: cuvs_sys::cudaStream_t,
+    ) -> cudaError_t;
+}
+
+fn check_cuda(status: cudaError_t) -> ExampleResult<()> {
+    if status == CUDA_SUCCESS {
+        Ok(())
+    } else {
+        Err(format!("CUDA runtime error: {status}").into())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// A custom device tensor backed by the CUDA runtime
+// ---------------------------------------------------------------------------
+
+struct CudaTensor<T: DType> {
+    data: *mut c_void,
+    shape: Vec<i64>,
+    bytes: usize,
+    _marker: PhantomData<T>,
+}
+
+impl<T: DType> CudaTensor<T> {
+    /// Allocate an uninitialized device buffer (used for search outputs).
+    fn alloc(shape: &[usize]) -> ExampleResult<Self> {
+        let bytes = shape.iter().product::<usize>() * std::mem::size_of::<T>();
+        let mut data: *mut c_void = std::ptr::null_mut();
+        check_cuda(unsafe { cudaMalloc(&mut data, bytes) })?;
+        Ok(Self {
+            data,
+            shape: shape.iter().map(|&d| d as i64).collect(),
+            bytes,
+            _marker: PhantomData,
+        })
+    }
+
+    /// Copy a contiguous host array onto the device.
+    fn from_host<D>(res: &Resources, host: &ndarray::ArrayRef<T, D>) -> ExampleResult<Self>
+    where
+        D: ndarray::Dimension,
+    {
+        if !host.is_standard_layout() {
+            return Err("host array must be contiguous (row-major)".into());
+        }
+        let tensor = Self::alloc(host.shape())?;
+
+        let stream = res.get_cuda_stream()?;
+        check_cuda(unsafe {
+            cudaMemcpyAsync(
+                tensor.data,
+                host.as_ptr() as *const c_void,
+                tensor.bytes,
+                CUDA_MEMCPY_HOST_TO_DEVICE,
+                stream,
+            )
+        })?;
+        res.sync_stream()?;
+
+        Ok(tensor)
+    }
+
+    /// Copy the device buffer back into a contiguous host array.
+    fn to_host<D>(&self, res: &Resources, host: &mut ndarray::ArrayRef<T, D>) -> ExampleResult<()>
+    where
+        D: ndarray::Dimension,
+    {
+        if !host.is_standard_layout() {
+            return Err("host array must be contiguous (row-major)".into());
+        }
+
+        let stream = res.get_cuda_stream()?;
+        check_cuda(unsafe {
+            cudaMemcpyAsync(
+                host.as_mut_ptr() as *mut c_void,
+                self.data,
+                self.bytes,
+                CUDA_MEMCPY_DEVICE_TO_HOST,
+                stream,
+            )
+        })?;
+        res.sync_stream()?;
+
+        Ok(())
+    }
+}
+
+impl<T: DType> Drop for CudaTensor<T> {
+    fn drop(&mut self) {
+        if !self.data.is_null() {
+            unsafe { cudaFree(self.data) };
+        }
+    }
+}
+
+impl<'a, T: DType> IntoDlTensor<'a> for &'a CudaTensor<T> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        unsafe {
+            DLTensorView::from_raw_parts(
+                self.data,
+                DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+impl<'a, T: DType> IntoDlTensorMut<'a> for &'a mut CudaTensor<T> {
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
+        unsafe {
+            DLTensorViewMut::from_raw_parts(
+                self.data,
+                DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+/// Example showing how to index and search data with CAGRA.
+fn cagra_example() -> ExampleResult<()> {
     let res = Resources::new()?;
 
-    // Create a new random dataset to index
+    // Create a new random dataset to index and copy it to the device.
     let n_datapoints = 65536;
     let n_features = 512;
-    let dataset =
-        ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+    let dataset_host = ndarray::Array::<f32, _>::random(
+        (n_datapoints, n_features),
+        Uniform::new(0., 1.0).unwrap(),
+    );
+    let dataset = CudaTensor::from_host(&res, &dataset_host)?;
 
-    // build the cagra index
+    // Build the CAGRA index.
     let build_params = IndexParams::new()?;
     let index = Index::build(&res, &build_params, &dataset)?;
-    println!("Indexed {}x{} datapoints into cagra index", n_datapoints, n_features);
+    println!("Indexed {n_datapoints}x{n_features} datapoints into cagra index");
 
-    // use the first 4 points from the dataset as queries : will test that we get them back
-    // as their own nearest neighbor
+    // Use the first 4 points as queries; each should be its own nearest neighbor.
     let n_queries = 4;
-    let queries = dataset.slice(s![0..n_queries, ..]);
-
     let k = 10;
+    let queries_host = dataset_host.slice(s![0..n_queries, ..]).to_owned();
+    let queries = CudaTensor::from_host(&res, &queries_host)?;
 
-    // CAGRA search API requires queries and outputs to be on device memory
-    // copy query data over, and allocate new device memory for the distances/ neighbors
-    // outputs
-    let queries = ManagedTensor::from(&queries).to_device(&res)?;
-    let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-    let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;
-
-    let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-    let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
+    let mut neighbors = CudaTensor::<u32>::alloc(&[n_queries, k])?;
+    let mut distances = CudaTensor::<f32>::alloc(&[n_queries, k])?;
 
     let search_params = SearchParams::new()?;
+    index.search(&res, &search_params, &queries, &mut neighbors, &mut distances)?;
 
-    index.search(&res, &search_params, &queries, &neighbors, &distances)?;
-
-    // Copy back to host memory
-    distances.to_host(&res, &mut distances_host)?;
+    // Copy the results back to the host.
+    let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
+    let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
     neighbors.to_host(&res, &mut neighbors_host)?;
+    distances.to_host(&res, &mut distances_host)?;
 
-    // nearest neighbors should be themselves, since queries are from the
-    // dataset
-    println!("Neighbors {:?}", neighbors_host);
-    println!("Distances {:?}", distances_host);
+    println!("Neighbors {neighbors_host:?}");
+    println!("Distances {distances_host:?}");
     Ok(())
 }
 
 fn main() {
     if let Err(e) = cagra_example() {
-        println!("Failed to run CAGRA: {:?}", e);
+        println!("Failed to run CAGRA: {e:?}");
     }
 }