diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml
index 16a02c61b0..c66fa09993 100644
--- a/rust/cuvs/Cargo.toml
+++ b/rust/cuvs/Cargo.toml
@@ -14,10 +14,12 @@ doc-only = ["cuvs-sys/doc-only"]
 
 [dependencies]
 cuvs-sys = { workspace = true }
-ndarray = "0.15"
+thiserror = "2"
+tinyvec = { version = "1", features = ["alloc"] }
 
 [dev-dependencies]
-ndarray-rand = "0.14"
+ndarray = "0.17"
+ndarray-rand = "0.16"
 
 [package.metadata.docs.rs]
 features = ["doc-only"]
diff --git a/rust/cuvs/examples/cagra.rs b/rust/cuvs/examples/cagra.rs
index 2f0ee4e071..f9c2697f10 100644
--- a/rust/cuvs/examples/cagra.rs
+++ b/rust/cuvs/examples/cagra.rs
@@ -3,62 +3,220 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+//! CAGRA example with a user-provided GPU tensor.
+//!
+//! This demonstrates how to feed your own device memory into cuVS by
+//! implementing the public [`IntoDlTensor`]/[`IntoDlTensorMut`] traits. The
+//! [`CudaTensor`] type manages device memory directly through the CUDA runtime
+//! (`cudaMalloc`/`cudaFree`) and copies to/from host arrays with `cudaMemcpyAsync`
+//! on the cuVS stream, reusing the resources handle's `get_cuda_stream`/
+//! `sync_stream` for stream access and synchronization.
+//!
+//! A real application would likely rely on a helper crate such as `cudarc`
+//! and its `CudaSlice`.
+
+use std::ffi::c_void;
+use std::marker::PhantomData;
+use std::os::raw::c_int;
+
+use cuvs::Resources;
 use cuvs::cagra::{Index, IndexParams, SearchParams};
-use cuvs::{ManagedTensor, Resources, Result};
+use cuvs::dlpack::{
+    DLDevice, DLDeviceType, DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor,
+    IntoDlTensorMut,
+};
 
 use ndarray::s;
 use ndarray_rand::RandomExt;
 use ndarray_rand::rand_distr::Uniform;
 
-/// Example showing how to index and search data with CAGRA
-fn cagra_example() -> Result<()> {
+type ExampleResult<T> = std::result::Result<T, Box<dyn std::error::Error>>;
+
+// ---------------------------------------------------------------------------
+// Minimal CUDA runtime FFI
+// ---------------------------------------------------------------------------
+
+#[allow(non_camel_case_types)]
+type cudaError_t = c_int;
+const CUDA_SUCCESS: cudaError_t = 0;
+const CUDA_MEMCPY_HOST_TO_DEVICE: c_int = 1;
+const CUDA_MEMCPY_DEVICE_TO_HOST: c_int = 2;
+
+#[link(name = "cudart")]
+unsafe extern "C" {
+    fn cudaMalloc(ptr: *mut *mut c_void, size: usize) -> cudaError_t;
+    fn cudaFree(ptr: *mut c_void) -> cudaError_t;
+    fn cudaMemcpyAsync(
+        dst: *mut c_void,
+        src: *const c_void,
+        count: usize,
+        kind: c_int,
+        stream: cuvs_sys::cudaStream_t,
+    ) -> cudaError_t;
+}
+
+fn check_cuda(status: cudaError_t) -> ExampleResult<()> {
+    if status == CUDA_SUCCESS {
+        Ok(())
+    } else {
+        Err(format!("CUDA runtime error: {status}").into())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// A custom device tensor backed by the CUDA runtime
+// ---------------------------------------------------------------------------
+
+struct CudaTensor<T: DType> {
+    data: *mut c_void,
+    shape: Vec<i64>,
+    bytes: usize,
+    _marker: PhantomData<T>,
+}
+
+impl<T: DType> CudaTensor<T> {
+    /// Allocate an uninitialized device buffer (used for search outputs).
+    fn alloc(shape: &[usize]) -> ExampleResult<Self> {
+        let bytes = shape.iter().product::<usize>() * std::mem::size_of::<T>();
+        let mut data: *mut c_void = std::ptr::null_mut();
+        check_cuda(unsafe { cudaMalloc(&mut data, bytes) })?;
+        Ok(Self {
+            data,
+            shape: shape.iter().map(|&d| d as i64).collect(),
+            bytes,
+            _marker: PhantomData,
+        })
+    }
+
+    /// Copy a contiguous host array onto the device.
+    fn from_host<D>(res: &Resources, host: &ndarray::ArrayRef<T, D>) -> ExampleResult<Self>
+    where
+        D: ndarray::Dimension,
+    {
+        if !host.is_standard_layout() {
+            return Err("host array must be contiguous (row-major)".into());
+        }
+        let tensor = Self::alloc(host.shape())?;
+
+        let stream = res.get_cuda_stream()?;
+        check_cuda(unsafe {
+            cudaMemcpyAsync(
+                tensor.data,
+                host.as_ptr() as *const c_void,
+                tensor.bytes,
+                CUDA_MEMCPY_HOST_TO_DEVICE,
+                stream,
+            )
+        })?;
+        res.sync_stream()?;
+
+        Ok(tensor)
+    }
+
+    /// Copy the device buffer back into a contiguous host array.
+    fn to_host<D>(&self, res: &Resources, host: &mut ndarray::ArrayRef<T, D>) -> ExampleResult<()>
+    where
+        D: ndarray::Dimension,
+    {
+        if !host.is_standard_layout() {
+            return Err("host array must be contiguous (row-major)".into());
+        }
+
+        let stream = res.get_cuda_stream()?;
+        check_cuda(unsafe {
+            cudaMemcpyAsync(
+                host.as_mut_ptr() as *mut c_void,
+                self.data,
+                self.bytes,
+                CUDA_MEMCPY_DEVICE_TO_HOST,
+                stream,
+            )
+        })?;
+        res.sync_stream()?;
+
+        Ok(())
+    }
+}
+
+impl<T: DType> Drop for CudaTensor<T> {
+    fn drop(&mut self) {
+        if !self.data.is_null() {
+            unsafe { cudaFree(self.data) };
+        }
+    }
+}
+
+impl<'a, T: DType> IntoDlTensor<'a> for &'a CudaTensor<T> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        unsafe {
+            DLTensorView::from_raw_parts(
+                self.data,
+                DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+impl<'a, T: DType> IntoDlTensorMut<'a> for &'a mut CudaTensor<T> {
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
+        unsafe {
+            DLTensorViewMut::from_raw_parts(
+                self.data,
+                DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+/// Example showing how to index and search data with CAGRA.
+fn cagra_example() -> ExampleResult<()> {
     let res = Resources::new()?;
 
-    // Create a new random dataset to index
+    // Create a new random dataset to index and copy it to the device.
     let n_datapoints = 65536;
     let n_features = 512;
-    let dataset =
-        ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+    let dataset_host = ndarray::Array::<f32, _>::random(
+        (n_datapoints, n_features),
+        Uniform::new(0., 1.0).unwrap(),
+    );
+    let dataset = CudaTensor::from_host(&res, &dataset_host)?;
 
-    // build the cagra index
+    // Build the CAGRA index.
     let build_params = IndexParams::new()?;
     let index = Index::build(&res, &build_params, &dataset)?;
-    println!("Indexed {}x{} datapoints into cagra index", n_datapoints, n_features);
+    println!("Indexed {n_datapoints}x{n_features} datapoints into cagra index");
 
-    // use the first 4 points from the dataset as queries : will test that we get them back
-    // as their own nearest neighbor
+    // Use the first 4 points as queries; each should be its own nearest neighbor.
     let n_queries = 4;
-    let queries = dataset.slice(s![0..n_queries, ..]);
-
     let k = 10;
+    let queries_host = dataset_host.slice(s![0..n_queries, ..]).to_owned();
+    let queries = CudaTensor::from_host(&res, &queries_host)?;
 
-    // CAGRA search API requires queries and outputs to be on device memory
-    // copy query data over, and allocate new device memory for the distances/ neighbors
-    // outputs
-    let queries = ManagedTensor::from(&queries).to_device(&res)?;
-    let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-    let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;
-
-    let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-    let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
+    let mut neighbors = CudaTensor::<u32>::alloc(&[n_queries, k])?;
+    let mut distances = CudaTensor::<f32>::alloc(&[n_queries, k])?;
 
     let search_params = SearchParams::new()?;
+    index.search(&res, &search_params, &queries, &mut neighbors, &mut distances)?;
 
-    index.search(&res, &search_params, &queries, &neighbors, &distances)?;
-
-    // Copy back to host memory
-    distances.to_host(&res, &mut distances_host)?;
+    // Copy the results back to the host.
+    let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
+    let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
     neighbors.to_host(&res, &mut neighbors_host)?;
+    distances.to_host(&res, &mut distances_host)?;
 
-    // nearest neighbors should be themselves, since queries are from the
-    // dataset
-    println!("Neighbors {:?}", neighbors_host);
-    println!("Distances {:?}", distances_host);
+    println!("Neighbors {neighbors_host:?}");
+    println!("Distances {distances_host:?}");
     Ok(())
 }
 
 fn main() {
     if let Err(e) = cagra_example() {
-        println!("Failed to run CAGRA: {:?}", e);
+        println!("Failed to run CAGRA: {e:?}");
     }
 }
diff --git a/rust/cuvs/src/brute_force.rs b/rust/cuvs/src/brute_force.rs
index 413e8b0fb1..2b513c3627 100644
--- a/rust/cuvs/src/brute_force.rs
+++ b/rust/cuvs/src/brute_force.rs
@@ -2,94 +2,102 @@
  * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
-//! Brute Force KNN
+//! Brute-force (exact) k-NN.
+//!
+//! Build an [`Index`] over a dataset, then [`search`](Index::search) it with
+//! device-resident queries and output buffers. Tensors are passed through the
+//! [`IntoDlTensor`] /
+//! [`IntoDlTensorMut`] traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs`
+//! for the same build/search workflow.
 
 use std::io::{Write, stderr};
+use std::marker::PhantomData;
 
 use crate::distance_type::DistanceType;
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Result, check_cuvs};
 use crate::resources::Resources;
 
 /// Brute Force KNN Index
 #[derive(Debug)]
-pub struct Index {
+pub struct Index<'d> {
     inner: ffi::cuvsBruteForceIndex_t,
     // cuVS brute_force::index stores a non-owning view into the dataset.
-    // Keep the Rust tensor alive for as long as the C++ index may read it.
-    _dataset: Option<ManagedTensor>,
+    // Keep the Rust borrow alive for as long as the C++ index may read it.
+    _dataset: PhantomData<&'d ()>,
 }
 
-impl Index {
-    /// Builds a new Brute Force KNN Index from the dataset for efficient search.
+impl<'d> Index<'d> {
+    /// Builds a brute-force index over `dataset` for exact k-NN search.
     ///
-    /// # Arguments
-    ///
-    /// * `res` - Resources to use
-    /// * `metric` - DistanceType to use for building the index
-    /// * `metric_arg` - Optional value of `p` for Minkowski distances
-    /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    /// `metric` selects the distance and `metric_arg` is the optional `p` for
+    /// Minkowski distances (defaults to 2). `dataset` is a row-major matrix on
+    /// the host or device implementing [`IntoDlTensor`]; the
+    /// C++ index keeps a non-owning view of it, so the returned [`Index`] borrows
+    /// it for `'d` and cannot outlive it.
+    pub fn build(
         res: &Resources,
         metric: DistanceType,
         metric_arg: Option<f32>,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
-        let mut index = Index::new()?;
+        dataset: impl IntoDlTensor<'d>,
+    ) -> Result<Index<'d>> {
+        let dataset = dataset.into_dl_tensor()?;
+        let index = Index::new()?;
         unsafe {
             check_cuvs(ffi::cuvsBruteForceBuild(
                 res.0,
-                dataset.as_ptr(),
+                dataset.to_c().as_mut_ptr(),
                 metric,
                 metric_arg.unwrap_or(2.0),
                 index.inner,
             ))?;
         }
-        index._dataset = Some(dataset);
         Ok(index)
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
+    pub fn new() -> Result<Index<'d>> {
         unsafe {
             let mut index = std::mem::MaybeUninit::<ffi::cuvsBruteForceIndex_t>::uninit();
             check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index { inner: index.assume_init(), _dataset: None })
+            Ok(Index { inner: index.assume_init(), _dataset: PhantomData })
         }
     }
 
-    /// Perform a Nearest Neighbors search on the Index
-    ///
-    /// # Arguments
+    /// Searches the index for the `k` nearest neighbors of each query.
     ///
-    /// * `res` - Resources to use
-    /// * `queries` - A matrix in device memory to query for
-    /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
-    /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
-    pub fn search(
+    /// `queries`, `neighbors`, and `distances` must reside in device memory and
+    /// implement [`IntoDlTensor`] /
+    /// [`IntoDlTensorMut`]. `neighbors` receives the
+    /// neighbor indices and `distances` their distances; both are written in
+    /// place.
+    pub fn search<'a>(
         &self,
         res: &Resources,
-        queries: &ManagedTensor,
-        neighbors: &ManagedTensor,
-        distances: &ManagedTensor,
+        queries: impl IntoDlTensor<'a>,
+        neighbors: impl IntoDlTensorMut<'a>,
+        distances: impl IntoDlTensorMut<'a>,
     ) -> Result<()> {
+        let queries = queries.into_dl_tensor()?;
+        let neighbors = neighbors.into_dl_tensor_mut()?;
+        let distances = distances.into_dl_tensor_mut()?;
         unsafe {
             let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER };
 
             check_cuvs(ffi::cuvsBruteForceSearch(
                 res.0,
                 self.inner,
-                queries.as_ptr(),
-                neighbors.as_ptr(),
-                distances.as_ptr(),
+                queries.to_c().as_mut_ptr(),
+                neighbors.to_c().as_mut_ptr(),
+                distances.to_c().as_mut_ptr(),
                 prefilter,
             ))
         }
     }
 }
 
-impl Drop for Index {
+impl Drop for Index<'_> {
     fn drop(&mut self) {
         if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.inner) }) {
             write!(stderr(), "failed to call bruteForceIndexDestroy {:?}", e)
@@ -101,6 +109,7 @@ impl Drop for Index {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray::s;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
@@ -111,39 +120,41 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 16;
         let n_features = 8;
-        let dataset_host =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset_host = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset = ManagedTensor::from(&dataset_host).to_device(&res).unwrap();
+        let dataset = DeviceTensor::from_host(&res, &dataset_host).unwrap();
 
         println!("dataset {:#?}", dataset_host);
 
         // build the brute force index
         let index =
-            Index::build(&res, metric, None, dataset).expect("failed to create brute force index");
+            Index::build(&res, metric, None, &dataset).expect("failed to create brute force index");
 
         res.sync_stream().unwrap();
 
         // use the first 4 points from the dataset as queries : will test that we get them back
         // as their own nearest neighbor
         let n_queries = 4;
-        let queries = dataset_host.slice(s![0..n_queries, ..]);
+        let queries = dataset_host.slice(s![0..n_queries, ..]).to_owned();
 
         let k = 4;
 
         println!("queries! {:#?}", queries);
-        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let queries = DeviceTensor::from_host(&res, &queries).unwrap();
         let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
-        let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
+        let mut neighbors = DeviceTensor::<i64>::zeros(&res, &[n_queries, k]).unwrap();
 
         let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-        let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+        let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
-        index.search(&res, &queries, &neighbors, &distances).unwrap();
+        index.search(&res, &queries, &mut neighbors, &mut distances).unwrap();
 
         // Copy back to host memory
-        distances.to_host(&res, &mut distances_host).unwrap();
-        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+        distances.copy_to_host(&res, &mut distances_host).unwrap();
+        neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
         res.sync_stream().unwrap();
 
         println!("distances {:#?}", distances_host);
diff --git a/rust/cuvs/src/cagra/index.rs b/rust/cuvs/src/cagra/index.rs
index d69a4d5033..290cfaef63 100644
--- a/rust/cuvs/src/cagra/index.rs
+++ b/rust/cuvs/src/cagra/index.rs
@@ -5,16 +5,26 @@
 
 use std::ffi::CString;
 use std::io::{Write, stderr};
+use std::marker::PhantomData;
 use std::path::Path;
 
 use crate::cagra::{IndexParams, SearchParams};
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Error, Result, check_cuvs};
 use crate::resources::Resources;
 
-/// CAGRA ANN Index
+/// A CAGRA approximate nearest neighbor index.
+///
+/// The lifetime `'d` ties this index to the underlying dataset,
+/// passed at construction time. The C library may store a non-owning view
+/// of properly aligned device-resident data, so the dataset must outlive
+/// the index. When an index is deserialized from disk, the data is
+/// self-contained and its lifetime is `'static`.
 #[derive(Debug)]
-pub struct Index(ffi::cuvsCagraIndex_t);
+pub struct Index<'d> {
+    handle: ffi::cuvsCagraIndex_t,
+    _dataset: PhantomData<&'d ()>,
+}
 
 /// Convert a filesystem path into a `CString` suitable for the cuVS C API,
 /// returning `Error::InvalidArgument` instead of panicking for paths that are
@@ -27,63 +37,68 @@ fn path_to_cstring(path: &Path) -> Result<CString> {
         .map_err(|e| Error::InvalidArgument(format!("path contains an interior NUL byte: {e}")))
 }
 
-impl Index {
-    /// Builds a new Index from the dataset for efficient search.
-    ///
-    /// # Arguments
+impl<'d> Index<'d> {
+    /// Builds a CAGRA index over `dataset` for efficient search.
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters for building the index
-    /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    /// `dataset` is a row-major matrix on the host or device implementing
+    /// [`IntoDlTensor`](crate::IntoDlTensor). The C++ index keeps a non-owning
+    /// view of it, so the returned [`Index`] borrows `dataset` for `'d` and
+    /// cannot outlive it.
+    pub fn build(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
+        dataset: impl IntoDlTensor<'d>,
+    ) -> Result<Index<'d>> {
+        let dataset = dataset.into_dl_tensor()?;
         let index = Index::new()?;
         unsafe {
-            check_cuvs(ffi::cuvsCagraBuild(res.0, params.0, dataset.as_ptr(), index.0))?;
+            check_cuvs(ffi::cuvsCagraBuild(
+                res.0,
+                params.0,
+                dataset.to_c().as_mut_ptr(),
+                index.handle,
+            ))?;
         }
         Ok(index)
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
+    pub fn new() -> Result<Index<'d>> {
         unsafe {
             let mut index = std::mem::MaybeUninit::<ffi::cuvsCagraIndex_t>::uninit();
             check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index(index.assume_init()))
+            Ok(Index { handle: index.assume_init(), _dataset: PhantomData })
         }
     }
 
-    /// Perform a Approximate Nearest Neighbors search on the Index
-    ///
-    /// # Arguments
+    /// Searches the index for the `k` nearest neighbors of each query.
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters to use in searching the index
-    /// * `queries` - A matrix in device memory to query for
-    /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
-    /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
-    pub fn search(
+    /// `queries`, `neighbors`, and `distances` must reside in device memory and
+    /// implement [`IntoDlTensor`](crate::IntoDlTensor) /
+    /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` (shape
+    /// `n_queries × k`) receives the neighbor indices and `distances` their
+    /// distances; both are written in place.
+    pub fn search<'a>(
         &self,
         res: &Resources,
         params: &SearchParams,
-        queries: &ManagedTensor,
-        neighbors: &ManagedTensor,
-        distances: &ManagedTensor,
+        queries: impl IntoDlTensor<'a>,
+        neighbors: impl IntoDlTensorMut<'a>,
+        distances: impl IntoDlTensorMut<'a>,
     ) -> Result<()> {
+        let queries = queries.into_dl_tensor()?;
+        let neighbors = neighbors.into_dl_tensor_mut()?;
+        let distances = distances.into_dl_tensor_mut()?;
         unsafe {
             let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER };
 
             check_cuvs(ffi::cuvsCagraSearch(
                 res.0,
                 params.0,
-                self.0,
-                queries.as_ptr(),
-                neighbors.as_ptr(),
-                distances.as_ptr(),
+                self.handle,
+                queries.to_c().as_mut_ptr(),
+                neighbors.to_c().as_mut_ptr(),
+                distances.to_c().as_mut_ptr(),
                 prefilter,
             ))
         }
@@ -91,41 +106,43 @@ impl Index {
 
     /// Perform a filtered Approximate Nearest Neighbors search on the Index
     ///
-    /// Like [`search`](Self::search), but accepts a bitset filter to exclude
-    /// vectors during graph traversal. Filtered vectors are never visited,
-    /// giving better recall than post-filtering.
-    ///
-    /// # Arguments
+    /// Like [`search`](Self::search), but applies a bitset filter to exclude
+    /// vectors during graph traversal. Filtered vectors are never visited, which
+    /// gives better recall than post-filtering.
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters to use in searching the index
-    /// * `queries` - A matrix in device memory to query for
-    /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
-    /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
-    /// * `bitset` - A 1-D `uint32` device tensor with `ceil(n_rows / 32)` elements.
-    ///   Each bit corresponds to a dataset row: bit 1 = include, bit 0 = exclude.
-    pub fn search_with_filter(
+    /// `queries`, `neighbors`, and `distances` are as in [`search`](Self::search).
+    /// `bitset` is a 1-D `uint32` device tensor of `ceil(n_rows / 32)` elements,
+    /// where each bit maps to a dataset row (1 = include, 0 = exclude).
+    pub fn search_with_filter<'a>(
         &self,
         res: &Resources,
         params: &SearchParams,
-        queries: &ManagedTensor,
-        neighbors: &ManagedTensor,
-        distances: &ManagedTensor,
-        bitset: &ManagedTensor,
+        queries: impl IntoDlTensor<'a>,
+        neighbors: impl IntoDlTensorMut<'a>,
+        distances: impl IntoDlTensorMut<'a>,
+        bitset: impl IntoDlTensor<'a>,
     ) -> Result<()> {
+        let queries = queries.into_dl_tensor()?;
+        let neighbors = neighbors.into_dl_tensor_mut()?;
+        let distances = distances.into_dl_tensor_mut()?;
+        let bitset = bitset.into_dl_tensor()?;
+        // The bitset pointer is cast to `usize` and stored in `prefilter`, then read
+        // by the search call, so its `ManagedTensorRef` must outlive both.
+        // Hence we keep it bound instead of chaining `to_c().as_mut_ptr()`.
+        let mut bitset_c = bitset.to_c();
         unsafe {
             let prefilter = ffi::cuvsFilter {
-                addr: bitset.as_ptr() as usize,
+                addr: bitset_c.as_mut_ptr() as usize,
                 type_: ffi::cuvsFilterType::BITSET,
             };
 
             check_cuvs(ffi::cuvsCagraSearch(
                 res.0,
                 params.0,
-                self.0,
-                queries.as_ptr(),
-                neighbors.as_ptr(),
-                distances.as_ptr(),
+                self.handle,
+                queries.to_c().as_mut_ptr(),
+                neighbors.to_c().as_mut_ptr(),
+                distances.to_c().as_mut_ptr(),
                 prefilter,
             ))
         }
@@ -140,6 +157,29 @@ impl Index {
     /// * `res` - Resources to use
     /// * `filename` - The file path for saving the index
     /// * `include_dataset` - Whether to write out the dataset to the file
+    ///
+    /// # Example:
+    /// ```no_run
+    /// use cuvs::cagra::{Index, IndexParams};
+    /// use cuvs::{Resources, Result};
+    ///
+    /// fn serialize_example() -> Result<()> {
+    ///     let res = Resources::new()?;
+    ///
+    ///     // Build an index (using some dataset)
+    ///     let build_params = IndexParams::new()?;
+    ///     // let index = Index::build(&res, &build_params, &dataset)?;
+    ///
+    ///     // Save the index to disk (including the dataset)
+    ///     // index.serialize(&res, "/path/to/index.bin", true)?;
+    ///
+    ///     // Later, load the index from disk
+    ///     let loaded_index = Index::deserialize(&res, "/path/to/index.bin")?;
+    ///
+    ///     // The loaded index can be used for search just like the original
+    ///     Ok(())
+    /// }
+    /// ```
     pub fn serialize<P: AsRef<Path>>(
         &self,
         res: &Resources,
@@ -148,7 +188,12 @@ impl Index {
     ) -> Result<()> {
         let c_filename = path_to_cstring(filename.as_ref())?;
         unsafe {
-            check_cuvs(ffi::cuvsCagraSerialize(res.0, c_filename.as_ptr(), self.0, include_dataset))
+            check_cuvs(ffi::cuvsCagraSerialize(
+                res.0,
+                c_filename.as_ptr(),
+                self.handle,
+                include_dataset,
+            ))
         }
     }
 
@@ -165,7 +210,9 @@ impl Index {
     /// * `filename` - The file path for saving the index
     pub fn serialize_to_hnswlib<P: AsRef<Path>>(&self, res: &Resources, filename: P) -> Result<()> {
         let c_filename = path_to_cstring(filename.as_ref())?;
-        unsafe { check_cuvs(ffi::cuvsCagraSerializeToHnswlib(res.0, c_filename.as_ptr(), self.0)) }
+        unsafe {
+            check_cuvs(ffi::cuvsCagraSerializeToHnswlib(res.0, c_filename.as_ptr(), self.handle))
+        }
     }
 
     /// Load a CAGRA index from file.
@@ -176,19 +223,19 @@ impl Index {
     ///
     /// * `res` - Resources to use
     /// * `filename` - The path of the file that stores the index
-    pub fn deserialize<P: AsRef<Path>>(res: &Resources, filename: P) -> Result<Index> {
+    pub fn deserialize<P: AsRef<Path>>(res: &Resources, filename: P) -> Result<Index<'static>> {
         let c_filename = path_to_cstring(filename.as_ref())?;
         let index = Index::new()?;
         unsafe {
-            check_cuvs(ffi::cuvsCagraDeserialize(res.0, c_filename.as_ptr(), index.0))?;
+            check_cuvs(ffi::cuvsCagraDeserialize(res.0, c_filename.as_ptr(), index.handle))?;
         }
         Ok(index)
     }
 }
 
-impl Drop for Index {
+impl Drop for Index<'_> {
     fn drop(&mut self) {
-        if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.0) }) {
+        if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.handle) }) {
             write!(stderr(), "failed to call cagraIndexDestroy {:?}", e)
                 .expect("failed to write to stderr");
         }
@@ -198,6 +245,7 @@ impl Drop for Index {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray::s;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
@@ -205,41 +253,32 @@ mod tests {
     const N_DATAPOINTS: usize = 256;
     const N_FEATURES: usize = 16;
 
-    /// Build a small random dataset and a CAGRA index over it.
-    fn build_test_index(
-        res: &Resources,
-        build_params: &IndexParams,
-    ) -> (ndarray::Array2<f32>, Index) {
-        let dataset =
-            ndarray::Array::<f32, _>::random((N_DATAPOINTS, N_FEATURES), Uniform::new(0., 1.0));
-        let index = Index::build(res, build_params, &dataset).expect("failed to build cagra index");
-        (dataset, index)
-    }
-
     /// Search the first `n_queries` rows of `dataset` against `index` and
     /// assert each query finds itself as the top-1 neighbor. CAGRA search
     /// requires queries and outputs to live in device memory.
     fn search_and_verify_self_neighbors(
         res: &Resources,
-        index: &Index,
+        index: &Index<'_>,
         dataset: &ndarray::Array2<f32>,
         n_queries: usize,
         k: usize,
     ) {
         let queries = dataset.slice(s![0..n_queries, ..]);
-        let queries = ManagedTensor::from(&queries).to_device(res).unwrap();
+        let queries = DeviceTensor::from_host(res, &queries.to_owned()).unwrap();
 
         let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-        let neighbors = ManagedTensor::from(&neighbors_host).to_device(res).unwrap();
+        let mut neighbors = DeviceTensor::<u32>::zeros(res, &[n_queries, k]).unwrap();
 
         let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-        let distances = ManagedTensor::from(&distances_host).to_device(res).unwrap();
+        let mut distances = DeviceTensor::<f32>::zeros(res, &[n_queries, k]).unwrap();
 
         let search_params = SearchParams::new().unwrap();
-        index.search(res, &search_params, &queries, &neighbors, &distances).expect("search failed");
+        index
+            .search(res, &search_params, &queries, &mut neighbors, &mut distances)
+            .expect("search failed");
 
-        distances.to_host(res, &mut distances_host).unwrap();
-        neighbors.to_host(res, &mut neighbors_host).unwrap();
+        distances.copy_to_host(res, &mut distances_host).unwrap();
+        neighbors.copy_to_host(res, &mut neighbors_host).unwrap();
 
         for i in 0..n_queries {
             assert_eq!(
@@ -252,7 +291,12 @@ mod tests {
 
     fn test_cagra(build_params: IndexParams) {
         let res = Resources::new().unwrap();
-        let (dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
         search_and_verify_self_neighbors(&res, &index, &dataset, 4, 10);
     }
 
@@ -278,11 +322,13 @@ mod tests {
 
         let n_datapoints = 256;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
         let index =
-            Index::build(&res, &build_params, &dataset).expect("failed to create cagra index");
+            Index::build(&res, &build_params, &*dataset).expect("failed to create cagra index");
 
         // Build a bitset that includes only even-indexed rows
         let n_words = (n_datapoints + 31) / 32;
@@ -292,26 +338,32 @@ mod tests {
                 bitset_host[i / 32] |= 1u32 << (i % 32);
             }
         }
-        let bitset = ManagedTensor::from(&bitset_host).to_device(&res).unwrap();
+        let bitset = DeviceTensor::from_host(&res, &bitset_host).unwrap();
 
         // Query with the first 4 even-indexed rows
         let n_queries = 4;
-        let queries = dataset.slice(s![0..n_queries * 2;2, ..]); // rows 0, 2, 4, 6
-        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let queries = dataset.slice(s![0..n_queries * 2;2, ..]).to_owned(); // rows 0, 2, 4, 6
+        let queries = DeviceTensor::from_host(&res, &queries).unwrap();
 
         let k = 10;
         let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-        let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
-        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-        let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+        let mut neighbors = DeviceTensor::<u32>::zeros(&res, &[n_queries, k]).unwrap();
+        let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
         let search_params = SearchParams::new().unwrap();
 
         index
-            .search_with_filter(&res, &search_params, &queries, &neighbors, &distances, &bitset)
+            .search_with_filter(
+                &res,
+                &search_params,
+                &queries,
+                &mut neighbors,
+                &mut distances,
+                &bitset,
+            )
             .unwrap();
 
-        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+        neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
 
         // All returned neighbors must be even-indexed (odd rows are filtered out).
         for q in 0..n_queries {
@@ -335,7 +387,12 @@ mod tests {
     fn test_cagra_multiple_searches() {
         let res = Resources::new().unwrap();
         let build_params = IndexParams::new().unwrap();
-        let (dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
 
         for _ in 0..3 {
             search_and_verify_self_neighbors(&res, &index, &dataset, 4, 5);
@@ -346,7 +403,12 @@ mod tests {
     fn test_cagra_serialize_deserialize() {
         let res = Resources::new().unwrap();
         let build_params = IndexParams::new().unwrap();
-        let (dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
 
         let filepath = std::env::temp_dir().join("test_cagra_index.bin");
         index.serialize(&res, &filepath, true).expect("failed to serialize cagra index");
@@ -371,7 +433,12 @@ mod tests {
     fn test_cagra_serialize_without_dataset() {
         let res = Resources::new().unwrap();
         let build_params = IndexParams::new().unwrap();
-        let (_dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
 
         let filepath = std::env::temp_dir().join("test_cagra_index_no_dataset.bin");
         index
@@ -387,7 +454,12 @@ mod tests {
     fn test_cagra_serialize_to_hnswlib() {
         let res = Resources::new().unwrap();
         let build_params = IndexParams::new().unwrap();
-        let (_dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
 
         let filepath = std::env::temp_dir().join("test_cagra_index_hnsw.bin");
         index
@@ -409,7 +481,12 @@ mod tests {
     fn test_cagra_serialize_rejects_interior_nul() {
         let res = Resources::new().unwrap();
         let build_params = IndexParams::new().unwrap();
-        let (_dataset, index) = build_test_index(&res, &build_params);
+        let dataset = ndarray::Array::<f32, _>::random(
+            (N_DATAPOINTS, N_FEATURES),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let index =
+            Index::build(&res, &build_params, &*dataset).expect("failed to build cagra index");
 
         // `PathBuf::from` on Unix preserves arbitrary bytes, so we can embed a
         // NUL byte in the path and confirm the helper rejects it.
diff --git a/rust/cuvs/src/cagra/mod.rs b/rust/cuvs/src/cagra/mod.rs
index 9043b17386..a50d132840 100644
--- a/rust/cuvs/src/cagra/mod.rs
+++ b/rust/cuvs/src/cagra/mod.rs
@@ -3,91 +3,15 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-//! CAGRA is a graph-based nearest neighbors implementation with state-of-the art
-//! query performance for both small- and large-batch sized search.
-//!
-//! Example:
-//! ```
-//!
-//! use cuvs::cagra::{Index, IndexParams, SearchParams};
-//! use cuvs::{ManagedTensor, Resources, Result};
-//!
-//! use ndarray::s;
-//! use ndarray_rand::rand_distr::Uniform;
-//! use ndarray_rand::RandomExt;
-//!
-//! fn cagra_example() -> Result<()> {
-//!     let res = Resources::new()?;
-//!
-//!     // Create a new random dataset to index
-//!     let n_datapoints = 65536;
-//!     let n_features = 512;
-//!     let dataset =
-//!         ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-//!
-//!     // build the cagra index
-//!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
-//!     println!(
-//!         "Indexed {}x{} datapoints into cagra index",
-//!         n_datapoints, n_features
-//!     );
-//!
-//!     // use the first 4 points from the dataset as queries : will test that we get them back
-//!     // as their own nearest neighbor
-//!     let n_queries = 4;
-//!     let queries = dataset.slice(s![0..n_queries, ..]);
-//!
-//!     let k = 10;
-//!
-//!     // CAGRA search API requires queries and outputs to be on device memory
-//!     // copy query data over, and allocate new device memory for the distances/ neighbors
-//!     // outputs
-//!     let queries = ManagedTensor::from(&queries).to_device(&res)?;
-//!     let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-//!     let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;
-//!
-//!     let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-//!     let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
-//!
-//!     let search_params = SearchParams::new()?;
-//!
-//!     index.search(&res, &search_params, &queries, &neighbors, &distances)?;
-//!
-//!     // Copy back to host memory
-//!     distances.to_host(&res, &mut distances_host)?;
-//!     neighbors.to_host(&res, &mut neighbors_host)?;
-//!
-//!     // nearest neighbors should be themselves, since queries are from the
-//!     // dataset
-//!     println!("Neighbors {:?}", neighbors_host);
-//!     println!("Distances {:?}", distances_host);
-//!     Ok(())
-//! }
-//! ```
-//!
-//! Serialization example:
-//! ```no_run
-//! use cuvs::cagra::{Index, IndexParams};
-//! use cuvs::{Resources, Result};
-//!
-//! fn serialize_example() -> Result<()> {
-//!     let res = Resources::new()?;
-//!
-//!     // Build an index (using some dataset)
-//!     let build_params = IndexParams::new()?;
-//!     // let index = Index::build(&res, &build_params, &dataset)?;
-//!
-//!     // Save the index to disk (including the dataset)
-//!     // index.serialize(&res, "/path/to/index.bin", true)?;
-//!
-//!     // Later, load the index from disk
-//!     let loaded_index = Index::deserialize(&res, "/path/to/index.bin")?;
-//!
-//!     // The loaded index can be used for search just like the original
-//!     Ok(())
-//! }
-//! ```
+//! CAGRA: a graph-based approximate nearest neighbors algorithm with
+//! state-of-the-art query throughput for both small and large batch sizes.
+//!
+//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with
+//! device-resident queries and output buffers. Tensors are passed through the
+//! [`IntoDlTensor`](crate::IntoDlTensor) /
+//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs`
+//! for a complete, runnable example.
 
 mod index;
 mod index_params;
diff --git a/rust/cuvs/src/cluster/kmeans/mod.rs b/rust/cuvs/src/cluster/kmeans/mod.rs
index 5015f49f45..cf1834e9a2 100644
--- a/rust/cuvs/src/cluster/kmeans/mod.rs
+++ b/rust/cuvs/src/cluster/kmeans/mod.rs
@@ -3,77 +3,53 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-//! Kmeans clustering API's
+//! K-means clustering.
 //!
-//! Example:
-//! ```
-//!
-//! use cuvs::cluster::kmeans;
-//! use cuvs::{ManagedTensor, Resources, Result};
-//!
-//! use ndarray_rand::rand_distr::Uniform;
-//! use ndarray_rand::RandomExt;
-//!
-//! fn kmeans_example() -> Result<()> {
-//!     let res = Resources::new()?;
-//!
-//!     // Create a new random dataset to index
-//!     let n_datapoints = 65536;
-//!     let n_features = 512;
-//!     let n_clusters = 8;
-//!     let dataset =
-//!         ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-//!     let dataset = ManagedTensor::from(&dataset).to_device(&res)?;
-//!
-//!     let centroids_host = ndarray::Array::<f32, _>::zeros((n_clusters, n_features));
-//!     let mut centroids = ManagedTensor::from(&centroids_host).to_device(&res)?;
-//!
-//!     // find the centroids with the kmeans index
-//!     let kmeans_params = kmeans::Params::new()?.set_n_clusters(n_clusters as i32);
-//!     let (inertia, n_iter) = kmeans::fit(&res, &kmeans_params, &dataset, &None, &mut centroids)?;
-//!     Ok(())
-//! }
-//! ```
+//! [`fit`] computes cluster centroids for a dataset, [`predict`] assigns points
+//! to clusters, and [`cluster_cost`] reports the inertia. All inputs and outputs
+//! reside in device memory and are passed through the
+//! [`IntoDlTensor`] /
+//! [`IntoDlTensorMut`] traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model.
 
 mod params;
 
 pub use params::Params;
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Result, check_cuvs};
 use crate::resources::Resources;
 
-/// Find clusters with the k-means algorithm
+/// Fits k-means centroids to `x`, returning `(inertia, n_iterations)`.
 ///
-/// # Arguments
-///
-/// * `res` - Resources to use
-/// * `params` - Parameters to use to fit KMeans model
-/// * `x` - A matrix in device memory - shape (m, k)
-/// * `sample_weight` - Optional device matrix shape (n_clusters, 1)
-/// * `centroids` - Output device matrix, that has the centroids for each cluster
-///   shape (n_clusters, k)
-pub fn fit(
+/// `x` (shape `m × k`) is the input matrix and `centroids` (shape
+/// `n_clusters × k`) receives the fitted centroids; `sample_weight` is an
+/// optional per-sample weight. All reside in device memory and implement
+/// [`IntoDlTensor`] /
+/// [`IntoDlTensorMut`].
+pub fn fit<'a>(
     res: &Resources,
     params: &Params,
-    x: &ManagedTensor,
-    sample_weight: &Option<ManagedTensor>,
-    centroids: &mut ManagedTensor,
+    x: impl IntoDlTensor<'a>,
+    sample_weight: Option<impl IntoDlTensor<'a>>,
+    centroids: impl IntoDlTensorMut<'a>,
 ) -> Result<(f64, i32)> {
+    let x = x.into_dl_tensor()?;
+    let sample_weight = sample_weight.map(|w| w.into_dl_tensor()).transpose()?;
+    let centroids = centroids.into_dl_tensor_mut()?;
     let mut inertia: f64 = 0.0;
     let mut niter: i32 = 0;
+    let mut sample_weight_c = sample_weight.as_ref().map(|w| w.to_c());
+    let sample_weight_ptr =
+        sample_weight_c.as_mut().map(|w| w.as_mut_ptr()).unwrap_or(std::ptr::null_mut());
 
     unsafe {
-        let sample_weight_dlpack = match sample_weight {
-            Some(tensor) => tensor.as_ptr(),
-            None => std::ptr::null_mut(),
-        };
         check_cuvs(ffi::cuvsKMeansFit(
             res.0,
             params.0,
-            x.as_ptr(),
-            sample_weight_dlpack,
-            centroids.as_ptr(),
+            x.to_c().as_mut_ptr(),
+            sample_weight_ptr,
+            centroids.to_c().as_mut_ptr(),
             &mut inertia as *mut f64,
             &mut niter as *mut i32,
         ))?;
@@ -81,40 +57,40 @@ pub fn fit(
     Ok((inertia, niter))
 }
 
-/// Predict clusters with the k-means algorithm
-///
-/// # Arguments
+/// Assigns each row of `x` to its nearest centroid, writing cluster labels into
+/// `labels` and returning the inertia.
 ///
-/// * `res` - Resources to use
-/// * `params` - Parameters to use to fit KMeans model
-/// * `x` - Input matrix in device memory - shape (m, k)
-/// * `sample_weight` - Optional device matrix shape (n_clusters, 1)
-/// * `centroids` - Centroids calculated by fit in device memory, shape (n_clusters, k)
-/// * `labels` - preallocated CUDA array interface matrix shape (m, 1) to hold the output labels
-/// * `normalize_weight` - whether or not to normalize the weights
-pub fn predict(
+/// `x` (shape `m × k`), `centroids` (shape `n_clusters × k`), the optional
+/// `sample_weight`, and `labels` (shape `m × 1`) reside in device memory and
+/// implement [`IntoDlTensor`] /
+/// [`IntoDlTensorMut`]. `normalize_weight` selects
+/// whether the sample weights are normalized.
+pub fn predict<'a>(
     res: &Resources,
     params: &Params,
-    x: &ManagedTensor,
-    sample_weight: &Option<ManagedTensor>,
-    centroids: &ManagedTensor,
-    labels: &mut ManagedTensor,
+    x: impl IntoDlTensor<'a>,
+    sample_weight: Option<impl IntoDlTensor<'a>>,
+    centroids: impl IntoDlTensor<'a>,
+    labels: impl IntoDlTensorMut<'a>,
     normalize_weight: bool,
 ) -> Result<f64> {
+    let x = x.into_dl_tensor()?;
+    let sample_weight = sample_weight.map(|w| w.into_dl_tensor()).transpose()?;
+    let centroids = centroids.into_dl_tensor()?;
+    let labels = labels.into_dl_tensor_mut()?;
     let mut inertia: f64 = 0.0;
+    let mut sample_weight_c = sample_weight.as_ref().map(|w| w.to_c());
+    let sample_weight_ptr =
+        sample_weight_c.as_mut().map(|w| w.as_mut_ptr()).unwrap_or(std::ptr::null_mut());
 
     unsafe {
-        let sample_weight_dlpack = match sample_weight {
-            Some(tensor) => tensor.as_ptr(),
-            None => std::ptr::null_mut(),
-        };
         check_cuvs(ffi::cuvsKMeansPredict(
             res.0,
             params.0,
-            x.as_ptr(),
-            sample_weight_dlpack,
-            centroids.as_ptr(),
-            labels.as_ptr(),
+            x.to_c().as_mut_ptr(),
+            sample_weight_ptr,
+            centroids.to_c().as_mut_ptr(),
+            labels.to_c().as_mut_ptr(),
             normalize_weight,
             &mut inertia as *mut f64,
         ))?;
@@ -122,20 +98,24 @@ pub fn predict(
     Ok(inertia)
 }
 
-/// Compute cluster cost given an input matrix and existing centroids
-/// # Arguments
+/// Computes the k-means cost (inertia) of `x` against existing `centroids`.
 ///
-/// * `res` - Resources to use
-/// * `x` - Input matrix in device memory - shape (m, k)
-/// * `centroids` - Centroids calculated by fit in device memory, shape (n_clusters, k)
-pub fn cluster_cost(res: &Resources, x: &ManagedTensor, centroids: &ManagedTensor) -> Result<f64> {
+/// `x` (shape `m × k`) and `centroids` (shape `n_clusters × k`) reside in device
+/// memory and implement [`IntoDlTensor`].
+pub fn cluster_cost<'a>(
+    res: &Resources,
+    x: impl IntoDlTensor<'a>,
+    centroids: impl IntoDlTensor<'a>,
+) -> Result<f64> {
+    let x = x.into_dl_tensor()?;
+    let centroids = centroids.into_dl_tensor()?;
     let mut inertia: f64 = 0.0;
 
     unsafe {
         check_cuvs(ffi::cuvsKMeansClusterCost(
             res.0,
-            x.as_ptr(),
-            centroids.as_ptr(),
+            x.to_c().as_mut_ptr(),
+            centroids.to_c().as_mut_ptr(),
             &mut inertia as *mut f64,
         ))?;
     }
@@ -145,6 +125,7 @@ pub fn cluster_cost(res: &Resources, x: &ManagedTensor, centroids: &ManagedTenso
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
 
@@ -157,12 +138,14 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 256;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-        let dataset = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_host = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let dataset = DeviceTensor::from_host(&res, &dataset_host).unwrap();
 
         let centroids_host = ndarray::Array::<f32, _>::zeros((n_clusters, n_features));
-        let mut centroids = ManagedTensor::from(&centroids_host).to_device(&res).unwrap();
+        let mut centroids = DeviceTensor::from_host(&res, &centroids_host).unwrap();
 
         let params = Params::new().unwrap().set_n_clusters(n_clusters as i32);
 
@@ -170,18 +153,28 @@ mod tests {
         let original_inertia = cluster_cost(&res, &dataset, &centroids).unwrap();
 
         // fit the centroids, make sure that inertia has gone down
-        let (inertia, n_iter) = fit(&res, &params, &dataset, &None, &mut centroids).unwrap();
+        let (inertia, n_iter) =
+            fit(&res, &params, &dataset, None::<&DeviceTensor<'_, f32>>, &mut centroids).unwrap();
 
         assert!(inertia < original_inertia);
         assert!(n_iter >= 1);
 
         let mut labels_host = ndarray::Array::<i32, _>::zeros((n_clusters,));
-        let mut labels = ManagedTensor::from(&labels_host).to_device(&res).unwrap();
+        let mut labels = DeviceTensor::<i32>::zeros(&res, &[n_clusters]).unwrap();
 
         // make sure the prediction for each centroid is the centroid itself
-        predict(&res, &params, &centroids, &None, &centroids, &mut labels, false).unwrap();
-
-        labels.to_host(&res, &mut labels_host).unwrap();
+        predict(
+            &res,
+            &params,
+            &centroids,
+            None::<&DeviceTensor<'_, f32>>,
+            &centroids,
+            &mut labels,
+            false,
+        )
+        .unwrap();
+
+        labels.copy_to_host(&res, &mut labels_host).unwrap();
         assert_eq!(labels_host[[0,]], 0);
         assert_eq!(labels_host[[1,]], 1);
         assert_eq!(labels_host[[2,]], 2);
diff --git a/rust/cuvs/src/distance/mod.rs b/rust/cuvs/src/distance/mod.rs
index 36a5850905..70eca4a753 100644
--- a/rust/cuvs/src/distance/mod.rs
+++ b/rust/cuvs/src/distance/mod.rs
@@ -3,35 +3,43 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+//! Pairwise distance computation.
+//!
+//! [`pairwise_distance`] computes all pairwise distances between two device
+//! matrices. Inputs and output are passed through the
+//! [`IntoDlTensor`] /
+//! [`IntoDlTensorMut`] traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model.
+
 use crate::distance_type::DistanceType;
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Result, check_cuvs};
 use crate::resources::Resources;
 
-/// Compute pairwise distances between X and Y
-///
-/// # Arguments
+/// Computes all pairwise distances between the rows of `x` (shape `m × k`) and
+/// `y` (shape `n × k`), writing the `m × n` result into `distances`.
 ///
-/// * `res` - Resources to use
-/// * `x` - A matrix in device memory - shape (m, k)
-/// * `y` - A matrix in device memory - shape (n, k)
-/// * `distances` - A matrix in device memory that receives the output distances - shape (m, n)
-/// * `metric` - DistanceType to use for building the index
-/// * `metric_arg` - Optional value of `p` for Minkowski distances
-pub fn pairwise_distance(
+/// `x`, `y`, and `distances` reside in device memory and implement
+/// [`IntoDlTensor`] /
+/// [`IntoDlTensorMut`]. `metric` selects the distance;
+/// `metric_arg` is the optional `p` for Minkowski distances (defaults to 2).
+pub fn pairwise_distance<'a>(
     res: &Resources,
-    x: &ManagedTensor,
-    y: &ManagedTensor,
-    distances: &ManagedTensor,
+    x: impl IntoDlTensor<'a>,
+    y: impl IntoDlTensor<'a>,
+    distances: impl IntoDlTensorMut<'a>,
     metric: DistanceType,
     metric_arg: Option<f32>,
 ) -> Result<()> {
+    let x = x.into_dl_tensor()?;
+    let y = y.into_dl_tensor()?;
+    let distances = distances.into_dl_tensor_mut()?;
     unsafe {
         check_cuvs(ffi::cuvsPairwiseDistance(
             res.0,
-            x.as_ptr(),
-            y.as_ptr(),
-            distances.as_ptr(),
+            x.to_c().as_mut_ptr(),
+            y.to_c().as_mut_ptr(),
+            distances.to_c().as_mut_ptr(),
             metric,
             metric_arg.unwrap_or(2.0),
         ))
@@ -41,6 +49,7 @@ pub fn pairwise_distance(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
 
@@ -51,25 +60,28 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 256;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         let mut distances_host = ndarray::Array::<f32, _>::zeros((n_datapoints, n_datapoints));
-        let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+        let mut distances =
+            DeviceTensor::<f32>::zeros(&res, &[n_datapoints, n_datapoints]).unwrap();
 
         pairwise_distance(
             &res,
             &dataset_device,
             &dataset_device,
-            &distances,
+            &mut distances,
             DistanceType::L2Expanded,
             None,
         )
         .unwrap();
 
         // Copy back to host memory
-        distances.to_host(&res, &mut distances_host).unwrap();
+        distances.copy_to_host(&res, &mut distances_host).unwrap();
 
         // Self distance should be 0
         assert_eq!(distances_host[[0, 0]], 0.0);
diff --git a/rust/cuvs/src/dlpack.rs b/rust/cuvs/src/dlpack.rs
index 1687f88d17..310e8d28de 100644
--- a/rust/cuvs/src/dlpack.rs
+++ b/rust/cuvs/src/dlpack.rs
@@ -3,151 +3,338 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-use std::convert::From;
+//! DLPack tensor interop.
+//!
+//! cuVS exchanges tensors with the C library through the DLPack ABI. This crate
+//! never owns tensor storage: every entry point accepts a value that converts
+//! into a borrowed view through the [`IntoDlTensor`] / [`IntoDlTensorMut`]
+//! traits:
+//!
+//! * [`DLTensorView`] — a read-only view, for inputs the C API only reads
+//!   (datasets, queries).
+//! * [`DLTensorViewMut`] — a writable view, for outputs the C API writes
+//!   (neighbors, distances).
+//!
+//! A view is non-owning. Its lifetime is tied to the Rust value that owns the
+//! underlying buffer, and it materializes a stack-local [`DLManagedTensor`] only
+//! for the duration of each FFI call.
+//!
+//! The crate ships no public tensor type. To hand your own GPU (or host) buffer
+//! to cuVS, implement [`IntoDlTensor`] / [`IntoDlTensorMut`] for it on top of
+//! [`DLTensorView::from_raw_parts`]. Most algorithms require search inputs and
+//! outputs to live in device memory. See `examples/cagra.rs` for a complete,
+//! runnable adapter built on the raw CUDA runtime.
 
-use crate::error::{Result, check_cuvs};
-use crate::resources::Resources;
+use std::fmt;
+use std::marker::PhantomData;
+use std::ops::Deref;
 
-/// ManagedTensor is a wrapper around a dlpack DLManagedTensor object.
-/// This lets you pass matrices in device or host memory into cuvs.
-#[derive(Debug)]
-pub struct ManagedTensor(ffi::DLManagedTensor);
+use tinyvec::TinyVec;
 
-pub trait IntoDtype {
-    fn ffi_dtype() -> ffi::DLDataType;
-}
+pub use ffi::{DLDataType, DLDataTypeCode, DLDevice, DLDeviceType, DLManagedTensor, DLTensor};
 
-impl ManagedTensor {
-    pub fn as_ptr(&self) -> *mut ffi::DLManagedTensor {
-        &self.0 as *const _ as *mut _
-    }
+/// Number of dimensions kept inline before [`TensorDims`] spills to the heap.
+/// 1-D and 2-D tensors are the norm; IVF-PQ codebooks need 3-D.
+const INLINE_DIMS: usize = 3;
 
-    /// Creates a new ManagedTensor on the current GPU device, and copies
-    /// the data into it.
-    pub fn to_device(&self, res: &Resources) -> Result<ManagedTensor> {
-        unsafe {
-            let bytes = dl_tensor_bytes(&self.0.dl_tensor);
-            let mut device_data: *mut std::ffi::c_void = std::ptr::null_mut();
+pub(crate) type TensorDims = TinyVec<[i64; INLINE_DIMS]>;
 
-            // allocate storage, copy over
-            check_cuvs(ffi::cuvsRMMAlloc(res.0, &mut device_data as *mut _, bytes))?;
+/// Conversion into a read-only [`DLTensorView`] for tensor inputs.
+///
+/// Implement this for your own tensor type by calling
+/// [`DLTensorView::from_raw_parts`] inside a small `unsafe` block, upholding its
+/// safety contract.
+///
+/// # Examples
+///
+/// A minimal adapter for a row-major matrix in device memory:
+///
+/// ```
+/// use cuvs::dlpack::{DLDevice, DLDeviceType, DLPackError, DLTensorView, DType, IntoDlTensor};
+///
+/// struct GpuMatrix<T> {
+///     ptr: *mut T,
+///     rows: usize,
+///     cols: usize,
+/// }
+///
+/// impl<'a, T: DType> IntoDlTensor<'a> for &'a GpuMatrix<T> {
+///     fn into_dl_tensor(self) -> Result<DLTensorView<'a>, DLPackError> {
+///         let shape = [self.rows as i64, self.cols as i64];
+///         // SAFETY: `ptr` points to `rows * cols` initialized elements of `T`
+///         // in device 0's memory, valid for `'a`, and is row-major contiguous.
+///         unsafe {
+///             DLTensorView::from_raw_parts(
+///                 self.ptr.cast(),
+///                 DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
+///                 &shape,
+///                 None,
+///                 T::dl_dtype(),
+///             )
+///         }
+///     }
+/// }
+/// ```
+pub trait IntoDlTensor<'a> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError>;
+}
 
-            let mut ret = ManagedTensor(self.0);
-            ret.0.dl_tensor.data = device_data;
-            ret.0.deleter = Some(rmm_free_tensor);
-            ret.0.dl_tensor.device.device_type = ffi::DLDeviceType::kDLCUDA;
+/// A public conversion trait for writable tensor inputs.
+///
+/// In addition to the [`DLTensorView::from_raw_parts`] invariants, writable
+/// adapters must guarantee exclusive access to the data region for `'a`.
+pub trait IntoDlTensorMut<'a> {
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError>;
+}
 
-            check_cuvs(ffi::cuvsMatrixCopy(res.0, self.as_ptr(), ret.as_ptr()))?;
+/// Maps a Rust element type to a DLPack [`DLDataType`].
+pub trait DType {
+    fn dl_dtype() -> ffi::DLDataType;
+}
 
-            Ok(ret)
+macro_rules! impl_dtype {
+    ($ty:ty, $code:expr, $bits:expr) => {
+        impl DType for $ty {
+            fn dl_dtype() -> ffi::DLDataType {
+                ffi::DLDataType { code: $code as u8, bits: $bits, lanes: 1 }
+            }
         }
+    };
+}
+
+impl_dtype!(f32, ffi::DLDataTypeCode::kDLFloat, 32);
+impl_dtype!(f64, ffi::DLDataTypeCode::kDLFloat, 64);
+impl_dtype!(i32, ffi::DLDataTypeCode::kDLInt, 32);
+impl_dtype!(i64, ffi::DLDataTypeCode::kDLInt, 64);
+impl_dtype!(u32, ffi::DLDataTypeCode::kDLUInt, 32);
+impl_dtype!(u64, ffi::DLDataTypeCode::kDLUInt, 64);
+impl_dtype!(u8, ffi::DLDataTypeCode::kDLUInt, 8);
+impl_dtype!(i8, ffi::DLDataTypeCode::kDLInt, 8);
+impl_dtype!(u16, ffi::DLDataTypeCode::kDLUInt, 16);
+impl_dtype!(i16, ffi::DLDataTypeCode::kDLInt, 16);
+
+/// Error when converting an external tensor to a DLPack view.
+#[derive(Debug, Clone, thiserror::Error)]
+#[non_exhaustive]
+pub enum DLPackError {
+    /// The tensor resides on a device not supported by cuVS.
+    #[error("unsupported tensor device: {0}")]
+    UnsupportedDevice(String),
+    /// The tensor dtype is not supported by the current adapter.
+    #[error("unsupported tensor dtype: {0}")]
+    UnsupportedDType(String),
+    /// A strides slice did not match the tensor rank.
+    #[error("strides length {strides} does not match tensor rank {ndim}")]
+    StridesLenMismatch { ndim: usize, strides: usize },
+    /// The source tensor reported invalid DLPack metadata.
+    #[error("invalid DLPack metadata: {0}")]
+    InvalidMetadata(&'static str),
+}
+
+/// A wrapper around [`ffi::DLManagedTensor`] with a lifetime attached.
+pub(crate) struct ManagedTensorRef<'a> {
+    pub(crate) inner: ffi::DLManagedTensor,
+    _borrow: PhantomData<&'a ()>,
+}
+
+impl ManagedTensorRef<'_> {
+    /// Returns a pointer to the inner [`ffi::DLManagedTensor`] for an FFI call.
+    ///
+    /// The pointer is valid only while this `ManagedTensorRef` is alive. This
+    /// covers calling `view.to_c().as_mut_ptr()` directly as an FFI argument,
+    /// because the temporary would live to the end of that statement.
+    pub(crate) fn as_mut_ptr(&mut self) -> *mut ffi::DLManagedTensor {
+        &mut self.inner
     }
+}
 
-    /// Copies data from device memory into host memory
-    pub fn to_host<
-        T: IntoDtype,
-        S: ndarray::RawData<Elem = T> + ndarray::RawDataMut,
-        D: ndarray::Dimension,
-    >(
-        &self,
-        res: &Resources,
-        arr: &mut ndarray::ArrayBase<S, D>,
-    ) -> Result<()> {
-        unsafe {
-            let mut dst = self.0;
-            dst.dl_tensor.data = arr.as_mut_ptr() as *mut std::ffi::c_void;
-            dst.dl_tensor.device.device_type = ffi::DLDeviceType::kDLCPU;
-            dst.deleter = None;
-
-            check_cuvs(ffi::cuvsMatrixCopy(res.0, self.as_ptr(), &mut dst))?;
-            Ok(())
+/// A non-owning, read-only DLPack tensor view.
+#[must_use]
+pub struct DLTensorView<'a> {
+    data: *mut std::ffi::c_void,
+    device: ffi::DLDevice,
+    dtype: ffi::DLDataType,
+    shape: TensorDims,
+    strides: Option<TensorDims>,
+    _marker: PhantomData<&'a ()>,
+}
+
+impl<'a> DLTensorView<'a> {
+    /// Construct a DLPack view from raw tensor metadata.
+    ///
+    /// # Safety
+    ///
+    /// The caller must guarantee that:
+    /// - `data` points to initialized storage matching `shape`, `strides`, and
+    ///   `dtype`, residing on the device described by `device`;
+    /// - that storage remains valid for the lifetime `'a`;
+    /// - the C API consumes the resulting [`DLManagedTensor`] (including its
+    ///   `shape`/`strides` pointers) only for the duration of the FFI call and
+    ///   does not retain it afterward — cuVS upholds this.
+    pub unsafe fn from_raw_parts(
+        data: *mut std::ffi::c_void,
+        device: ffi::DLDevice,
+        shape: &[i64],
+        strides: Option<&[i64]>,
+        dtype: ffi::DLDataType,
+    ) -> std::result::Result<Self, DLPackError> {
+        if let Some(s) = strides
+            && s.len() != shape.len()
+        {
+            return Err(DLPackError::StridesLenMismatch { ndim: shape.len(), strides: s.len() });
         }
+        Ok(Self {
+            data,
+            device,
+            dtype,
+            shape: shape.iter().copied().collect(),
+            strides: strides.map(|s| s.iter().copied().collect()),
+            _marker: PhantomData,
+        })
     }
-}
 
-/// Figures out how many bytes are in a DLTensor
-fn dl_tensor_bytes(tensor: &ffi::DLTensor) -> usize {
-    let mut bytes: usize = 1;
-    for dim in 0..tensor.ndim {
-        bytes *= unsafe { (*tensor.shape.add(dim as usize)) as usize };
-    }
-    bytes *= (tensor.dtype.bits / 8) as usize;
-    bytes
-}
-
-unsafe extern "C" fn rmm_free_tensor(self_: *mut ffi::DLManagedTensor) {
-    unsafe {
-        let bytes = dl_tensor_bytes(&(*self_).dl_tensor);
-        let res = Resources::new().unwrap();
-        let _ = ffi::cuvsRMMFree(res.0, (*self_).dl_tensor.data as *mut _, bytes);
-    }
-}
-
-/// Create a non-owning view of a Tensor from a ndarray
-impl<T: IntoDtype, S: ndarray::RawData<Elem = T>, D: ndarray::Dimension>
-    From<&ndarray::ArrayBase<S, D>> for ManagedTensor
-{
-    fn from(arr: &ndarray::ArrayBase<S, D>) -> Self {
-        // There is a draft PR out right now for creating dlpack directly from ndarray
-        // right now, but until its merged we have to implement ourselves
-        //https://github.com/rust-ndarray/ndarray/pull/1306/files
-        unsafe {
-            let mut ret = std::mem::MaybeUninit::<ffi::DLTensor>::uninit();
-            let tensor = ret.as_mut_ptr();
-            (*tensor).data = arr.as_ptr() as *mut std::os::raw::c_void;
-            (*tensor).device =
-                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 };
-            (*tensor).byte_offset = 0;
-            (*tensor).strides = std::ptr::null_mut(); // TODO: error if not rowmajor
-            (*tensor).ndim = arr.ndim() as i32;
-            (*tensor).shape = arr.shape().as_ptr() as *mut _;
-            (*tensor).dtype = T::ffi_dtype();
-            ManagedTensor(ffi::DLManagedTensor {
-                dl_tensor: ret.assume_init(),
+    /// Build a [`DLManagedTensor`] for an FFI call.
+    ///
+    /// DLPack stores `shape` and `strides` as mutable pointers, so this method
+    /// casts pointers derived from `&self` to `*mut` for C ABI compatibility.
+    /// The callee must treat them as `const`.
+    pub(crate) fn to_c(&self) -> ManagedTensorRef<'_> {
+        ManagedTensorRef {
+            inner: ffi::DLManagedTensor {
+                dl_tensor: ffi::DLTensor {
+                    data: self.data,
+                    device: self.device,
+                    ndim: self.shape.len() as i32,
+                    dtype: self.dtype,
+                    shape: self.shape.as_ptr() as *mut _,
+                    strides: self
+                        .strides
+                        .as_ref()
+                        .map_or(std::ptr::null_mut(), |s| s.as_ptr() as *mut _),
+                    byte_offset: 0,
+                },
                 manager_ctx: std::ptr::null_mut(),
                 deleter: None,
-            })
+            },
+            _borrow: PhantomData,
         }
     }
+
+    /// Number of dimensions.
+    pub fn ndim(&self) -> usize {
+        self.shape.len()
+    }
+
+    /// Shape of the tensor.
+    pub fn shape(&self) -> &[i64] {
+        &self.shape
+    }
+
+    /// Strides, if non-contiguous. `None` means row-major contiguous.
+    pub fn strides(&self) -> Option<&[i64]> {
+        self.strides.as_deref()
+    }
+
+    /// Element data type.
+    pub fn dtype(&self) -> ffi::DLDataType {
+        self.dtype
+    }
+
+    /// Device where the data resides.
+    pub fn device(&self) -> ffi::DLDevice {
+        self.device
+    }
 }
 
-impl Drop for ManagedTensor {
-    fn drop(&mut self) {
-        unsafe {
-            if let Some(deleter) = self.0.deleter {
-                deleter(&mut self.0 as *mut _);
-            }
-        }
+impl fmt::Debug for DLTensorView<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("DLTensorView")
+            .field("shape", &self.shape.as_slice())
+            .field("strides", &self.strides.as_deref())
+            .finish()
+    }
+}
+
+/// A non-owning, writable DLPack tensor view.
+#[must_use]
+pub struct DLTensorViewMut<'a> {
+    base: DLTensorView<'a>,
+    _unique: PhantomData<&'a mut ()>,
+}
+
+impl<'a> DLTensorViewMut<'a> {
+    /// Construct a writable DLPack view from raw tensor metadata.
+    ///
+    /// # Safety
+    ///
+    /// In addition to the [`DLTensorView::from_raw_parts`] invariants, the
+    /// caller must guarantee the storage is exclusively writable for `'a`.
+    pub unsafe fn from_raw_parts(
+        data: *mut std::ffi::c_void,
+        device: ffi::DLDevice,
+        shape: &[i64],
+        strides: Option<&[i64]>,
+        dtype: ffi::DLDataType,
+    ) -> std::result::Result<Self, DLPackError> {
+        Ok(Self {
+            base: unsafe { DLTensorView::from_raw_parts(data, device, shape, strides, dtype)? },
+            _unique: PhantomData,
+        })
+    }
+}
+
+impl<'a> Deref for DLTensorViewMut<'a> {
+    type Target = DLTensorView<'a>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.base
+    }
+}
+
+impl fmt::Debug for DLTensorViewMut<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("DLTensorViewMut")
+            .field("shape", &self.base.shape.as_slice())
+            .field("strides", &self.base.strides.as_deref())
+            .finish()
     }
 }
 
-impl IntoDtype for f32 {
-    fn ffi_dtype() -> ffi::DLDataType {
-        ffi::DLDataType { code: ffi::DLDataTypeCode::kDLFloat as _, bits: 32, lanes: 1 }
+impl<'a> IntoDlTensor<'a> for DLTensorView<'a> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        Ok(self)
     }
 }
 
-impl IntoDtype for f64 {
-    fn ffi_dtype() -> ffi::DLDataType {
-        ffi::DLDataType { code: ffi::DLDataTypeCode::kDLFloat as _, bits: 64, lanes: 1 }
+impl<'a> IntoDlTensor<'a> for DLTensorViewMut<'a> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        Ok(self.base)
     }
 }
 
-impl IntoDtype for i32 {
-    fn ffi_dtype() -> ffi::DLDataType {
-        ffi::DLDataType { code: ffi::DLDataTypeCode::kDLInt as _, bits: 32, lanes: 1 }
+impl<'a> IntoDlTensorMut<'a> for DLTensorViewMut<'a> {
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
+        Ok(self)
     }
 }
 
-impl IntoDtype for i64 {
-    fn ffi_dtype() -> ffi::DLDataType {
-        ffi::DLDataType { code: ffi::DLDataTypeCode::kDLInt as _, bits: 64, lanes: 1 }
+impl<'a> IntoDlTensor<'a> for &DLTensorView<'a> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        Ok(DLTensorView {
+            data: self.data,
+            device: self.device,
+            dtype: self.dtype,
+            shape: self.shape.clone(),
+            strides: self.strides.clone(),
+            _marker: PhantomData,
+        })
     }
 }
 
-impl IntoDtype for u32 {
-    fn ffi_dtype() -> ffi::DLDataType {
-        ffi::DLDataType { code: ffi::DLDataTypeCode::kDLUInt as _, bits: 32, lanes: 1 }
+// The result is deliberately bounded by the borrow `'b`, NOT by the data lifetime `'a`.
+impl<'a, 'b> IntoDlTensor<'b> for &'b DLTensorViewMut<'a> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'b>, DLPackError> {
+        self.deref().into_dl_tensor()
     }
 }
 
@@ -155,20 +342,76 @@ impl IntoDtype for u32 {
 mod tests {
     use super::*;
 
+    fn cpu() -> DLDevice {
+        DLDevice { device_type: DLDeviceType::kDLCPU, device_id: 0 }
+    }
+
     #[test]
-    fn test_from_ndarray() {
-        let arr = ndarray::Array::<f32, _>::zeros((8, 4));
+    fn to_c_translates_contiguous_metadata() {
+        let data = [0.0f32; 6];
+        let view = unsafe {
+            DLTensorView::from_raw_parts(
+                data.as_ptr() as *mut _,
+                cpu(),
+                &[2, 3],
+                None,
+                f32::dl_dtype(),
+            )
+        }
+        .unwrap();
 
-        let tensor = unsafe { (*(ManagedTensor::from(&arr).as_ptr())).dl_tensor };
+        let managed = view.to_c();
+        let t = &managed.inner.dl_tensor;
 
-        assert_eq!(tensor.ndim, 2);
+        assert_eq!(t.ndim, 2);
+        assert_eq!(t.data as *const f32, data.as_ptr());
+        assert_eq!(t.dtype.code, DLDataTypeCode::kDLFloat as u8);
+        assert_eq!(t.dtype.bits, 32);
+        assert_eq!(t.dtype.lanes, 1);
+        assert_eq!(t.byte_offset, 0);
+        assert!(managed.inner.manager_ctx.is_null());
+        assert!(managed.inner.deleter.is_none());
 
-        // make sure we can get the shape ok
-        assert_eq!(unsafe { *tensor.shape }, 8);
-        assert_eq!(unsafe { *tensor.shape.add(1) }, 4);
+        assert_eq!(unsafe { std::slice::from_raw_parts(t.shape, 2) }, &[2, 3]);
+        assert!(t.strides.is_null());
+    }
+
+    #[test]
+    fn to_c_preserves_explicit_strides() {
+        let data = [0.0f32; 6];
+        let view = unsafe {
+            DLTensorView::from_raw_parts(
+                data.as_ptr() as *mut _,
+                cpu(),
+                &[2, 3],
+                Some(&[1, 2]),
+                f32::dl_dtype(),
+            )
+        }
+        .unwrap();
+
+        let managed = view.to_c();
+        let t = &managed.inner.dl_tensor;
+
+        assert_eq!(unsafe { std::slice::from_raw_parts(t.shape, 2) }, &[2, 3]);
+        assert!(!t.strides.is_null());
+        assert_eq!(unsafe { std::slice::from_raw_parts(t.strides, 2) }, &[1, 2]);
+    }
+
+    #[test]
+    fn from_raw_parts_rejects_mismatched_strides_len() {
+        let data = [0.0f32; 6];
+        let err = unsafe {
+            DLTensorView::from_raw_parts(
+                data.as_ptr() as *mut _,
+                cpu(),
+                &[2, 3],
+                Some(&[1]),
+                f32::dl_dtype(),
+            )
+        }
+        .unwrap_err();
 
-        let arr = ndarray::Array::<f32, _>::zeros((8,));
-        let tensor = unsafe { (*(ManagedTensor::from(&arr).as_ptr())).dl_tensor };
-        assert_eq!(tensor.ndim, 1);
+        assert!(matches!(err, DLPackError::StridesLenMismatch { ndim: 2, strides: 1 }));
     }
 }
diff --git a/rust/cuvs/src/error.rs b/rust/cuvs/src/error.rs
index f9d5879a7b..f2942f5603 100644
--- a/rust/cuvs/src/error.rs
+++ b/rust/cuvs/src/error.rs
@@ -14,6 +14,8 @@ pub struct CuvsError {
 #[derive(Debug, Clone)]
 pub enum Error {
     CuvsError(CuvsError),
+    /// Tensor conversion into DLPack metadata failed.
+    DLPack(crate::dlpack::DLPackError),
     /// The caller passed an argument that could not be forwarded to the C API
     /// (e.g. a filename containing an interior NUL byte or invalid UTF-8).
     InvalidArgument(String),
@@ -28,6 +30,7 @@ impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
             Error::CuvsError(cuvs_error) => write!(f, "cuvsError={:?}", cuvs_error),
+            Error::DLPack(err) => write!(f, "DLPack error: {}", err),
             Error::InvalidArgument(msg) => write!(f, "invalid argument: {}", msg),
         }
     }
@@ -55,3 +58,9 @@ pub fn check_cuvs(err: ffi::cuvsError_t) -> Result<()> {
         }
     }
 }
+
+impl From<crate::dlpack::DLPackError> for Error {
+    fn from(err: crate::dlpack::DLPackError) -> Self {
+        Self::DLPack(err)
+    }
+}
diff --git a/rust/cuvs/src/ivf_flat/index.rs b/rust/cuvs/src/ivf_flat/index.rs
index a602d64c05..cdf806a149 100644
--- a/rust/cuvs/src/ivf_flat/index.rs
+++ b/rust/cuvs/src/ivf_flat/index.rs
@@ -5,7 +5,7 @@
 
 use std::io::{Write, stderr};
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Result, check_cuvs};
 use crate::ivf_flat::{IndexParams, SearchParams};
 use crate::resources::Resources;
@@ -15,22 +15,29 @@ use crate::resources::Resources;
 pub struct Index(ffi::cuvsIvfFlatIndex_t);
 
 impl Index {
-    /// Builds a new Index from the dataset for efficient search.
+    /// Builds an IVF-Flat index over `dataset` for efficient search.
     ///
-    /// # Arguments
+    /// `dataset` is a row-major matrix on the host or device implementing
+    /// [`IntoDlTensor`](crate::IntoDlTensor). It is copied into the index, so the
+    /// caller may free it once this call returns (hence `Index` carries no
+    /// lifetime).
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters for building the index
-    /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    /// Supported dataset/query dtypes in the current C-backed implementation are
+    /// `f32`, `f16`, `i8`, and `u8`.
+    pub fn build<'a>(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
+        dataset: impl IntoDlTensor<'a>,
     ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
+        let dataset = dataset.into_dl_tensor()?;
         let index = Index::new()?;
         unsafe {
-            check_cuvs(ffi::cuvsIvfFlatBuild(res.0, params.0, dataset.as_ptr(), index.0))?;
+            check_cuvs(ffi::cuvsIvfFlatBuild(
+                res.0,
+                params.0,
+                dataset.to_c().as_mut_ptr(),
+                index.0,
+            ))?;
         }
         Ok(index)
     }
@@ -44,23 +51,24 @@ impl Index {
         }
     }
 
-    /// Perform a Approximate Nearest Neighbors search on the Index
-    ///
-    /// # Arguments
+    /// Searches the index for the `k` nearest neighbors of each query.
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters to use in searching the index
-    /// * `queries` - A matrix in device memory to query for
-    /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
-    /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
-    pub fn search(
+    /// `queries`, `neighbors`, and `distances` must reside in device memory and
+    /// implement [`IntoDlTensor`](crate::IntoDlTensor) /
+    /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` receives the
+    /// neighbor indices and `distances` their distances; both are written in
+    /// place.
+    pub fn search<'a>(
         &self,
         res: &Resources,
         params: &SearchParams,
-        queries: &ManagedTensor,
-        neighbors: &ManagedTensor,
-        distances: &ManagedTensor,
+        queries: impl IntoDlTensor<'a>,
+        neighbors: impl IntoDlTensorMut<'a>,
+        distances: impl IntoDlTensorMut<'a>,
     ) -> Result<()> {
+        let queries = queries.into_dl_tensor()?;
+        let neighbors = neighbors.into_dl_tensor_mut()?;
+        let distances = distances.into_dl_tensor_mut()?;
         unsafe {
             let prefilter = ffi::cuvsFilter { addr: 0, type_: ffi::cuvsFilterType::NO_FILTER };
 
@@ -68,9 +76,9 @@ impl Index {
                 res.0,
                 params.0,
                 self.0,
-                queries.as_ptr(),
-                neighbors.as_ptr(),
-                distances.as_ptr(),
+                queries.to_c().as_mut_ptr(),
+                neighbors.to_c().as_mut_ptr(),
+                distances.to_c().as_mut_ptr(),
                 prefilter,
             ))
         }
@@ -89,6 +97,7 @@ impl Drop for Index {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray::s;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
@@ -102,39 +111,41 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 1024;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         // build the ivf-flat index
-        let index = Index::build(&res, &build_params, dataset_device)
+        let index = Index::build(&res, &build_params, &dataset_device)
             .expect("failed to create ivf-flat index");
 
         // use the first 4 points from the dataset as queries : will test that we get them back
         // as their own nearest neighbor
         let n_queries = 4;
-        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = dataset.slice(s![0..n_queries, ..]).to_owned();
 
         let k = 10;
 
         // IvfFlat search API requires queries and outputs to be on device memory
         // copy query data over, and allocate new device memory for the distances/ neighbors
         // outputs
-        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let queries = DeviceTensor::from_host(&res, &queries).unwrap();
         let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
-        let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
+        let mut neighbors = DeviceTensor::<i64>::zeros(&res, &[n_queries, k]).unwrap();
 
         let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-        let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+        let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
         let search_params = SearchParams::new().unwrap();
 
-        index.search(&res, &search_params, &queries, &neighbors, &distances).unwrap();
+        index.search(&res, &search_params, &queries, &mut neighbors, &mut distances).unwrap();
 
         // Copy back to host memory
-        distances.to_host(&res, &mut distances_host).unwrap();
-        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+        distances.copy_to_host(&res, &mut distances_host).unwrap();
+        neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
 
         // nearest neighbors should be themselves, since queries are from the
         // dataset
@@ -154,13 +165,15 @@ mod tests {
         // Create a random dataset
         let n_datapoints = 1024;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         // Build the index once
-        let index = Index::build(&res, &build_params, dataset_device)
+        let index = Index::build(&res, &build_params, &dataset_device)
             .expect("failed to create ivf-flat index");
 
         let search_params = SearchParams::new().unwrap();
@@ -169,23 +182,23 @@ mod tests {
         // Perform multiple searches on the same index
         for search_iter in 0..3 {
             let n_queries = 4;
-            let queries = dataset.slice(s![0..n_queries, ..]);
-            let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+            let queries = dataset.slice(s![0..n_queries, ..]).to_owned();
+            let queries = DeviceTensor::from_host(&res, &queries).unwrap();
 
             let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
-            let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
+            let mut neighbors = DeviceTensor::<i64>::zeros(&res, &[n_queries, k]).unwrap();
 
             let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-            let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+            let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
             // This should work on every iteration because search() takes &self
             index
-                .search(&res, &search_params, &queries, &neighbors, &distances)
+                .search(&res, &search_params, &queries, &mut neighbors, &mut distances)
                 .expect(&format!("search iteration {} failed", search_iter));
 
             // Copy back to host memory
-            distances.to_host(&res, &mut distances_host).unwrap();
-            neighbors.to_host(&res, &mut neighbors_host).unwrap();
+            distances.copy_to_host(&res, &mut distances_host).unwrap();
+            neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
 
             // Verify results are consistent
             assert_eq!(
diff --git a/rust/cuvs/src/ivf_flat/mod.rs b/rust/cuvs/src/ivf_flat/mod.rs
index 7417116965..b3fdcc0ed4 100644
--- a/rust/cuvs/src/ivf_flat/mod.rs
+++ b/rust/cuvs/src/ivf_flat/mod.rs
@@ -1,71 +1,18 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
-//! The IVF-Flat method is an ANN algorithm. It uses an inverted file index (IVF) with
-//! unmodified (that is, flat) vectors. This algorithm provides simple knobs to reduce
-//! the overall search space and to trade-off accuracy for speed.
-//!
-//! Example:
-//! ```
-//!
-//! use cuvs::ivf_flat::{Index, IndexParams, SearchParams};
-//! use cuvs::{ManagedTensor, Resources, Result};
-//!
-//! use ndarray::s;
-//! use ndarray_rand::rand_distr::Uniform;
-//! use ndarray_rand::RandomExt;
-//!
-//! fn ivf_flat_example() -> Result<()> {
-//!     let res = Resources::new()?;
-//!
-//!     // Create a new random dataset to index
-//!     let n_datapoints = 65536;
-//!     let n_features = 512;
-//!     let dataset =
-//!         ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-//!
-//!     // build the ivf-flat index
-//!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
-//!     println!(
-//!         "Indexed {}x{} datapoints into ivf-flat index",
-//!         n_datapoints, n_features
-//!     );
-//!
-//!     // use the first 4 points from the dataset as queries : will test that we get them back
-//!     // as their own nearest neighbor
-//!     let n_queries = 4;
-//!     let queries = dataset.slice(s![0..n_queries, ..]);
-//!
-//!     let k = 10;
-//!
-//!     // Ivf-Flat search API requires queries and outputs to be on device memory
-//!     // copy query data over, and allocate new device memory for the distances/ neighbors
-//!     // outputs
-//!     let queries = ManagedTensor::from(&queries).to_device(&res)?;
-//!     let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-//!     let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;
-//!
-//!     let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-//!     let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
-//!
-//!     let search_params = SearchParams::new()?;
-//!
-//!     index.search(&res, &search_params, &queries, &neighbors, &distances)?;
-//!
-//!     // Copy back to host memory
-//!     distances.to_host(&res, &mut distances_host)?;
-//!     neighbors.to_host(&res, &mut neighbors_host)?;
-//!
-//!     // nearest neighbors should be themselves, since queries are from the
-//!     // dataset
-//!     println!("Neighbors {:?}", neighbors_host);
-//!     println!("Distances {:?}", distances_host);
-//!     Ok(())
-//! }
-//! ```
+//! IVF-Flat: an inverted-file index over uncompressed ("flat") vectors. It
+//! partitions the dataset into `n_lists` clusters and, at query time, scans only
+//! the `n_probes` closest clusters — a simple knob to trade recall for speed.
+//!
+//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with
+//! device-resident queries and output buffers. Tensors are passed through the
+//! [`IntoDlTensor`](crate::IntoDlTensor) /
+//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs`
+//! for the same build/search workflow.
 
 mod index;
 mod index_params;
diff --git a/rust/cuvs/src/ivf_pq/index.rs b/rust/cuvs/src/ivf_pq/index.rs
index 492fefa0f1..e2ce741a33 100644
--- a/rust/cuvs/src/ivf_pq/index.rs
+++ b/rust/cuvs/src/ivf_pq/index.rs
@@ -5,7 +5,7 @@
 
 use std::io::{Write, stderr};
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{IntoDlTensor, IntoDlTensorMut};
 use crate::error::{Result, check_cuvs};
 use crate::ivf_pq::{IndexParams, SearchParams};
 use crate::resources::Resources;
@@ -15,22 +15,24 @@ use crate::resources::Resources;
 pub struct Index(ffi::cuvsIvfPqIndex_t);
 
 impl Index {
-    /// Builds a new Index from the dataset for efficient search.
+    /// Builds an IVF-PQ index over `dataset` for efficient search.
     ///
-    /// # Arguments
+    /// `dataset` is a row-major matrix on the host or device implementing
+    /// [`IntoDlTensor`](crate::IntoDlTensor). It is copied (and quantized) into
+    /// the index, so the caller may free it once this call returns (hence
+    /// `Index` carries no lifetime).
     ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters for building the index
-    /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    /// Supported dataset/query dtypes in the current C-backed implementation are
+    /// `f32`, `f16`, `i8`, and `u8`.
+    pub fn build<'a>(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
+        dataset: impl IntoDlTensor<'a>,
     ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
+        let dataset = dataset.into_dl_tensor()?;
         let index = Index::new()?;
         unsafe {
-            check_cuvs(ffi::cuvsIvfPqBuild(res.0, params.0, dataset.as_ptr(), index.0))?;
+            check_cuvs(ffi::cuvsIvfPqBuild(res.0, params.0, dataset.to_c().as_mut_ptr(), index.0))?;
         }
         Ok(index)
     }
@@ -44,31 +46,32 @@ impl Index {
         }
     }
 
-    /// Perform a Approximate Nearest Neighbors search on the Index
+    /// Searches the index for the `k` nearest neighbors of each query.
     ///
-    /// # Arguments
-    ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters to use in searching the index
-    /// * `queries` - A matrix in device memory to query for
-    /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
-    /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
-    pub fn search(
+    /// `queries`, `neighbors`, and `distances` must reside in device memory and
+    /// implement [`IntoDlTensor`](crate::IntoDlTensor) /
+    /// [`IntoDlTensorMut`](crate::IntoDlTensorMut). `neighbors` receives the
+    /// neighbor indices and `distances` their distances; both are written in
+    /// place.
+    pub fn search<'a>(
         &self,
         res: &Resources,
         params: &SearchParams,
-        queries: &ManagedTensor,
-        neighbors: &ManagedTensor,
-        distances: &ManagedTensor,
+        queries: impl IntoDlTensor<'a>,
+        neighbors: impl IntoDlTensorMut<'a>,
+        distances: impl IntoDlTensorMut<'a>,
     ) -> Result<()> {
+        let queries = queries.into_dl_tensor()?;
+        let neighbors = neighbors.into_dl_tensor_mut()?;
+        let distances = distances.into_dl_tensor_mut()?;
         unsafe {
             check_cuvs(ffi::cuvsIvfPqSearch(
                 res.0,
                 params.0,
                 self.0,
-                queries.as_ptr(),
-                neighbors.as_ptr(),
-                distances.as_ptr(),
+                queries.to_c().as_mut_ptr(),
+                neighbors.to_c().as_mut_ptr(),
+                distances.to_c().as_mut_ptr(),
             ))
         }
     }
@@ -86,6 +89,7 @@ impl Drop for Index {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_utils::DeviceTensor;
     use ndarray::s;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
@@ -99,39 +103,41 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 1024;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         // build the ivf-pq index
-        let index = Index::build(&res, &build_params, dataset_device)
+        let index = Index::build(&res, &build_params, &dataset_device)
             .expect("failed to create ivf-pq index");
 
         // use the first 4 points from the dataset as queries : will test that we get them back
         // as their own nearest neighbor
         let n_queries = 4;
-        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = dataset.slice(s![0..n_queries, ..]).to_owned();
 
         let k = 10;
 
         // Ivf-Pq search API requires queries and outputs to be on device memory
         // copy query data over, and allocate new device memory for the distances/ neighbors
         // outputs
-        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let queries = DeviceTensor::from_host(&res, &queries).unwrap();
         let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
-        let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
+        let mut neighbors = DeviceTensor::<i64>::zeros(&res, &[n_queries, k]).unwrap();
 
         let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-        let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+        let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
         let search_params = SearchParams::new().unwrap();
 
-        index.search(&res, &search_params, &queries, &neighbors, &distances).unwrap();
+        index.search(&res, &search_params, &queries, &mut neighbors, &mut distances).unwrap();
 
         // Copy back to host memory
-        distances.to_host(&res, &mut distances_host).unwrap();
-        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+        distances.copy_to_host(&res, &mut distances_host).unwrap();
+        neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
 
         // nearest neighbors should be themselves, since queries are from the
         // dataset
@@ -151,13 +157,15 @@ mod tests {
         // Create a random dataset
         let n_datapoints = 1024;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         // Build the index once
-        let index = Index::build(&res, &build_params, dataset_device)
+        let index = Index::build(&res, &build_params, &dataset_device)
             .expect("failed to create ivf-pq index");
 
         let search_params = SearchParams::new().unwrap();
@@ -166,23 +174,23 @@ mod tests {
         // Perform multiple searches on the same index
         for search_iter in 0..3 {
             let n_queries = 4;
-            let queries = dataset.slice(s![0..n_queries, ..]);
-            let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+            let queries = dataset.slice(s![0..n_queries, ..]).to_owned();
+            let queries = DeviceTensor::from_host(&res, &queries).unwrap();
 
             let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
-            let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res).unwrap();
+            let mut neighbors = DeviceTensor::<i64>::zeros(&res, &[n_queries, k]).unwrap();
 
             let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-            let distances = ManagedTensor::from(&distances_host).to_device(&res).unwrap();
+            let mut distances = DeviceTensor::<f32>::zeros(&res, &[n_queries, k]).unwrap();
 
             // This should work on every iteration because search() takes &self
             index
-                .search(&res, &search_params, &queries, &neighbors, &distances)
+                .search(&res, &search_params, &queries, &mut neighbors, &mut distances)
                 .expect(&format!("search iteration {} failed", search_iter));
 
             // Copy back to host memory
-            distances.to_host(&res, &mut distances_host).unwrap();
-            neighbors.to_host(&res, &mut neighbors_host).unwrap();
+            distances.copy_to_host(&res, &mut distances_host).unwrap();
+            neighbors.copy_to_host(&res, &mut neighbors_host).unwrap();
 
             // Verify results are consistent
             assert_eq!(
diff --git a/rust/cuvs/src/ivf_pq/mod.rs b/rust/cuvs/src/ivf_pq/mod.rs
index c4676cd1aa..dce6c9810a 100644
--- a/rust/cuvs/src/ivf_pq/mod.rs
+++ b/rust/cuvs/src/ivf_pq/mod.rs
@@ -1,68 +1,18 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
-//! Inverted File Product Quantization
-//!
-//! Example:
-//! ```
-//!
-//! use cuvs::ivf_pq::{Index, IndexParams, SearchParams};
-//! use cuvs::{ManagedTensor, Resources, Result};
-//!
-//! use ndarray::s;
-//! use ndarray_rand::rand_distr::Uniform;
-//! use ndarray_rand::RandomExt;
-//!
-//! fn ivf_pq_example() -> Result<()> {
-//!     let res = Resources::new()?;
-//!
-//!     // Create a new random dataset to index
-//!     let n_datapoints = 65536;
-//!     let n_features = 512;
-//!     let dataset =
-//!         ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
-//!
-//!     // build the ivf-pq index
-//!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
-//!     println!(
-//!         "Indexed {}x{} datapoints into ivf-pq index",
-//!         n_datapoints, n_features
-//!     );
-//!
-//!     // use the first 4 points from the dataset as queries : will test that we get them back
-//!     // as their own nearest neighbor
-//!     let n_queries = 4;
-//!     let queries = dataset.slice(s![0..n_queries, ..]);
-//!
-//!     let k = 10;
-//!
-//!     // Ivf-Pq search API requires queries and outputs to be on device memory
-//!     // copy query data over, and allocate new device memory for the distances/ neighbors
-//!     // outputs
-//!     let queries = ManagedTensor::from(&queries).to_device(&res)?;
-//!     let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
-//!     let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;
-//!
-//!     let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
-//!     let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
-//!
-//!     let search_params = SearchParams::new()?;
-//!
-//!     index.search(&res, &search_params, &queries, &neighbors, &distances)?;
-//!
-//!     // Copy back to host memory
-//!     distances.to_host(&res, &mut distances_host)?;
-//!     neighbors.to_host(&res, &mut neighbors_host)?;
-//!
-//!     // nearest neighbors should be themselves, since queries are from the
-//!     // dataset
-//!     println!("Neighbors {:?}", neighbors_host);
-//!     println!("Distances {:?}", distances_host);
-//!     Ok(())
-//! }
-//! ```
+//! IVF-PQ: an inverted-file index that product-quantizes the vectors. Like
+//! IVF-Flat it partitions the dataset into `n_lists` clusters and scans the
+//! `n_probes` closest at query time, but compresses each vector into `pq_dim`
+//! codes of `pq_bits` bits — much smaller, slightly less accurate.
+//!
+//! Build an [`Index`] from a dataset, then [`search`](Index::search) it with
+//! device-resident queries and output buffers. Tensors are passed through the
+//! [`IntoDlTensor`](crate::IntoDlTensor) /
+//! [`IntoDlTensorMut`](crate::IntoDlTensorMut) traits; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model and `examples/cagra.rs`
+//! for the same build/search workflow.
 
 mod index;
 mod index_params;
diff --git a/rust/cuvs/src/lib.rs b/rust/cuvs/src/lib.rs
index 519519440b..dea2329336 100644
--- a/rust/cuvs/src/lib.rs
+++ b/rust/cuvs/src/lib.rs
@@ -14,13 +14,17 @@ pub mod cagra;
 pub mod cluster;
 pub mod distance;
 pub mod distance_type;
-mod dlpack;
+pub mod dlpack;
 mod error;
 pub mod ivf_flat;
 pub mod ivf_pq;
 mod resources;
+#[cfg(test)]
+pub(crate) mod test_utils;
 pub mod vamana;
 
-pub use dlpack::ManagedTensor;
+pub use dlpack::{
+    DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor, IntoDlTensorMut,
+};
 pub use error::{Error, Result};
 pub use resources::Resources;
diff --git a/rust/cuvs/src/resources.rs b/rust/cuvs/src/resources.rs
index 70f128abb7..f6e9c40626 100644
--- a/rust/cuvs/src/resources.rs
+++ b/rust/cuvs/src/resources.rs
@@ -23,7 +23,12 @@ impl Resources {
     }
 
     /// Sets the current cuda stream
-    pub fn set_cuda_stream(&self, stream: ffi::cudaStream_t) -> Result<()> {
+    ///
+    /// # Safety
+    ///
+    /// `stream` must be a valid CUDA stream that remains valid for as long as it
+    /// is used by this resources handle.
+    pub unsafe fn set_cuda_stream(&self, stream: ffi::cudaStream_t) -> Result<()> {
         unsafe { check_cuvs(ffi::cuvsStreamSet(self.0, stream)) }
     }
 
diff --git a/rust/cuvs/src/test_utils.rs b/rust/cuvs/src/test_utils.rs
new file mode 100644
index 0000000000..192fba39f8
--- /dev/null
+++ b/rust/cuvs/src/test_utils.rs
@@ -0,0 +1,236 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//! Test-only tensor adapters.
+//!
+//! [`DeviceTensor`] is an RMM-backed device matrix, and the `ndarray` host
+//! adapters below implement [`IntoDlTensor`]/[`IntoDlTensorMut`] for plain host
+//! arrays. We use `ndarray` only as a dev-dependency to assist with unit tests.
+
+use std::marker::PhantomData;
+
+use crate::dlpack::{
+    DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor, IntoDlTensorMut,
+};
+use crate::error::{Result, check_cuvs};
+use crate::ffi;
+use crate::resources::Resources;
+
+pub(crate) struct DeviceTensor<'res, T: DType> {
+    data: *mut std::ffi::c_void,
+    shape: Vec<i64>,
+    capacity_bytes: usize,
+    resources: &'res Resources,
+    _marker: PhantomData<T>,
+}
+
+impl<'res, T: DType> DeviceTensor<'res, T> {
+    pub(crate) fn zeros(res: &'res Resources, shape: &[usize]) -> Result<Self> {
+        let capacity_bytes = shape.iter().product::<usize>() * std::mem::size_of::<T>();
+        let mut data: *mut std::ffi::c_void = std::ptr::null_mut();
+        unsafe {
+            check_cuvs(ffi::cuvsRMMAlloc(res.0, &mut data, capacity_bytes))?;
+        }
+
+        Ok(Self {
+            data,
+            shape: shape.iter().map(|&dim| dim as i64).collect(),
+            capacity_bytes,
+            resources: res,
+            _marker: PhantomData,
+        })
+    }
+
+    pub(crate) fn from_host<D>(res: &'res Resources, host: &ndarray::ArrayRef<T, D>) -> Result<Self>
+    where
+        D: ndarray::Dimension,
+    {
+        let shape: Vec<usize> = host.shape().to_vec();
+        let mut device = Self::zeros(res, &shape)?;
+
+        let host_shape: Vec<i64> = host.shape().iter().map(|&dim| dim as i64).collect();
+        let host_strides: Option<Vec<i64>> = if host.is_standard_layout() {
+            None
+        } else {
+            Some(host.strides().iter().map(|&stride| stride as i64).collect())
+        };
+        let host = unsafe {
+            DLTensorView::from_raw_parts(
+                host.as_ptr() as *mut _,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 },
+                &host_shape,
+                host_strides.as_deref(),
+                T::dl_dtype(),
+            )?
+        };
+        let device_view = (&mut device).into_dl_tensor_mut()?;
+        unsafe {
+            check_cuvs(ffi::cuvsMatrixCopy(
+                res.0,
+                host.to_c().as_mut_ptr(),
+                device_view.to_c().as_mut_ptr(),
+            ))?;
+        }
+
+        Ok(device)
+    }
+
+    pub(crate) fn copy_to_host<D>(
+        &self,
+        res: &Resources,
+        host: &mut ndarray::ArrayRef<T, D>,
+    ) -> Result<()>
+    where
+        D: ndarray::Dimension,
+    {
+        let host_shape: Vec<i64> = host.shape().iter().map(|&dim| dim as i64).collect();
+        let host_strides: Option<Vec<i64>> = if host.is_standard_layout() {
+            None
+        } else {
+            Some(host.strides().iter().map(|&stride| stride as i64).collect())
+        };
+        let host = unsafe {
+            DLTensorViewMut::from_raw_parts(
+                host.as_mut_ptr() as *mut _,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 },
+                &host_shape,
+                host_strides.as_deref(),
+                T::dl_dtype(),
+            )?
+        };
+        let device = self.into_dl_tensor()?;
+        unsafe {
+            check_cuvs(ffi::cuvsMatrixCopy(
+                res.0,
+                device.to_c().as_mut_ptr(),
+                host.to_c().as_mut_ptr(),
+            ))?;
+        }
+
+        Ok(())
+    }
+}
+
+impl<T: DType> Drop for DeviceTensor<'_, T> {
+    fn drop(&mut self) {
+        if !self.data.is_null() {
+            let _ = unsafe { ffi::cuvsRMMFree(self.resources.0, self.data, self.capacity_bytes) };
+        }
+    }
+}
+
+impl<'a, 'res, T: DType> IntoDlTensor<'a> for &'a DeviceTensor<'res, T> {
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        unsafe {
+            DLTensorView::from_raw_parts(
+                self.data,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+impl<'a, 'res, T: DType> IntoDlTensorMut<'a> for &'a mut DeviceTensor<'res, T> {
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
+        unsafe {
+            DLTensorViewMut::from_raw_parts(
+                self.data,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCUDA, device_id: 0 },
+                &self.shape,
+                None,
+                T::dl_dtype(),
+            )
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// ndarray host adapters (test-only)
+// ---------------------------------------------------------------------------
+
+fn array_layout<A, D>(arr: &ndarray::ArrayRef<A, D>) -> (Vec<i64>, Option<Vec<i64>>)
+where
+    D: ndarray::Dimension,
+{
+    let shape = arr.shape().iter().map(|&d| d as i64).collect();
+    let strides = if arr.is_standard_layout() {
+        None
+    } else {
+        Some(arr.strides().iter().map(|&s| s as i64).collect())
+    };
+    (shape, strides)
+}
+
+impl<'a, A, D> IntoDlTensor<'a> for &'a ndarray::ArrayRef<A, D>
+where
+    A: DType,
+    D: ndarray::Dimension,
+{
+    fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
+        let (shape, strides) = array_layout(self);
+        unsafe {
+            DLTensorView::from_raw_parts(
+                self.as_ptr() as *mut _,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 },
+                &shape,
+                strides.as_deref(),
+                A::dl_dtype(),
+            )
+        }
+    }
+}
+
+impl<'a, A, D> IntoDlTensorMut<'a> for &'a mut ndarray::ArrayRef<A, D>
+where
+    A: DType,
+    D: ndarray::Dimension,
+{
+    fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
+        let (shape, strides) = array_layout(self);
+        unsafe {
+            DLTensorViewMut::from_raw_parts(
+                self.as_mut_ptr() as *mut _,
+                ffi::DLDevice { device_type: ffi::DLDeviceType::kDLCPU, device_id: 0 },
+                &shape,
+                strides.as_deref(),
+                A::dl_dtype(),
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // The ndarray adapter must report a contiguous array as contiguous (no
+    // strides) with the correct shape, device, and dtype.
+    #[test]
+    fn ndarray_contiguous_view_omits_strides() {
+        let arr = ndarray::Array2::<f32>::zeros((10, 20));
+        let view = (&arr).into_dl_tensor().unwrap();
+
+        assert_eq!(view.shape(), &[10, 20]);
+        assert!(view.strides().is_none());
+        assert_eq!(view.device().device_type, ffi::DLDeviceType::kDLCPU);
+        assert_eq!(view.dtype().code, ffi::DLDataTypeCode::kDLFloat as u8);
+        assert_eq!(view.dtype().bits, 32);
+    }
+
+    // A non-standard layout must surface explicit strides so cuVS reads the
+    // data in the right order.
+    #[test]
+    fn ndarray_transposed_view_reports_strides() {
+        let arr = ndarray::Array2::<f32>::zeros((10, 20));
+        let transposed = arr.t();
+        let view = (&transposed).into_dl_tensor().unwrap();
+
+        assert_eq!(view.shape(), &[20, 10]);
+        assert_eq!(view.strides(), Some(&[1i64, 20][..]));
+    }
+}
diff --git a/rust/cuvs/src/vamana/index.rs b/rust/cuvs/src/vamana/index.rs
index 485f8ac008..975fb79006 100644
--- a/rust/cuvs/src/vamana/index.rs
+++ b/rust/cuvs/src/vamana/index.rs
@@ -6,7 +6,7 @@
 use std::ffi::CString;
 use std::io::{Write, stderr};
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::IntoDlTensor;
 use crate::error::{Result, check_cuvs};
 use crate::resources::Resources;
 use crate::vamana::IndexParams;
@@ -24,21 +24,25 @@ impl Index {
     /// all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied
     /// to improve graph quality. The index_params struct controls the degree of the final graph.
     ///
-    ///
-    /// # Arguments
-    ///
-    /// * `res` - Resources to use
-    /// * `params` - Parameters for building the index
-    /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    /// `dataset` is a row-major matrix on the host or device implementing
+    /// [`IntoDlTensor`](crate::IntoDlTensor); it is copied into the index.
+    pub fn build<'a>(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
+        dataset: impl IntoDlTensor<'a>,
     ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
+        let dataset = dataset.into_dl_tensor()?;
         let index = Index::new()?;
+        // `cuvsVamanaBuild` copies the dataset into the index, so the index does not
+        // retain a view into `dataset`; the borrow only needs to be valid for the
+        // duration of this call. That is why `Index` carries no lifetime.
         unsafe {
-            check_cuvs(ffi::cuvsVamanaBuild(res.0, params.0, dataset.as_ptr(), index.0))?;
+            check_cuvs(ffi::cuvsVamanaBuild(
+                res.0,
+                params.0,
+                dataset.to_c().as_mut_ptr(),
+                index.0,
+            ))?;
         }
         Ok(index)
     }
@@ -88,7 +92,7 @@ impl Drop for Index {
 #[cfg(test)]
 mod tests {
     use super::*;
-
+    use crate::test_utils::DeviceTensor;
     use ndarray_rand::RandomExt;
     use ndarray_rand::rand_distr::Uniform;
 
@@ -101,13 +105,15 @@ mod tests {
         // Create a new random dataset to index
         let n_datapoints = 1024;
         let n_features = 16;
-        let dataset =
-            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+        let dataset = ndarray::Array::<f32, _>::random(
+            (n_datapoints, n_features),
+            Uniform::new(0., 1.0).unwrap(),
+        );
 
-        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let dataset_device = DeviceTensor::from_host(&res, &dataset).unwrap();
 
         // build the vamana index
-        let _index = Index::build(&res, &build_params, dataset_device)
+        let _index = Index::build(&res, &build_params, &dataset_device)
             .expect("failed to create vamana index");
     }
 }
diff --git a/rust/cuvs/src/vamana/mod.rs b/rust/cuvs/src/vamana/mod.rs
index a3ae4ee9ff..4dd2434407 100644
--- a/rust/cuvs/src/vamana/mod.rs
+++ b/rust/cuvs/src/vamana/mod.rs
@@ -2,7 +2,11 @@
  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
-//! Vamana
+//! Vamana: builds a DiskANN-style Vamana graph over a dataset.
+//!
+//! Build an [`Index`] from a dataset (then typically serialize it). The dataset
+//! is passed through the [`IntoDlTensor`](crate::IntoDlTensor) trait; see the
+//! [`dlpack`](crate::dlpack) module for the tensor model.
 
 mod index;
 mod index_params;