
Commit 58d45e5

Incorporate review feedback
1 parent 0247195 commit 58d45e5

10 files changed: +72 -41 lines changed


docs/coop_vec.rst

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ Dr.Jit supports cooperative vectors on both of its backends:
 <https://raytracing-docs.nvidia.com/optix9/guide/index.html#cooperative_vectors#neural-rendering-with-cooperative-vectors>`__,
 leveraging built-in `tensor cores
 <https://www.nvidia.com/en-us/data-center/tensor-cores/>`__ for acceleration.
+Driver version R570 or newer is required to use this feature.

 - On the **CPU (LLVM) backend**, compilation of cooperative vector operations
 targets the available instruction set extensions (AVX512, NEON, etc.).
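For context, a minimal usage sketch of the API this documentation change concerns (a hypothetical example: the drjit.nn.CoopVec, pack, matvec, and cast names appear elsewhere in this commit, while the concrete shapes and the dr.ones call are assumptions):

    import drjit as dr
    import drjit.nn as nn
    from drjit.cuda.ad import TensorXf16, Float16, Float32

    # Row-major 16x8 half-precision weight matrix, repacked into an
    # inference-optimal layout for the CUDA/OptiX backend (driver R570+)
    A = dr.ones(TensorXf16, shape=(16, 8))
    A_buf, A_view = nn.pack(A, layout='inference')

    # Build a cooperative vector from eight half-precision inputs
    x = nn.CoopVec(*(Float16(0.1 * i) for i in range(8)))

    # y = A @ x, evaluated with tensor cores on CUDA (or AVX512/NEON on LLVM)
    y = nn.matvec(A_view, x)

    # Optionally change precision, then unpack before evaluating
    y = nn.cast(y, Float32)
    dr.eval(*y)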

docs/nn.rst

Lines changed: 10 additions & 8 deletions
@@ -90,39 +90,41 @@ mixed-precision training.
 net = net.alloc(TensorXf16, 2)

 # Convert to training-optimal layout
-coeffs, net = nn.pack(net, layout='training')
+weights, net = nn.pack(net, layout='training')
 print(net)

-# Optimize a single precision copy of the parameters
-opt = Adam(lr=1e-3, params={'coeffs': Float32(coeffs)})
+# Optimize a single-precision copy of the parameters
+opt = Adam(lr=1e-3, params={'weights': Float32(weights)})

 # This is an adaptive mixed-precision (AMP) optimization, where a half
-# precision computation runs within a larger single precision program.
+# precision computation runs within a larger single-precision program.
 # Gradient scaling is required to make this numerically well-behaved.
 scaler = GradScaler()

 res = 256

 for i in tqdm(range(40000)):
 # Update network state from optimizer
-coeffs[:] = Float16(opt['coeffs'])
+weights[:] = Float16(opt['weights'])

 # Generate jittered positions on [0, 1]^2
 t = dr.arange(Float32, res)
-p = (Array2f(dr.meshgrid(t, t)) + dr.rand(Array2f, (2, res*res))) / res
+p = (Array2f(dr.meshgrid(t, t)) + dr.rand(Array2f, (2, res * res))) / res

 # Evaluate neural net + L2 loss
 img = Array3f(net(nn.CoopVec(p)))
-loss = dr.squared_norm(tex.eval(p)-img)
+loss = dr.squared_norm(tex.eval(p) - img)

 # Mixed-precision training: take suitably scaled steps
 dr.backward(scaler.scale(loss))
 scaler.step(opt)

 # Done optimizing, now let's plot the result
 t = dr.linspace(Float32, 0, 1, res)
-p= Array2f(dr.meshgrid(t, t))
+p = Array2f(dr.meshgrid(t, t))
 img = Array3f(net(nn.CoopVec(p)))
+
+# Convert 'img' with shape 3 x (N*N) into an N x N x 3 tensor
 img = dr.reshape(TensorXf(img, flip_axes=True), (res, res, 3))

 import matplotlib.pyplot as plt
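The GradScaler line in this example compensates for the limited range of half-precision gradients. Roughly, such a scaler works as sketched below; this is a simplified illustration, not the actual Dr.Jit implementation, and the optimizer interface (``opt.keys()``, ``opt.step()``) is assumed:

    import drjit as dr

    class ToyGradScaler:
        """Simplified sketch of adaptive gradient scaling (illustration only)."""

        def __init__(self, init_scale=2.0**16):
            # Large factor that keeps FP16 gradients away from underflow
            self.scale_factor = init_scale

        def scale(self, loss):
            # Enlarge the loss before dr.backward() so its gradients stay representable
            return loss * self.scale_factor

        def step(self, opt):
            # Undo the scaling on the parameter gradients, then step.
            # A real implementation also detects overflowed gradients here,
            # skips the step, and adapts self.scale_factor accordingly.
            for k in opt.keys():                  # assumed optimizer interface
                dr.set_grad(opt[k], dr.grad(opt[k]) / self.scale_factor)
            opt.step()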

docs/what.rst

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ Using Dr.Jit involves two steps:
 Perhaps the most significant difference to the majority of existing tools is
 that Dr.Jit is *not primarily* a machine learning library. While it does
 provide support for neural network :ref:`evaluation and training <neural_nets>`,
-it its sweet spot are non-neural programs characterized by *embarrassing
+its sweet spot is non-neural programs characterized by *embarrassing
 parallelism*---that is to say, programs with large data-parallel regions. A
 good example of this are `Monte Carlo
 <https://en.wikipedia.org/wiki/Monte_Carlo_method>`__ methods with their

drjit/nn.py

Lines changed: 15 additions & 5 deletions
@@ -61,6 +61,16 @@ def __call__(self, arg: CoopVec, /) -> CoopVec:
 raise NotImplementedError(f"{type(self).__name__}.__call__() implementation is missing.")

 def _alloc(self, dtype: Type[drjit.ArrayBase], size: int, /) -> Tuple[Module, int]:
+"""
+Internal method used to propagate argument sizes and allocate weight
+storage for all NN modules.
+
+The method takes two parameters as input: a weight storage type
+``dtype`` (e.g., :py:class:`drjit.cuda.ad.TensorXf16`) and ``size``,
+the number of input arguments of the module. The function returns a
+potentially new module instance with allocated weights, plus the number
+of outputs.
+"""
 return self, size

 def alloc(self, dtype: Type[drjit.ArrayBase], size: int = -1) -> Module:
@@ -110,7 +120,7 @@ def __len__(self):
 """Return the number of contained models"""
 return len(self.layers)

-def __getitem__(self, index: Union[int], /) -> Module: # type: ignore
+def __getitem__(self, index: int, /) -> Module: # type: ignore
 """Return the model at position ``index``"""
 return self.layers[index]

@@ -155,8 +165,8 @@ class LeakyReLU(Module):
 \end{cases}
 """

-DRJIT_STRUCT = { 'negative_slope': float }
-def __init__(self, negative_slope: float = 1e-2):
+DRJIT_STRUCT = { 'negative_slope': Union[float, drjit.ArrayBase] }
+def __init__(self, negative_slope: Union[float, drjit.ArrayBase] = 1e-2):
 self.negative_slope = negative_slope

 def __call__(self, arg: CoopVec, /) -> CoopVec:
@@ -449,8 +459,8 @@ def __init__(self, octaves: int = 0, shift: float = 0) -> None:
 if shift == 0:
 self.shift = None
 else:
-self.shift = (drjit.sin(shift*2*drjit.pi),
-drjit.cos(shift*2*drjit.pi))
+self.shift = (drjit.sin(shift * 2 * drjit.pi),
+drjit.cos(shift * 2 * drjit.pi))

 def _alloc(self, dtype: Type[drjit.ArrayBase], size : int = -1, /) -> Tuple[Module, int]:
 return self, size * self.octaves * 2
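To illustrate the contract that the new ``_alloc()`` docstring describes, here is a hypothetical module sketch; the class name and its behaviour are made up for illustration, and only ``Module``, ``CoopVec``, and the ``_alloc`` signature come from this file:

    from typing import Tuple, Type
    import drjit
    from drjit.nn import Module, CoopVec

    class AddConstant(Module):
        """Adds a fixed offset to each component of the input cooperative vector."""
        DRJIT_STRUCT = { 'offset': float }

        def __init__(self, offset: float = 0.0):
            self.offset = offset

        def _alloc(self, dtype: Type[drjit.ArrayBase], size: int, /) -> Tuple[Module, int]:
            # Nothing to allocate; the module produces as many outputs as inputs
            return self, size

        def __call__(self, arg: CoopVec, /) -> CoopVec:
            # Unpack into regular Dr.Jit arrays, apply the offset, re-pack
            return CoopVec(*(x + self.offset for x in arg))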

include/drjit/extra.h

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ extern DRJIT_EXTRA_EXPORT uint64_t ad_coop_vec_matvec(uint64_t A_index,
 int transpose);

 /// Cast a cooperative vector to a different precision
-extern JIT_EXPORT uint64_t ad_coop_vec_cast(uint64_t index, VarType vt);
+extern DRJIT_EXTRA_EXPORT uint64_t ad_coop_vec_cast(uint64_t index, VarType vt);

 #if defined(__cplusplus)
 }

src/python/coop_vec.cpp

Lines changed: 7 additions & 7 deletions
@@ -561,7 +561,7 @@ void export_coop_vec(nb::module_ &m) {
 coop_vector_type = nb::class_<CoopVec>(nn, "CoopVec", nb::is_generic(), nb::sig("class CoopVec(typing.Generic[T])"))
 .def(nb::init<nb::args>(),
 nb::sig("def __init__(self, *args: typing.Unpack[typing.Tuple[typing.Union[drjit.ArrayBase[SelfT, SelfCpT, ValT, ValCpT, T, PlainT, MaskT], float, int], ...]]) -> None"),
-doc_coop_CoopVec_init)
+doc_nn_CoopVec_init)
 .def("__iter__", [](const CoopVec &v) { return iter(v.expand_to_list()); },
 nb::sig("def __iter__(self, /) -> typing.Iterator[T]"))
 .def("__add__", &coop_vec_binary_op<JitOp::Add>,
@@ -587,7 +587,7 @@ void export_coop_vec(nb::module_ &m) {
 jit_var_size(v.m_index));
 });

-view_type = nb::class_<MatrixView>(nn, "MatrixView", doc_coop_MatrixView)
+view_type = nb::class_<MatrixView>(nn, "MatrixView", doc_nn_MatrixView)
 .def(nb::init<>())
 .def("__repr__", &MatrixView::repr)
 .def("__getitem__", &MatrixView::getitem,
@@ -669,12 +669,12 @@ void export_coop_vec(nb::module_ &m) {
 view_type.attr("DRJIT_STRUCT") = drjit_struct;

 nn.def("view", &view,
-doc_coop_view);
+doc_nn_view);

 nn.def("pack", [](nb::handle arg, const char *layout) { return repack("pack", layout, arg); },
 nb::arg(), "layout"_a = "inference",
 nb::sig("def pack(arg: MatrixView | drjit.AnyArray, *, layout: typing.Literal['inference', 'training'] = 'inference') -> typing.Tuple[drjit.ArrayBase, MatrixView]"),
-doc_coop_pack);
+doc_nn_pack);

 nn.def("pack",
 [](nb::args args, const char *layout) {
@@ -692,7 +692,7 @@ void export_coop_vec(nb::module_ &m) {
 nn.def("unpack", [](nb::handle arg) {
 return repack("unpack", nullptr, arg); },
 nb::sig("def unpack(arg: MatrixView | drjit.AnyArray, /) -> typing.Tuple[drjit.ArrayBase, MatrixView]"),
-doc_coop_unpack);
+doc_nn_unpack);

 nn.def("unpack",
 [](nb::args args) {
@@ -710,7 +710,7 @@ void export_coop_vec(nb::module_ &m) {
 "b"_a.noconvert() = nb::none(), "transpose"_a = false,
 nb::sig("def matvec(A: MatrixView, x: drjit.nn.CoopVec[T], b: typing.Optional[MatrixView] = "
 "None, /, transpose: bool = False) -> drjit.nn.CoopVec[T]"),
-doc_coop_matvec);
+doc_nn_matvec);

 nn.def("cast",
 [](CoopVec vec, nb::type_object_t<drjit::ArrayBase> tp) {
@@ -721,7 +721,7 @@ void export_coop_vec(nb::module_ &m) {
 return CoopVec(ad_coop_vec_cast(vec.m_index, (VarType) s.type),
 vec.m_size, new_type);
 }, nb::sig("def cast(arg0: CoopVec[T], arg1: typing.Type[ArrayT], /) -> CoopVec[ArrayT]"),
-doc_coop_cast
+doc_nn_cast
 );

 m.def("fma", &coop_vec_ternary_op<JitOp::Fma>);

src/python/docstr.rst

Lines changed: 19 additions & 16 deletions
@@ -8130,12 +8130,14 @@
 Returns:
 object: The computed array as described above

-.. topic:: coop_CoopVec
+.. topic:: nn_CoopVec

 A *cooperative vector* is a dynamically-sized container of elements of a
 consistent type. It admits both floating point and integer 1D arrays as
 elements (e.g., :py:class:`drjit.cuda.Float16`,
-:py:class:`drjit.llvm.UInt32`).
+:py:class:`drjit.llvm.UInt32`). Cooperative vectors primarily exist to
+enable the compilation of expressions that make use of matrix-vector
+multiplication.

 Seen from a high level, cooperative vectors resemble nested array types,
 such as as :py:class:`drjit.cuda.ArrayXf16`. A variety of conversions
@@ -8177,7 +8179,7 @@
 To unpack a cooperative vector into its components, use an expression
 like ``x, y, z = vec``, ``ArrayXf(vec)``, or ``list(vec)``.

-.. topic:: coop_CoopVec_init
+.. topic:: nn_CoopVec_init

 The constructor accepts a variable number of arguments including Dr.Jit
 arrays, scalar Python integers and floating point values, and :ref:`PyTrees
@@ -8188,7 +8190,7 @@
 the input contains Dr.Jit arrays of inconsistent scalar types (e.g.,
 :py:class:`drjit.cuda.Array2f` and :py:class:`drjit.cuda.UInt`).

-.. topic:: coop_MatrixView
+.. topic:: nn_MatrixView

 The :py:class:`drjit.nn.MatrixView` provides pointer into a buffer along with
 shape and type metadata.
@@ -8203,7 +8205,7 @@
 representation. The returned views can then be passed to
 :py:func:`drjit.nn.matvec()`.

-.. topic:: coop_view
+.. topic:: nn_view

 Convert a Dr.Jit array or tensor into a *view*.

@@ -8221,13 +8223,13 @@
 directly re-packed into optimal layouts without performing further
 unnecessary copies.

-.. topic:: coop_pack
+.. topic:: nn_pack

-A training-optimal layout must be used used if the program
-*backpropagates* (as in :py:func:`dr.backward*() <drjit.backward>`)
-gradients through matrix-vector products. Forward derivative propagation (as
-in :py:func:`dr.forward*() <drjit.forward>`) does not require a
-training-optimal layout.
+A training-optimal layout must be used if the program *backpropagates*
+(as in :py:func:`dr.backward*() <drjit.backward>`) gradients through
+matrix-vector products. Inference (primal evaluation) and forward derivative
+propagation (as in :py:func:`dr.forward*() <drjit.forward>`) do not
+require a training-optimal layout.

 If the input matrices are already packed in a row-major layout, call
 :py:func:`dr.nn.view() <drjit.nn.view>` to create an efficient reference
@@ -8244,7 +8246,7 @@
 mat_view[32:64, :]
 )

-.. topic:: coop_unpack
+.. topic:: nn_unpack

 The function :py:func:`dr.nn.unpack() <drjit.nn.unpack>` transforms a
 sequence (or :ref:`PyTree <pytrees>`) of vectors and optimal-layout matrices
@@ -8255,13 +8257,14 @@
 A_out, b_out = dr.nn.unpack(A_opt, b_opt)

 Note that the output of this function are (row-major) *views* into a shared
-buffer. These views can be converted back into regular tensors:
+buffer. Each view holds a reference to the shared buffer. Views can be
+converted back into regular tensors:

 .. code-block:: python

 A = TensorXf16(A)

-.. topic:: coop_matvec
+.. topic:: nn_matvec

 Evaluate a matrix-vector multiplication involving a cooperative vector.

@@ -8275,9 +8278,9 @@
 + b``). This bias vector ``b`` should also be specified as a view.

 Specify ``tranpose=True`` to multiply by the transpose of the matrix ``A``.
-On the CUDA/OptiX backend, this feature requires that ``A`` is inference
+On the CUDA/OptiX backend, this feature requires that ``A`` is in inference
 or training-optimal layout.

-.. topic:: coop_cast
+.. topic:: nn_cast

 Cast the numeric type underlying a cooperative vector
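A short sketch of the pack/view/unpack round trip these docstrings describe (hypothetical shapes; the ``drjit.cuda.ad.TensorXf16`` type and ``dr.ones`` call are assumptions):

    import drjit as dr
    import drjit.nn as nn
    from drjit.cuda.ad import TensorXf16

    # Row-major weight matrix; view() wraps it without copying
    A = dr.ones(TensorXf16, shape=(64, 32))
    A_view = nn.view(A)

    # Repack into a training-optimal layout (needed when backpropagating
    # through nn.matvec); 'inference' suffices for primal/forward evaluation
    A_buf, A_opt = nn.pack(A_view, layout='training')

    # After training, recover a row-major view and convert it back to a tensor
    _, A_rm = nn.unpack(A_opt)
    A_dense = TensorXf16(A_rm)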

src/python/eval.cpp

Lines changed: 10 additions & 0 deletions
@@ -68,6 +68,16 @@ static void make_opaque(nb::handle h) {

 ad_var_dec_ref(index_new);
 }
+
+void traverse_unknown(nb::handle h) override {
+if (h.type().is(local_type)) {
+Local & local = nb::cast<Local&>(h);
+for (uint32_t index : local.arrays())
+result |= (bool) jit_var_schedule(index);
+}
+if (h.type().is(coop_vector_type))
+nb::raise("Cooperative vectors cannot be evaluated. They must be unpacked into regular variables.");
+}
 };

 ScheduleForceCallback sfc;
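The new ``traverse_unknown()`` hook makes ``dr.schedule()``, ``dr.eval()``, and ``dr.make_opaque()`` raise on cooperative vectors; the supported pattern is to unpack them first. A minimal sketch of the Python-side usage (hypothetical example on the LLVM backend):

    import drjit as dr
    import drjit.nn as nn
    from drjit.llvm import Float16

    vec = nn.CoopVec(Float16(1), Float16(2))

    # dr.eval(vec) now raises "Cooperative vectors cannot be evaluated."
    # Unpack into regular variables before evaluating:
    x, y = vec
    dr.eval(x, y)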

src/python/tracker.cpp

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ bool VariableTracker::Impl::traverse(Context &ctx, nb::handle h) {
 ctx.label.c_str(), nb::inst_name(prev).c_str(),
 nb::type_name(tp).c_str());

-// Were there any external changes to sub-PyTree variable indices (As
+// Were there any external changes to sub-PyTree variable indices (as
 // opposed to changes done by the VariableTracker)
 bool changed = false;


tests/test_coop_vec.py

Lines changed: 7 additions & 2 deletions
@@ -6,7 +6,7 @@
 def skip_if_coopvec_not_supported(t):
 if dr.backend_v(t) == dr.JitBackend.CUDA:
 if dr.detail.cuda_version() < (12, 8):
-pytest.skip("CUDA driver does not support cooperative vectors")
+pytest.skip("CUDA driver does not support cooperative vectors (driver R570 or later is required)")

 @pytest.test_arrays('jit,float16,shape=(3, *),-diff', 'jit,float32,shape=(3, *),-diff')
 def test01_pack_unpack(t):
@@ -20,6 +20,7 @@ def test01_pack_unpack(t):
 assert len(nn.CoopVec(*x, 2, (4, 5), *x)) == 19
 y = list(x)
 z = m.ArrayXf(x)
+assert len(y) == 8 and len(z) == 8
 result_ok = True
 for i in range(8):
 result_ok &= dr.all(y[i] == i+1)
@@ -258,7 +259,7 @@ def test10_fwd_addition(t):
 def test11_bwd_mul(t):
 skip_if_coopvec_not_supported(t)

-# Propagate forward gradients through an addition
+# Propagate forward gradients through a multiplication
 a, b = t(8), t(9)
 c, d = t(3), t(2)
 dr.enable_grad(a, b, c, d)
@@ -523,5 +524,9 @@ def test19_no_eval(t):
 # Cooperative vectors cannot be evaluted via dr.eval()
 UInt32 = dr.uint32_array_t(t)
 a = nn.CoopVec(t(1), t(2))
+with pytest.raises(RuntimeError, match="Cooperative vectors cannot be evaluated"):
+dr.schedule(a)
 with pytest.raises(RuntimeError, match="Cooperative vectors cannot be evaluated"):
 dr.eval(a)
+with pytest.raises(RuntimeError, match="Cooperative vectors cannot be evaluated"):
+dr.make_opaque(a)
