diff --git a/CMakeLists.txt b/CMakeLists.txt index 7177846ce..2ccbfb694 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF) option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF) if(NOT MLX_VERSION) - set(MLX_VERSION 0.17.0) + set(MLX_VERSION 0.17.1) endif() # --------------------- Processor tests ------------------------- diff --git a/mlx/backend/metal/reduce.cpp b/mlx/backend/metal/reduce.cpp index 2c8f18430..95f6e2eff 100644 --- a/mlx/backend/metal/reduce.cpp +++ b/mlx/backend/metal/reduce.cpp @@ -308,7 +308,11 @@ void all_reduce_dispatch( compute_encoder.dispatchThreads(grid_dims, group_dims); // 2nd pass - compute_encoder->setComputePipelineState(kernel); + std::ostringstream kname_2nd_pass; + kname_2nd_pass << "all_reduce_" << op_name << type_to_name(intermediate); + auto kernel_2nd_pass = + get_reduce_kernel(d, kname_2nd_pass.str(), op_name, intermediate, out); + compute_encoder->setComputePipelineState(kernel_2nd_pass); size_t intermediate_size = n_rows; grid_dims = MTL::Size(threadgroup_2nd_pass, 1, 1); group_dims = MTL::Size(threadgroup_2nd_pass, 1, 1); diff --git a/python/tests/test_reduce.py b/python/tests/test_reduce.py index f47a357dd..c59e7f490 100644 --- a/python/tests/test_reduce.py +++ b/python/tests/test_reduce.py @@ -124,6 +124,13 @@ def test_edge_case(self): z = np.array(x).sum((0, 2, 3)) self.assertTrue(np.all(z == y)) + def test_sum_bool(self): + x = np.random.uniform(0, 1, size=(10, 10, 10)) > 0.5 + y = mx.array(x) + npsum = x.sum().item() + mxsum = y.sum().item() + self.assertEqual(npsum, mxsum) + if __name__ == "__main__": unittest.main(failfast=True) diff --git a/setup.py b/setup.py index 3a2fbac3b..410ae262e 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ def run(self) -> None: setup( name="mlx", - version=get_version("0.17.0"), + version=get_version("0.17.1"), author="MLX Contributors", author_email="mlx@group.apple.com", description="A framework for machine learning on Apple silicon.",