diff --git a/hat/examples/matmul/src/main/java/matmul/Main.java b/hat/examples/matmul/src/main/java/matmul/Main.java
index 178d62b5cd8..31773877b6a 100644
--- a/hat/examples/matmul/src/main/java/matmul/Main.java
+++ b/hat/examples/matmul/src/main/java/matmul/Main.java
@@ -32,17 +32,17 @@
 import hat.NDRange.Local2D;
 import hat.annotations.Kernel;
 import hat.backend.Backend;
-import hat.examples.common.HATExampleException;
-import hat.examples.common.ParseArgs;
-import hat.types.F16;
 import hat.buffer.F16Array;
 import hat.buffer.F32Array;
 import hat.buffer.F32ArrayPadded;
-import hat.types.Float4;
 import hat.device.DeviceSchema;
 import hat.device.NonMappableIface;
-import optkl.ifacemapper.MappableIface.WO;
+import hat.examples.common.HATExampleException;
+import hat.examples.common.ParseArgs;
+import hat.types.F16;
+import hat.types.Float4;
 import jdk.incubator.code.Reflect;
+import optkl.ifacemapper.MappableIface.WO;
 
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
@@ -144,7 +144,6 @@ private interface MyLocalArrayFixedSize extends NonMappableIface {
         DeviceSchema<MyLocalArrayFixedSize> deviceSchema = DeviceSchema.of(MyLocalArrayFixedSize.class,
                 myPrivateArray -> myPrivateArray.array("array", 256));// It is a bound schema, so we fix the size here
 
-
         static MyLocalArrayFixedSize create(Accelerator accelerator) {
             return null;
         }
@@ -690,7 +689,7 @@ public static void matrixMultiplyKernel1DWithFunctionCalls(KernelContext kc, F32
 
     @Reflect
     public static void matrixMultiply1D(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) {
-        cc.dispatchKernel(of1D(globalSize,16),
+        cc.dispatchKernel(of1D(globalSize, 16),
                 kc -> matrixMultiplyKernel1D(kc, matrixA, matrixB, matrixC, globalSize)
         );
     }
@@ -700,21 +699,21 @@ public static void matrixMultiply1D(@RO ComputeContext cc, @RO F32Array matrixA,
 
     @Reflect
     public static void matrixMultiply1DWithFunctionCalls(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
-        cc.dispatchKernel(of1D(size,16),
+        cc.dispatchKernel(of1D(size, 16),
                 kc -> matrixMultiplyKernel1DWithFunctionCalls(kc, matrixA, matrixB, matrixC, size)
         );
     }
 
     @Reflect
     public static void matrixMultiply2D(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) {
-        cc.dispatchKernel(of2D(globalSize, globalSize,BLOCK_SIZE, BLOCK_SIZE),
+        cc.dispatchKernel(of2D(globalSize, globalSize, BLOCK_SIZE, BLOCK_SIZE),
                 kc -> matrixMultiplyKernel2D(kc, matrixA, matrixB, matrixC, globalSize)
         );
     }
 
     @Reflect
     public static void matrixMultiply2DLI(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) {
-        cc.dispatchKernel(of2D(globalSize, globalSize,BLOCK_SIZE, BLOCK_SIZE),
+        cc.dispatchKernel(of2D(globalSize, globalSize, BLOCK_SIZE, BLOCK_SIZE),
                 kc -> matrixMultiplyKernel2DLI(kc, matrixA, matrixB, matrixC, globalSize)
         );
     }
@@ -750,7 +749,7 @@ public static void matrixMultiply2DRegisterTiling(@RO ComputeContext cc, @RO F32
     public static void matrixMultiply2DRegisterTilingVectorizedAccesses(@RO ComputeContext cc, @RO F32ArrayPadded matrixA, @RO F32ArrayPadded matrixB, @WO F32ArrayPadded matrixC, int globalSize) {
         // Note: if we change the static constant BM, we also need to adapt the BM and BN within the kernel to match the same value
         int size = ceil(globalSize, BM) * BLOCK_SIZE;
-        cc.dispatchKernel(of2D(size, size ,BLOCK_SIZE, BLOCK_SIZE),
+        cc.dispatchKernel(of2D(size, size, BLOCK_SIZE, BLOCK_SIZE),
                 kc -> matrixMultiplyKernel2DRegisterTilingVectorized(kc, matrixA, matrixB, matrixC, globalSize)
         );
     }
@@ -998,7 +997,7 @@ static void main(String[] args) {
                 default -> throw new HATExampleException("Unknown configuration: " + configuration);
             }
             long end = System.nanoTime();
-            timers.add((end-start));
+            timers.add((end - start));
             if (verbose) {
                 IO.println("Elapsed Time: " + (end - start) + " ns");
             }
diff --git a/hat/examples/pom.xml b/hat/examples/pom.xml
index 1f0b9bcdacc..6839628812b 100644
--- a/hat/examples/pom.xml
+++ b/hat/examples/pom.xml
@@ -1,28 +1,30 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0">
-<!--Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
-DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+         xmlns="http://maven.apache.org/POM/4.0.0">
+    <!--Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+    DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 
-This code is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License version 2 only, as
-published by the Free Software Foundation.  Oracle designates this
-particular file as subject to the "Classpath" exception as provided
-by Oracle in the LICENSE file that accompanied this code.
+    This code is free software; you can redistribute it and/or modify it
+    under the terms of the GNU General Public License version 2 only, as
+    published by the Free Software Foundation.  Oracle designates this
+    particular file as subject to the "Classpath" exception as provided
+    by Oracle in the LICENSE file that accompanied this code.
 
-This code is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-version 2 for more details (a copy is included in the LICENSE file that
-accompanied this code).
+    This code is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    version 2 for more details (a copy is included in the LICENSE file that
+    accompanied this code).
 
-You should have received a copy of the GNU General Public License version
-2 along with this work; if not, write to the Free Software Foundation,
-Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+    You should have received a copy of the GNU General Public License version
+    2 along with this work; if not, write to the Free Software Foundation,
+    Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 
-Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
-or visit www.oracle.com if you need additional information or have any
-questions.
---><!--Auto generated by mkpoms-->
+    Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+    or visit www.oracle.com if you need additional information or have any
+    questions.
+    --><!--Auto generated by mkpoms-->
     <modelVersion>4.0.0</modelVersion>
     <packaging>pom</packaging>
     <artifactId>hat-examples</artifactId>
@@ -54,13 +56,14 @@ questions.
         <module>view</module>
         <module>life</module>
         <module>nbody</module>
+        <module>tensors</module>
         <module>experiments</module>
     </modules>
     <profiles>
         <profile>
             <id>opengl</id>
             <modules>
-               <module>nbodygl</module>
+                <module>nbodygl</module>
             </modules>
         </profile>
     </profiles>
diff --git a/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java b/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java
index 65d3c6c2fae..0ae9b694db8 100644
--- a/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java
+++ b/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java
@@ -86,5 +86,14 @@ private static void showHelp() {
 
     }
 
-    public record Options(boolean verbose, int size, int iterations, boolean skipSequential, boolean checkResult) {}
+    public record Options(boolean verbose, int size, int iterations, boolean skipSequential, boolean checkResult) {
+        /** Prints an "Options:" heading followed by one aligned "label: value" line per record component. */
+        public void printOptions() {
+            IO.println(String.join(System.lineSeparator(),
+                    "Options:",
+                    "Size           : " + size, "Iterations     : " + iterations,
+                    "Verbose        : " + verbose, "Skip Sequential: " + skipSequential,
+                    "Check Result   : " + checkResult));
+        }
+    }
 }
diff --git a/hat/examples/tensors/pom.xml b/hat/examples/tensors/pom.xml
new file mode 100644
index 00000000000..cf780132a73
--- /dev/null
+++ b/hat/examples/tensors/pom.xml
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License version 2 only, as
+published by the Free Software Foundation.  Oracle designates this
+particular file as subject to the "Classpath" exception as provided
+by Oracle in the LICENSE file that accompanied this code.
+
+This code is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+version 2 for more details (a copy is included in the LICENSE file that
+accompanied this code).
+
+You should have received a copy of the GNU General Public License version
+2 along with this work; if not, write to the Free Software Foundation,
+Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+or visit www.oracle.com if you need additional information or have any
+questions.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>oracle.code</groupId>
+    <artifactId>hat-example-tensors</artifactId>
+    <version>1.0</version>
+    <packaging>jar</packaging>
+
+    <parent>
+        <groupId>oracle.code</groupId>
+        <artifactId>hat-examples</artifactId>
+        <version>1.0</version>
+    </parent>
+    <dependencies>
+        <dependency>
+            <groupId>oracle.code</groupId>
+            <artifactId>hat-core</artifactId>
+            <version>1.0</version>
+        </dependency>
+        <dependency>
+            <groupId>oracle.code</groupId>
+            <artifactId>hat-example-shared</artifactId>
+            <version>1.0</version>
+            <scope>compile</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-antrun-plugin</artifactId>
+                <version>1.8</version>
+                <executions>
+                    <execution>
+                        <id>1</id>
+                        <phase>install</phase>
+                        <goals>
+                            <goal>run</goal>
+                        </goals>
+                        <configuration>
+                            <target>
+                                <copy file="target/${project.artifactId}-${project.version}.jar" toDir="${hat.build}"/>
+                            </target>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
diff --git a/hat/examples/tensors/src/main/java/tensors/Main.java b/hat/examples/tensors/src/main/java/tensors/Main.java
new file mode 100644
index 00000000000..769249f6023
--- /dev/null
+++ b/hat/examples/tensors/src/main/java/tensors/Main.java
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package tensors;
+
+import hat.Accelerator;
+import hat.Accelerator.Compute;
+import hat.ComputeContext;
+import hat.KernelContext;
+import hat.NDRange;
+import hat.backend.Backend;
+import hat.buffer.F16Array;
+import hat.buffer.F32Array;
+import hat.examples.common.ParseArgs;
+import hat.examples.common.ParseArgs.Options;
+import hat.types.F16;
+import hat.types.Tensor;
+import jdk.incubator.code.Reflect;
+import optkl.ifacemapper.MappableIface;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.stream.IntStream;
+
+import static hat.NDRange.Global2D;
+import static hat.NDRange.Local2D;
+import static hat.NDRange.NDRange2D;
+import static hat.NDRange.Warp2D;
+import static hat.NDRange.of2D;
+import static hat.examples.common.StatUtils.dumpStatsToCSVFile;
+
+/**
+ * Example to test the Tensor/Tile API vs the Thread API.
+ *
+ * <p>
+ * How to run? We recommend running this example with the CUDA backend to be
+ * able to generate tensor operations.
+ * <code>java @hat/run ffi-cuda tensors --iterations=100 --verbose --size=2048</code>
+ * OpenCL:
+ * <code>java @hat/run ffi-opencl tensors --iterations=100 --verbose --size=2048</code>
+ * </p>
+ */
+public class Main {
+    /** Tensor kernel: accumulates Tensor.mma products along the shared dimension, then stores the accumulator into matrixC. */
+    @Reflect
+    public static void mxmTensorsCM(@MappableIface.RO KernelContext kc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int size) {
+        final int WMMA_M = 16;
+        final int WMMA_N = 16;
+        final int WMMA_K = 16;
+        int warpM = kc.gix / kc.wrs; // NOTE(review): wrs is presumably the warp size - confirm against KernelContext
+        int warpN = kc.giy;
+        // All three matrices are addressed with the same leading dimension, size.
+        final int lda = size;
+        final int ldb = size;
+        final int ldc = size;
+        // Operand fragments and accumulator; shape(16, 16, 16) presumably mirrors WMMA_M/N/K - TODO confirm.
+        Tensor tensorA = Tensor.create(Tensor.FIRST, Tensor.shape(16, 16, 16), F16.class, Tensor.ofColumnMajor());
+        Tensor tensorB = Tensor.create(Tensor.SECOND, Tensor.shape(16, 16, 16), F16.class, Tensor.ofColumnMajor());
+        Tensor acc = Tensor.create(Tensor.ACC, Tensor.shape(16, 16, 16), float.class);
+        // The accumulator is zeroed once and then carried through every mma call below.
+        Tensor.fill(acc, 0.0f);
+
+        for (int i = 0; i < size; i += WMMA_K) {
+            int aRow = warpM * WMMA_M;
+            int aCol = i;
+
+            int bRow = i;
+            int bCol = warpN * WMMA_N;
+            // Skip this step unless all four offsets are below the leading dimension.
+            if (aRow < lda && aCol < lda && bRow < ldb && bCol < ldb) {
+
+                tensorA = Tensor.load(matrixA, aRow, aCol, lda);
+                tensorB = Tensor.load(matrixB, bRow, bCol, ldb);
+
+                // acc = tensorA * tensorB + acc
+                Tensor.mma(acc, tensorA, tensorB, acc);
+            }
+        }
+        int cRow = warpM * WMMA_M;
+        int cCol = warpN * WMMA_N;
+        Tensor.store(matrixC, cRow, cCol, acc, ldc, Tensor.ofColumnMajor());
+    }
+
+    @Reflect
+    public static void mxmTensorsCM(@MappableIface.RO ComputeContext cc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) {
+        var ndRange = NDRange2D.of(Global2D.of(globalSize, globalSize), // global range: globalSize x globalSize
+                Local2D.of(128, 4), // local range: 128 x 4
+                NDRange.Tile2D.of(16, 16), // tile: 16 x 16, the same numbers as WMMA_M / WMMA_N in the kernel
+                Warp2D.of(true, false)); // NOTE(review): the meaning of the two flags is defined by Warp2D - confirm there
+        cc.dispatchKernel(ndRange, kc -> mxmTensorsCM(kc, matrixA, matrixB, matrixC, globalSize)); // runs the KernelContext overload over ndRange
+    }
+
+    @Reflect
+    public static void mxmNaiveF32(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
+        if (kc.gix < kc.gsx && kc.giy < kc.gsy) { // guard: both indices must be below gsx / gsy
+            float acc = 0.0f; // running sum for the single matrixC element written below
+            for (int k = 0; k < size; k++) {
+                acc += (matrixA.array(k * size + kc.giy) * matrixB.array(kc.gix * size + k)); // same indexing as runSequentialColumnMajor with i = giy, j = gix
+            }
+            matrixC.array(kc.gix * size + kc.giy, acc);
+        }
+    }
+
+    @Reflect
+    public static void mxmNaiveF32(@MappableIface.RO ComputeContext cc, @MappableIface.RO F32Array matrixA, @MappableIface.RO F32Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) {
+        cc.dispatchKernel(of2D(globalSize, globalSize, 16, 16), // NOTE(review): 16 x 16 is presumably the local size - confirm against NDRange.of2D
+                kc -> mxmNaiveF32(kc, matrixA, matrixB, matrixC, globalSize) // forwards to the KernelContext overload
+        );
+    }
+
+    @Reflect
+    public static void mxmNaiveF16(@MappableIface.RO KernelContext kc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int size) {
+        if (kc.gix < kc.gsx && kc.giy < kc.gsy) { // guard: both indices must be below gsx / gsy
+            float acc = 0.0f; // the sum is kept as a float even though the inputs are F16
+            for (int k = 0; k < size; k++) {
+                F16 ha = matrixA.array(k * size + kc.giy);
+                F16 hb = matrixB.array(kc.gix * size + k);
+                F16 hc = F16.mul(ha, hb); // the product is formed in F16 ...
+                float fc = F16.f16ToFloat(hc); // ... and converted to float before it is added
+                acc += fc;
+            }
+            matrixC.array(kc.gix * size + kc.giy, acc);
+        }
+
+    }
+
+    @Reflect
+    public static void mxmNaiveF16(@MappableIface.RO ComputeContext cc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) {
+        cc.dispatchKernel(of2D(globalSize, globalSize, 16, 16), // NOTE(review): 16 x 16 is presumably the local size - confirm against NDRange.of2D
+                kc -> mxmNaiveF16(kc, matrixA, matrixB, matrixC, globalSize) // forwards to the KernelContext overload
+        );
+    }
+
+    private static void runSequentialColumnMajor(F32Array matrixA, F32Array matrixB, F32Array matrixC, final int size) {
+        for (int i = 0; i < size; i++) {
+            for (int j = 0; j < size; j++) {
+                float sum = 0.0f;
+                for (int k = 0; k < size; k++) {
+                    float a = matrixA.array((long) k * size + i);
+                    float b = matrixB.array((long) j * size + k);
+                    sum += (a * b);
+                }
+                matrixC.array((long) j * size + i, sum);
+            }
+        }
+    }
+
+    private static void runMultiThreadedWithStreamsColumnMajor(F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
+        IntStream.range(0, size)
+                .parallel()
+                .forEach(i -> IntStream.range(0, size)
+                        .parallel()
+                        .forEach(j -> {
+                            float sum = 0.0f;
+                            for (int k = 0; k < size; k++) {
+                                sum += matrixA.array(k * size + i) * matrixB.array(j * size + k);
+                            }
+                            matrixC.array(j * size + i, sum);
+                        }));
+    }
+
+    private static boolean checkResult(F32Array reference, F32Array output, int size) {
+        // Compares the first size * size elements in index order; returns false at the first mismatch.
+        for (int idx = 0; idx < size * size; idx++) {
+            final float expected = reference.array(idx);
+            final float got = output.array(idx);
+            // Negated "<=": a NaN difference fails the check instead of slipping through a ">" test.
+            if (!(Math.abs(expected - got) <= 0.1f)) {
+                IO.println("[Error] GOT: " + got + " - but expected: " + expected);
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private static void printResult(String version, boolean check) {
+        if (check) {
+            IO.println("Result-" + version + " is correct!");
+        } else {
+            IO.println("Result-" + version + " is wrong!");
+        }
+    }
+
+    static void runBenchmark(Options options) {
+        final int size = options.size();
+        final int numIterations = options.iterations();
+        // Times every variant numIterations times, optionally validates the outputs, then writes the timings to a CSV file.
+        options.printOptions();
+        // One list of elapsed nanoseconds per measured variant.
+        List<Long> timersJava = new ArrayList<>();
+        List<Long> timersParallelStreams = new ArrayList<>();
+        List<Long> timersHATNaiveF32 = new ArrayList<>();
+        List<Long> timersHATNaiveF16 = new ArrayList<>();
+        List<Long> timersHATTensors = new ArrayList<>();
+
+        var accelerator = new Accelerator(MethodHandles.lookup(), Backend.FIRST);
+        // F16 and F32 copies of both inputs, plus one F32 output buffer of size * size elements per variant.
+        F16Array matrixAHalf = F16Array.create(accelerator, size * size);
+        F16Array matrixBHalf = F16Array.create(accelerator, size * size);
+        F32Array matrixA = F32Array.create(accelerator, size * size);
+        F32Array matrixB = F32Array.create(accelerator, size * size);
+        F32Array resultNativeF32 = F32Array.create(accelerator, size * size);
+        F32Array resultNativeF16 = F32Array.create(accelerator, size * size);
+        F32Array resultTensor = F32Array.create(accelerator, size * size);
+        F32Array matrixReference = F32Array.create(accelerator, size * size);
+        F32Array resultStreams = F32Array.create(accelerator, size * size);
+        // Fixed seed, so every run uses the same inputs; the F16 buffers receive the converted value of the same floats.
+        Random r = new Random(19);
+        for (int j = 0; j < matrixAHalf.length(); j++) {
+            float a = r.nextFloat();
+            float b = r.nextFloat();
+            matrixAHalf.array(j).value(F16.floatToF16(a).value());
+            matrixA.array(j, a);
+            matrixBHalf.array(j).value(F16.floatToF16(b).value());
+            matrixB.array(j, b);
+        }
+
+        // Java Sequential (fills matrixReference, which the result check further down compares against)
+        if (!options.skipSequential()) {
+            for (int i = 0; i < numIterations; i++) {
+                long start = System.nanoTime();
+                runSequentialColumnMajor(matrixA, matrixB, matrixReference, size);
+                long end = System.nanoTime();
+                if (options.verbose()) {
+                    IO.println("Java Seq Timer: " + (end - start));
+                }
+                timersJava.add((end - start));
+            }
+        }
+
+        // Java Parallel Streams
+        for (int i = 0; i < numIterations; i++) {
+            long start = System.nanoTime();
+            runMultiThreadedWithStreamsColumnMajor(matrixA, matrixB, resultStreams, size);
+            long end = System.nanoTime();
+            if (options.verbose()) {
+                IO.println("Java Parallel-Stream Timer: " + (end - start));
+            }
+            timersParallelStreams.add((end - start));
+        }
+
+        // HAT Parallel Naive F32
+        for (int i = 0; i < numIterations; i++) {
+            long start = System.nanoTime();
+            accelerator.compute((@Reflect Compute) cc -> mxmNaiveF32(cc, matrixA, matrixB, resultNativeF32, size));
+            long end = System.nanoTime();
+            if (options.verbose()) {
+                IO.println("HAT GPU-Naive-F32 Timer: " + (end - start));
+            }
+            timersHATNaiveF32.add((end - start));
+        }
+
+        // HAT Parallel Naive F16
+        for (int i = 0; i < numIterations; i++) {
+            long start = System.nanoTime();
+            accelerator.compute((@Reflect Compute) cc -> mxmNaiveF16(cc, matrixAHalf, matrixBHalf, resultNativeF16, size));
+            long end = System.nanoTime();
+            if (options.verbose()) {
+                IO.println("HAT GPU-Naive-F16 Timer: " + (end - start));
+            }
+            timersHATNaiveF16.add((end - start));
+        }
+
+        // HAT Parallel Tensor
+        for (int i = 0; i < numIterations; i++) {
+            long start = System.nanoTime();
+            accelerator.compute((@Reflect Compute) cc -> mxmTensorsCM(cc, matrixAHalf, matrixBHalf, resultTensor, size));
+            long end = System.nanoTime();
+            if (options.verbose()) {
+                IO.println("HAT GPU-Tensors Timer: " + (end - start));
+            }
+            timersHATTensors.add((end - start));
+        }
+        // The check compares against matrixReference, so it only runs when the sequential version was executed.
+        if (options.checkResult() && !options.skipSequential()) {
+            printResult("streams", checkResult(matrixReference, resultStreams, size));
+            printResult("HAT-NaiveF32", checkResult(matrixReference, resultNativeF32, size));
+            printResult("HAT-NaiveF16", checkResult(matrixReference, resultNativeF16, size));
+            printResult("HAT-Tensors", checkResult(matrixReference, resultTensor, size));
+        }
+
+        // Write CSV table for all the results (the Java sequential column is left out when that version was skipped)
+        List<List<Long>> timers = options.skipSequential() ?
+                List.of(timersParallelStreams, timersHATNaiveF32, timersHATNaiveF16, timersHATTensors) :
+                List.of(timersJava, timersParallelStreams, timersHATNaiveF32, timersHATNaiveF16, timersHATTensors);
+
+        List<String> headers = options.skipSequential() ?
+                List.of("Java-streams-fp32-" + size, "HAT-naive-fp32-" + size, "HAT-naive-fp16-" + size, "HAT-tensors-fp16-" + size) :
+                List.of("Java-fp32-" + size, "Java-streams-fp32-" + size, "HAT-naive-fp32-" + size, "HAT-naive-fp16-" + size, "HAT-tensors-fp16-" + size);
+
+        final String tableName = "table-tensors-" + size + ".csv";
+        dumpStatsToCSVFile(timers, headers, tableName);
+    }
+
+    static void main(String[] args) {
+        IO.println("Example of Matmul with Tensors");
+        // Values handed to parseWithDefaults for the matrix size and the iteration count.
+        final int defaultSize = 1024;
+        final int defaultIterations = 100;
+        Options options = new ParseArgs(args).parseWithDefaults(defaultSize, defaultIterations);
+        final int size = options.size();
+
+        // Reject sizes that are not a multiple of 16 or are below 128 (128 itself is accepted).
+        if (size % 16 != 0 || size < 128) {
+            throw new IllegalArgumentException("Input size must be a multiple of 16 and at least 128, but got: " + size);
+        }
+        runBenchmark(options);
+    }
+}
diff --git a/hat/hat.java b/hat/hat.java
index bcc8c4a5b54..9a201be2073 100644
--- a/hat/hat.java
+++ b/hat/hat.java
@@ -167,6 +167,7 @@ public static void main(String[] argArr) throws IOException, InterruptedExceptio
         var example_dft = hat.jar("example{s}-dft", core, example_shared);
         var example_fft = hat.jar("example{s}-fft", core, example_shared);
         var example_matmul = hat.jar("example{s}-matmul", core, example_shared);
+        var example_tensors = hat.jar("example{s}-tensors", core, example_shared);
 
         // These examples use example_shared, so they are UI based
         var example_mandel = hat.jar("example{s}-mandel", example_shared);
@@ -199,6 +200,7 @@ public static void main(String[] argArr) throws IOException, InterruptedExceptio
            example_squares,
            example_matmul,
            example_flash_attention,
+           example_tensors,
            example_blackscholes,
            example_view,
            example_normmap,
diff --git a/hat/hat/bld.java b/hat/hat/bld.java
index 90600d97d82..4a525286035 100644
--- a/hat/hat/bld.java
+++ b/hat/hat/bld.java
@@ -29,118 +29,118 @@
 
 void main(String[] args) {
     var layout = """
-       └──./
-           ├──hat                                  //  All build scripts in each case 'foo' has java options for (and points to) 'foo.java'
-           │    ├──bld                             //  --enable-preview --source 26 hat/bld.java
-           │    ├──bld.java
-           │    ├──run                             //  --enable-preview --source 26 hat/run.java
-           │    ├──run.java
-           │    └──Script                          //  Contains all the tools for building
-           ├──build/                               // All jars, native libs and executables
-           │    ├── cmake-build-debug/             // All intermediate cmake artifacts
-           │    ├── hat-wrap-*-1.0.jar             // Wrapper jars around extracted * (opencl, glwrap, opencl)
-           │    ├── hat-core-1.0.jar               // Base hat jar
-           │    ├── hat-example-*-1.0.jar          // Example jars (hat-example-nbody-1.0.jar, hat-example-life-1.0.jar)
-           │    ├── hat-extracted-opencl-1.0.jar   // Raw extraction jars (hat-extracted-opencl-1.0.jar ....)
-           │    ├── lib*_backend.[dylib|so]        // ffi library backends
-           │    └── *(no suffix)                   // various generated executables (opencl_info, cuda_info, cuda_squares)
-           ├──extractions/
-           │   ├──CMakeFiles.txt
-           │   ├── opencl/
-           │   │   └──CMakeFiles.txt
-           │   ├── cuda/
-           │   │   └──CMakeFiles.txt
-           │   └── opengl/
-           │       └──CMakeFiles.txt
-           ├──wraps/
-           │   ├──shared/
-           │   │   └──src/main/java
-           │   ├──cuda/
-           │   │   └──src/main/java
-           │   ├──opencl/
-           │   │   └──src/main/java
-           │   └──opengl/
-           │       └──src/main/java
-           │
-           ├──core
-           │    ├──src/main/java
-           │    └──src/main/test
-           │
-           ├──tools  : core
-           │    ├──src/main/java
-           │    └──src/main/test
-           │
-           ├──backends
-           │    ├──java
-           │    │    ├──mt
-           │    │    │    ├──src/main/java
-           │    │    │    └──src/main/resources
-           │    │    └──seq
-           │    │         ├──src/main/java
-           │    │         └──src/main/resources
-           │    ├──jextracted
-           │    │    └──opencl
-           │    │         ├──src/main/java
-           │    │         ├──src/main/native
-           │    │         └──src/main/resources
-           │    └──ffi
-           │         ├──CMakeFiles.txt
-           │         ├──opencl
-           │         │    ├──CMakeFiles.txt
-           │         │    ├──src/main/java
-           │         │    ├──src/main/native
-           │         │    └──src/main/resources
-           │         ├──cuda
-           │         │    ├──CMakeFiles.txt
-           │         │    ├──src/main/java
-           │         │    ├──src/main/native
-           │         │    └──src/main/resources
-           │         ├──mock
-           │         │    ├──CMakeFiles.txt
-           │         │    ├──src/main/java
-           │         │    ├──src/main/native
-           │         │    └──src/main/resources
-           │         ├──spirv
-           │         │    ├──CMakeFiles.txt
-           │         │    ├──src/main/java
-           │         │    ├──src/main/native
-           │         │    └──src/main/resources
-           │         └──hip
-           │              ├──CMakeFiles.txt
-           │              ├──src/main/java
-           │              ├──src/main/native
-           │              └──src/main/resources
-           │
-           └──examples
-                ├──shared
+            └──./
+                ├──hat                                  //  All build scripts in each case 'foo' has java options for (and points to) 'foo.java'
+                │    ├──bld                             //  --enable-preview --source 26 hat/bld.java
+                │    ├──bld.java
+                │    ├──run                             //  --enable-preview --source 26 hat/run.java
+                │    ├──run.java
+                │    └──Script                          //  Contains all the tools for building
+                ├──build/                               // All jars, native libs and executables
+                │    ├── cmake-build-debug/             // All intermediate cmake artifacts
+                │    ├── hat-wrap-*-1.0.jar             // Wrapper jars around extracted * (opencl, glwrap, opencl)
+                │    ├── hat-core-1.0.jar               // Base hat jar
+                │    ├── hat-example-*-1.0.jar          // Example jars (hat-example-nbody-1.0.jar, hat-example-life-1.0.jar)
+                │    ├── hat-extracted-opencl-1.0.jar   // Raw extraction jars (hat-extracted-opencl-1.0.jar ....)
+                │    ├── lib*_backend.[dylib|so]        // ffi library backends
+                │    └── *(no suffix)                   // various generated executables (opencl_info, cuda_info, cuda_squares)
+                ├──extractions/
+                │   ├──CMakeFiles.txt
+                │   ├── opencl/
+                │   │   └──CMakeFiles.txt
+                │   ├── cuda/
+                │   │   └──CMakeFiles.txt
+                │   └── opengl/
+                │       └──CMakeFiles.txt
+                ├──wraps/
+                │   ├──shared/
+                │   │   └──src/main/java
+                │   ├──cuda/
+                │   │   └──src/main/java
+                │   ├──opencl/
+                │   │   └──src/main/java
+                │   └──opengl/
+                │       └──src/main/java
+                │
+                ├──core
                 │    ├──src/main/java
-                │    └──src/main/resources
-                ├──mandel
+                │    └──src/main/test
+                │
+                ├──tools  : core
                 │    ├──src/main/java
-                │    └──src/main/resources
-                ├──squares
-                │    ├──src/main/java
-                │    └──src/main/resources
-                ├──heal
-                │    ├──src/main/java
-                │    └──src/main/resources
-                ├──life
-                │    ├──src/main/java
-                │    └──src/main/resources
-                ├──nbody
-                │    ├──src/main/java
-                │    └──src/main/resources
-                ├──experiments
-                │    ├──src/main/java
-                │    └──src/main/resources
-                ├──violajones
-                │    ├──src/main/java
-                │    └──src/main/resources
-                └──matmul
-                     ├──src/main/java
-                     └──src/main/resources
-       """;
-    class Artifacts{
+                │    └──src/main/test
+                │
+                ├──backends
+                │    ├──java
+                │    │    ├──mt
+                │    │    │    ├──src/main/java
+                │    │    │    └──src/main/resources
+                │    │    └──seq
+                │    │         ├──src/main/java
+                │    │         └──src/main/resources
+                │    ├──jextracted
+                │    │    └──opencl
+                │    │         ├──src/main/java
+                │    │         ├──src/main/native
+                │    │         └──src/main/resources
+                │    └──ffi
+                │         ├──CMakeFiles.txt
+                │         ├──opencl
+                │         │    ├──CMakeFiles.txt
+                │         │    ├──src/main/java
+                │         │    ├──src/main/native
+                │         │    └──src/main/resources
+                │         ├──cuda
+                │         │    ├──CMakeFiles.txt
+                │         │    ├──src/main/java
+                │         │    ├──src/main/native
+                │         │    └──src/main/resources
+                │         ├──mock
+                │         │    ├──CMakeFiles.txt
+                │         │    ├──src/main/java
+                │         │    ├──src/main/native
+                │         │    └──src/main/resources
+                │         ├──spirv
+                │         │    ├──CMakeFiles.txt
+                │         │    ├──src/main/java
+                │         │    ├──src/main/native
+                │         │    └──src/main/resources
+                │         └──hip
+                │              ├──CMakeFiles.txt
+                │              ├──src/main/java
+                │              ├──src/main/native
+                │              └──src/main/resources
+                │
+                └──examples
+                     ├──shared
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──mandel
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──squares
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──heal
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──life
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──nbody
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──experiments
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     ├──violajones
+                     │    ├──src/main/java
+                     │    └──src/main/resources
+                     └──matmul
+                          ├──src/main/java
+                          └──src/main/resources
+            """;
+    class Artifacts {
         static Script.MavenStyleProject core;
         static Script.MavenStyleProject tools;
         static Script.MavenStyleProject tests;
@@ -182,16 +182,16 @@ class Artifacts{
 
     var extractionsCmakeBuildDir = extractionsDir.buildDir("cmake-build-debug");
     if (!extractionsCmakeBuildDir.exists()) {
-        Script.cmake($ -> $ .verbose(false) .source_dir(extractionsDir) .build_dir(extractionsCmakeBuildDir));
+        Script.cmake($ -> $.verbose(false).source_dir(extractionsDir).build_dir(extractionsCmakeBuildDir));
     }
-    Script.cmake($ -> $ .build(extractionsCmakeBuildDir) .target("extract"));
+    Script.cmake($ -> $.build(extractionsCmakeBuildDir).target("extract"));
 
     var extraction_opencl_dir = extractionsDir.dir("opencl");
     if (extraction_opencl_dir.dir("src").exists()) {
         Artifacts.extraction_opencl = buildDir.mavenStyleBuild(
                 extraction_opencl_dir, "hat-extracted-opencl-1.0.jar"
         );
-    }else{
+    } else {
         print("no src for extraction_opencl");
     }
 
@@ -200,7 +200,7 @@ class Artifacts{
         Artifacts.extraction_opengl = buildDir.mavenStyleBuild(
                 extraction_opengl_dir, "hat-extracted-opengl-1.0.jar"
         );
-    }else{
+    } else {
         print("no src for extraction_opengl");
     }
 
@@ -211,13 +211,12 @@ class Artifacts{
         );
     }
 
-
     var wrapsDir = dir.existingDir("wraps");
 
-    Artifacts.wrap_shared = buildDir.mavenStyleBuild( wrapsDir.existingDir("shared"), "hat-wrap-shared-1.0.jar");
+    Artifacts.wrap_shared = buildDir.mavenStyleBuild(wrapsDir.existingDir("shared"), "hat-wrap-shared-1.0.jar");
 
-    if (Artifacts.extraction_opencl != null){
-        Artifacts.wrap_opencl = buildDir.mavenStyleBuild( wrapsDir.dir("opencl"), "hat-wrap-opencl-1.0.jar", Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opencl);
+    if (Artifacts.extraction_opencl != null) {
+        Artifacts.wrap_opencl = buildDir.mavenStyleBuild(wrapsDir.dir("opencl"), "hat-wrap-opencl-1.0.jar", Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opencl);
     }
 // on jetson
 // ls extractions/opengl/src/main/java/opengl/glutKeyboardFunc*
@@ -234,21 +233,21 @@ class Artifacts{
         String exclude = null;
         if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$func.class$")).isEmpty()) {
             exclude = "Callback";
-        }else if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$callback.class$")).isEmpty()) {
+        } else if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$callback.class$")).isEmpty()) {
             exclude = "Func";
-        }else {
+        } else {
             println("We can't build wrap_opengl unless exclude one of GLFuncEventHandler or GLCallbackEventHandler something");
         }
         if (exclude != null) {
-            final var excludeMeSigh = "^.*/GL"+exclude+"EventHandler\\.java$";
-            println("exclude ="+exclude+" "+excludeMeSigh);
+            final var excludeMeSigh = "^.*/GL" + exclude + "EventHandler\\.java$";
+            println("exclude =" + exclude + " " + excludeMeSigh);
             Artifacts.wrap_opengl = Script.mavenStyleProject(
                     buildDir, wrapsDir.dir("opengl"), buildDir.jarFile("hat-wrap-opengl-1.0.jar"), Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opengl
             ).buildExcluding(javaSrc -> javaSrc.matches(excludeMeSigh));
         }
     }
 
-    if (false && Artifacts.extraction_cuda != null ) {
+    if (false && Artifacts.extraction_cuda != null) {
         Artifacts.wrap_cuda = buildDir.mavenStyleBuild(
                 wrapsDir.dir("cuda"), "hat-wrap-cuda-1.0.jar", Artifacts.extraction_cuda
         );
@@ -261,20 +260,20 @@ class Artifacts{
             ffiBackendsDir.existingDir("shared"), "hat-backend-ffi-shared-1.0.jar", Artifacts.core
     );
 
-    if (ffiBackendsDir.optionalDir("opencl") instanceof Script.DirEntry ffiBackendDir ) {
+    if (ffiBackendsDir.optionalDir("opencl") instanceof Script.DirEntry ffiBackendDir) {
         Artifacts.backend_ffi_opencl = buildDir.mavenStyleBuild(
-                ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
+                ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
         );
     }
     if (ffiBackendsDir.optionalDir("mock") instanceof Script.DirEntry ffiBackendDir) {
         Artifacts.backend_ffi_mock = buildDir.mavenStyleBuild(
-                ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
+                ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
         );
     }
 
     if (ffiBackendsDir.optionalDir("cuda") instanceof Script.DirEntry ffiBackendDir) {
         Artifacts.backend_ffi_cuda = buildDir.mavenStyleBuild(
-                ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
+                ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared
         );
     }
 
@@ -299,28 +298,27 @@ class Artifacts{
     }
 
     var javaBackendsDir = backendsDir.existingDir("java");
-    Artifacts.backend_java_mt =  buildDir.mavenStyleBuild(javaBackendsDir.existingDir("mt"),
+    Artifacts.backend_java_mt = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("mt"),
             "hat-backend-java-mt-1.0.jar", Artifacts.core
     );
-    Artifacts.backend_java_seq =  buildDir.mavenStyleBuild(javaBackendsDir.existingDir("seq"),
+    Artifacts.backend_java_seq = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("seq"),
             "hat-backend-java-seq-1.0.jar", Artifacts.core
     );
 
     var examplesDir = dir.existingDir("examples");
 
-
-    Stream.of( "blackscholes", "squares", "matmul", "flashattention")
+    Stream.of("blackscholes", "squares", "matmul", "flashattention", "tensors")
             .parallel()
             .map(examplesDir::existingDir)
-            .forEach(exampleDir->buildDir.mavenStyleBuild(
-                    exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar", Artifacts.core
+            .forEach(exampleDir -> buildDir.mavenStyleBuild(
+                    exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar", Artifacts.core
             ));
 
-    Stream.of( "experiments")   // this has hardcoded references to opencl backend
+    Stream.of("experiments")   // this has hardcoded references to opencl backend
             .parallel()
             .map(examplesDir::existingDir)
-            .forEach(exampleDir->buildDir.mavenStyleBuild(
-                    exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar",
+            .forEach(exampleDir -> buildDir.mavenStyleBuild(
+                    exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar",
                     Artifacts.core, Artifacts.backend_ffi_shared, Artifacts.backend_ffi_opencl
             ));
 
@@ -328,11 +326,11 @@ class Artifacts{
             examplesDir.existingDir("shared"), "hat-example-shared-1.0.jar", Artifacts.core
     );
 
-    Stream.of( "heal", "life", "mandel", "violajones")   // these require example_shared ui stuff
+    Stream.of("heal", "life", "mandel", "violajones")   // these require example_shared ui stuff
             .parallel()
             .map(examplesDir::existingDir)
-            .forEach(exampleDir->buildDir.mavenStyleBuild(
-                    exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar", Artifacts.core, Artifacts.example_shared
+            .forEach(exampleDir -> buildDir.mavenStyleBuild(
+                    exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.example_shared
             ));
 
     var nbodyDependencies = new Script.MavenStyleProject[]{
@@ -347,14 +345,14 @@ class Artifacts{
 
     boolean foundNull = false;
 
-    for (var o:nbodyDependencies){
-        if (o == null){
+    for (var o : nbodyDependencies) {
+        if (o == null) {
             foundNull = true;
         }
     }
-    if (foundNull){
+    if (foundNull) {
         print("incomplete nbody dependencies ");
-    }else {
+    } else {
         Artifacts.example_nbody = buildDir.mavenStyleBuild(
                 examplesDir.existingDir("nbody"), "hat-example-nbody-1.0.jar", nbodyDependencies
         );
@@ -362,9 +360,9 @@ class Artifacts{
 
     var cmakeBuildDir = buildDir.buildDir("cmake-build-debug");
     if (!cmakeBuildDir.exists()) {
-        Script.cmake($ -> $ .verbose(false) .source_dir(ffiBackendsDir) .build_dir(cmakeBuildDir) .copy_to(buildDir));
+        Script.cmake($ -> $.verbose(false).source_dir(ffiBackendsDir).build_dir(cmakeBuildDir).copy_to(buildDir));
     }
-    Script.cmake($ -> $ .build(cmakeBuildDir));
+    Script.cmake($ -> $.build(cmakeBuildDir));
 
 }
 
diff --git a/hat/hat/job.jar b/hat/hat/job.jar
index 911febffcb4..43aa878ad92 100644
Binary files a/hat/hat/job.jar and b/hat/hat/job.jar differ
diff --git a/hat/pom.xml b/hat/pom.xml
index ef1f45dc593..cdc7bf9bbb2 100644
--- a/hat/pom.xml
+++ b/hat/pom.xml
@@ -45,6 +45,7 @@ questions.
     <module>extractions</module>
     <module>wraps</module>
     <module>tests</module>
+    <module>examples/tensors</module>
   </modules>
   <build>
     <plugins>