diff --git a/hat/examples/matmul/src/main/java/matmul/Main.java b/hat/examples/matmul/src/main/java/matmul/Main.java index 178d62b5cd8..31773877b6a 100644 --- a/hat/examples/matmul/src/main/java/matmul/Main.java +++ b/hat/examples/matmul/src/main/java/matmul/Main.java @@ -32,17 +32,17 @@ import hat.NDRange.Local2D; import hat.annotations.Kernel; import hat.backend.Backend; -import hat.examples.common.HATExampleException; -import hat.examples.common.ParseArgs; -import hat.types.F16; import hat.buffer.F16Array; import hat.buffer.F32Array; import hat.buffer.F32ArrayPadded; -import hat.types.Float4; import hat.device.DeviceSchema; import hat.device.NonMappableIface; -import optkl.ifacemapper.MappableIface.WO; +import hat.examples.common.HATExampleException; +import hat.examples.common.ParseArgs; +import hat.types.F16; +import hat.types.Float4; import jdk.incubator.code.Reflect; +import optkl.ifacemapper.MappableIface.WO; import java.lang.invoke.MethodHandles; import java.util.ArrayList; @@ -144,7 +144,6 @@ private interface MyLocalArrayFixedSize extends NonMappableIface { DeviceSchema deviceSchema = DeviceSchema.of(MyLocalArrayFixedSize.class, myPrivateArray -> myPrivateArray.array("array", 256));// It is a bound schema, so we fix the size here - static MyLocalArrayFixedSize create(Accelerator accelerator) { return null; } @@ -690,7 +689,7 @@ public static void matrixMultiplyKernel1DWithFunctionCalls(KernelContext kc, F32 @Reflect public static void matrixMultiply1D(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) { - cc.dispatchKernel(of1D(globalSize,16), + cc.dispatchKernel(of1D(globalSize, 16), kc -> matrixMultiplyKernel1D(kc, matrixA, matrixB, matrixC, globalSize) ); } @@ -700,21 +699,21 @@ public static void matrixMultiply1D(@RO ComputeContext cc, @RO F32Array matrixA, @Reflect public static void matrixMultiply1DWithFunctionCalls(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array 
matrixC, int size) { - cc.dispatchKernel(of1D(size,16), + cc.dispatchKernel(of1D(size, 16), kc -> matrixMultiplyKernel1DWithFunctionCalls(kc, matrixA, matrixB, matrixC, size) ); } @Reflect public static void matrixMultiply2D(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) { - cc.dispatchKernel(of2D(globalSize, globalSize,BLOCK_SIZE, BLOCK_SIZE), + cc.dispatchKernel(of2D(globalSize, globalSize, BLOCK_SIZE, BLOCK_SIZE), kc -> matrixMultiplyKernel2D(kc, matrixA, matrixB, matrixC, globalSize) ); } @Reflect public static void matrixMultiply2DLI(@RO ComputeContext cc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int globalSize) { - cc.dispatchKernel(of2D(globalSize, globalSize,BLOCK_SIZE, BLOCK_SIZE), + cc.dispatchKernel(of2D(globalSize, globalSize, BLOCK_SIZE, BLOCK_SIZE), kc -> matrixMultiplyKernel2DLI(kc, matrixA, matrixB, matrixC, globalSize) ); } @@ -750,7 +749,7 @@ public static void matrixMultiply2DRegisterTiling(@RO ComputeContext cc, @RO F32 public static void matrixMultiply2DRegisterTilingVectorizedAccesses(@RO ComputeContext cc, @RO F32ArrayPadded matrixA, @RO F32ArrayPadded matrixB, @WO F32ArrayPadded matrixC, int globalSize) { // Note: if we change the static constant BM, we also need to adapt the BM and BN within the kernel to match the same value int size = ceil(globalSize, BM) * BLOCK_SIZE; - cc.dispatchKernel(of2D(size, size ,BLOCK_SIZE, BLOCK_SIZE), + cc.dispatchKernel(of2D(size, size, BLOCK_SIZE, BLOCK_SIZE), kc -> matrixMultiplyKernel2DRegisterTilingVectorized(kc, matrixA, matrixB, matrixC, globalSize) ); } @@ -998,7 +997,7 @@ static void main(String[] args) { default -> throw new HATExampleException("Unknown configuration: " + configuration); } long end = System.nanoTime(); - timers.add((end-start)); + timers.add((end - start)); if (verbose) { IO.println("Elapsed Time: " + (end - start) + " ns"); } diff --git a/hat/examples/pom.xml b/hat/examples/pom.xml index 
1f0b9bcdacc..6839628812b 100644 --- a/hat/examples/pom.xml +++ b/hat/examples/pom.xml @@ -1,28 +1,30 @@ - - + Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + or visit www.oracle.com if you need additional information or have any + questions. + --> 4.0.0 pom hat-examples @@ -54,13 +56,14 @@ questions. view life nbody + tensors experiments opengl - nbodygl + nbodygl diff --git a/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java b/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java index 65d3c6c2fae..0ae9b694db8 100644 --- a/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java +++ b/hat/examples/shared/src/main/java/hat/examples/common/ParseArgs.java @@ -86,5 +86,14 @@ private static void showHelp() { } - public record Options(boolean verbose, int size, int iterations, boolean skipSequential, boolean checkResult) {} + public record Options(boolean verbose, int size, int iterations, boolean skipSequential, boolean checkResult) { + public void printOptions() { + IO.println("Options:"); + IO.println("Size : " + size); + IO.println("Iterations : " + iterations); + IO.println("Verbose : " + verbose); + IO.println("Skip Sequential: " + skipSequential); + IO.println("Check Result : " + checkResult); + } + } } diff --git a/hat/examples/tensors/pom.xml b/hat/examples/tensors/pom.xml new file mode 100644 index 00000000000..cf780132a73 --- /dev/null +++ b/hat/examples/tensors/pom.xml @@ -0,0 +1,78 @@ + + + + 4.0.0 + oracle.code + hat-example-tensors + 1.0 + jar + + + oracle.code + hat-examples + 1.0 + + + + oracle.code + hat-core + 1.0 + + + oracle.code + hat-example-shared + 1.0 + compile + + + + + + + org.apache.maven.plugins + maven-antrun-plugin + 1.8 + + + 1 + install + + run + + + + + + + + + + + + + diff --git a/hat/examples/tensors/src/main/java/tensors/Main.java b/hat/examples/tensors/src/main/java/tensors/Main.java new file mode 100644 index 00000000000..769249f6023 --- /dev/null +++ 
b/hat/examples/tensors/src/main/java/tensors/Main.java @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package tensors; + +import hat.Accelerator; +import hat.Accelerator.Compute; +import hat.ComputeContext; +import hat.KernelContext; +import hat.NDRange; +import hat.backend.Backend; +import hat.buffer.F16Array; +import hat.buffer.F32Array; +import hat.examples.common.ParseArgs; +import hat.examples.common.ParseArgs.Options; +import hat.types.F16; +import hat.types.Tensor; +import jdk.incubator.code.Reflect; +import optkl.ifacemapper.MappableIface; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.stream.IntStream; + +import static hat.NDRange.Global2D; +import static hat.NDRange.Local2D; +import static hat.NDRange.NDRange2D; +import static hat.NDRange.Warp2D; +import static hat.NDRange.of2D; +import static hat.examples.common.StatUtils.dumpStatsToCSVFile; + +/** + * Example to test the Tensor/Tile API vs the Thread API. + * + *

+ * How to run? We recommend running this example with the CUDA backend to be + * able to generate tensor operations. + * java @hat/run ffi-cuda tensors --iterations=100 --verbose --size=2048 + * OpenCL: + * java @hat/run ffi-opencl tensors --iterations=100 --verbose --size=2048 + *

+ */ +public class Main { + + @Reflect + public static void mxmTensorsCM(@MappableIface.RO KernelContext kc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int size) { + final int WMMA_M = 16; + final int WMMA_N = 16; + final int WMMA_K = 16; + int warpM = kc.gix / kc.wrs; + int warpN = kc.giy; + + final int lda = size; + final int ldb = size; + final int ldc = size; + + Tensor tensorA = Tensor.create(Tensor.FIRST, Tensor.shape(16, 16, 16), F16.class, Tensor.ofColumnMajor()); + Tensor tensorB = Tensor.create(Tensor.SECOND, Tensor.shape(16, 16, 16), F16.class, Tensor.ofColumnMajor()); + Tensor acc = Tensor.create(Tensor.ACC, Tensor.shape(16, 16, 16), float.class); + + Tensor.fill(acc, 0.0f); + + for (int i = 0; i < size; i += WMMA_K) { + int aRow = warpM * WMMA_M; + int aCol = i; + + int bRow = i; + int bCol = warpN * WMMA_N; + + if (aRow < lda && aCol < lda && bRow < ldb && bCol < ldb) { + + tensorA = Tensor.load(matrixA, aRow, aCol, lda); + tensorB = Tensor.load(matrixB, bRow, bCol, ldb); + + // acc = tensorA * tensorB + acc + Tensor.mma(acc, tensorA, tensorB, acc); + } + } + int cRow = warpM * WMMA_M; + int cCol = warpN * WMMA_N; + Tensor.store(matrixC, cRow, cCol, acc, ldc, Tensor.ofColumnMajor()); + } + + @Reflect + public static void mxmTensorsCM(@MappableIface.RO ComputeContext cc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) { + var ndRange = NDRange2D.of(Global2D.of(globalSize, globalSize), + Local2D.of(128, 4), + NDRange.Tile2D.of(16, 16), + Warp2D.of(true, false)); + cc.dispatchKernel(ndRange, kc -> mxmTensorsCM(kc, matrixA, matrixB, matrixC, globalSize)); + } + + @Reflect + public static void mxmNaiveF32(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) { + if (kc.gix < kc.gsx && kc.giy < kc.gsy) { + float acc = 0.0f; + for (int k = 0; k < size; k++) { + acc += (matrixA.array(k * size 
+ kc.giy) * matrixB.array(kc.gix * size + k)); + } + matrixC.array(kc.gix * size + kc.giy, acc); + } + } + + @Reflect + public static void mxmNaiveF32(@MappableIface.RO ComputeContext cc, @MappableIface.RO F32Array matrixA, @MappableIface.RO F32Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) { + cc.dispatchKernel(of2D(globalSize, globalSize, 16, 16), + kc -> mxmNaiveF32(kc, matrixA, matrixB, matrixC, globalSize) + ); + } + + @Reflect + public static void mxmNaiveF16(@MappableIface.RO KernelContext kc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int size) { + if (kc.gix < kc.gsx && kc.giy < kc.gsy) { + float acc = 0.0f; + for (int k = 0; k < size; k++) { + F16 ha = matrixA.array(k * size + kc.giy); + F16 hb = matrixB.array(kc.gix * size + k); + F16 hc = F16.mul(ha, hb); + float fc = F16.f16ToFloat(hc); + acc += fc; + } + matrixC.array(kc.gix * size + kc.giy, acc); + } + + } + + @Reflect + public static void mxmNaiveF16(@MappableIface.RO ComputeContext cc, @MappableIface.RO F16Array matrixA, @MappableIface.RO F16Array matrixB, @MappableIface.WO F32Array matrixC, int globalSize) { + cc.dispatchKernel(of2D(globalSize, globalSize, 16, 16), + kc -> mxmNaiveF16(kc, matrixA, matrixB, matrixC, globalSize) + ); + } + + private static void runSequentialColumnMajor(F32Array matrixA, F32Array matrixB, F32Array matrixC, final int size) { + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + float sum = 0.0f; + for (int k = 0; k < size; k++) { + float a = matrixA.array((long) k * size + i); + float b = matrixB.array((long) j * size + k); + sum += (a * b); + } + matrixC.array((long) j * size + i, sum); + } + } + } + + private static void runMultiThreadedWithStreamsColumnMajor(F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) { + IntStream.range(0, size) + .parallel() + .forEach(i -> IntStream.range(0, size) + .parallel() + .forEach(j -> { + float sum = 0.0f; + for 
(int k = 0; k < size; k++) { + sum += matrixA.array(k * size + i) * matrixB.array(j * size + k); + } + matrixC.array(j * size + i, sum); + })); + } + + private static boolean checkResult(F32Array reference, F32Array output, int size) { + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + final float expected = reference.array(i * size + j); + final float got = output.array(i * size + j); + if (Math.abs(expected - got) > 0.1f) { + IO.println("[Error] GOT: " + got + " - but expected: " + expected); + return false; + } + } + } + return true; + } + + private static void printResult(String version, boolean check) { + if (check) { + IO.println("Result-" + version + " is correct!"); + } else { + IO.println("Result-" + version + " is wrong!"); + } + } + + static void runBenchmark(Options options) { + final int size = options.size(); + final int numIterations = options.iterations(); + + options.printOptions(); + + List<Long> timersJava = new ArrayList<>(); + List<Long> timersParallelStreams = new ArrayList<>(); + List<Long> timersHATNaiveF32 = new ArrayList<>(); + List<Long> timersHATNaiveF16 = new ArrayList<>(); + List<Long> timersHATTensors = new ArrayList<>(); + + var accelerator = new Accelerator(MethodHandles.lookup(), Backend.FIRST); + + F16Array matrixAHalf = F16Array.create(accelerator, size * size); + F16Array matrixBHalf = F16Array.create(accelerator, size * size); + F32Array matrixA = F32Array.create(accelerator, size * size); + F32Array matrixB = F32Array.create(accelerator, size * size); + F32Array resultNativeF32 = F32Array.create(accelerator, size * size); + F32Array resultNativeF16 = F32Array.create(accelerator, size * size); + F32Array resultTensor = F32Array.create(accelerator, size * size); + F32Array matrixReference = F32Array.create(accelerator, size * size); + F32Array resultStreams = F32Array.create(accelerator, size * size); + + Random r = new Random(19); + for (int j = 0; j < matrixAHalf.length(); j++) { + float a = r.nextFloat(); + float b = r.nextFloat(); + 
matrixAHalf.array(j).value(F16.floatToF16(a).value()); + matrixA.array(j, a); + matrixBHalf.array(j).value(F16.floatToF16(b).value()); + matrixB.array(j, b); + } + + // Java Sequential + if (!options.skipSequential()) { + for (int i = 0; i < numIterations; i++) { + long start = System.nanoTime(); + runSequentialColumnMajor(matrixA, matrixB, matrixReference, size); + long end = System.nanoTime(); + if (options.verbose()) { + IO.println("Java Seq Timer: " + (end - start)); + } + timersJava.add((end - start)); + } + } + + // Java Parallel Streams + for (int i = 0; i < numIterations; i++) { + long start = System.nanoTime(); + runMultiThreadedWithStreamsColumnMajor(matrixA, matrixB, resultStreams, size); + long end = System.nanoTime(); + if (options.verbose()) { + IO.println("Java Parallel-Stream Timer: " + (end - start)); + } + timersParallelStreams.add((end - start)); + } + + // HAT Parallel Naive F32 + for (int i = 0; i < numIterations; i++) { + long start = System.nanoTime(); + accelerator.compute((@Reflect Compute) cc -> mxmNaiveF32(cc, matrixA, matrixB, resultNativeF32, size)); + long end = System.nanoTime(); + if (options.verbose()) { + IO.println("HAT GPU-Naive-F32 Timer: " + (end - start)); + } + timersHATNaiveF32.add((end - start)); + } + + // HAT Parallel Naive F16 + for (int i = 0; i < numIterations; i++) { + long start = System.nanoTime(); + accelerator.compute((@Reflect Compute) cc -> mxmNaiveF16(cc, matrixAHalf, matrixBHalf, resultNativeF16, size)); + long end = System.nanoTime(); + if (options.verbose()) { + IO.println("HAT GPU-Naive-F16 Timer: " + (end - start)); + } + timersHATNaiveF16.add((end - start)); + } + + // HAT Parallel Tensor + for (int i = 0; i < numIterations; i++) { + long start = System.nanoTime(); + accelerator.compute((@Reflect Compute) cc -> mxmTensorsCM(cc, matrixAHalf, matrixBHalf, resultTensor, size)); + long end = System.nanoTime(); + if (options.verbose()) { + IO.println("HAT GPU-Tensors Timer: " + (end - start)); + } + 
timersHATTensors.add((end - start)); + } + + if (options.checkResult() && !options.skipSequential()) { + printResult("streams", checkResult(matrixReference, resultStreams, size)); + printResult("HAT-NaiveF32", checkResult(matrixReference, resultNativeF32, size)); + printResult("HAT-NaiveF16", checkResult(matrixReference, resultNativeF16, size)); + printResult("HAT-Tensors", checkResult(matrixReference, resultTensor, size)); + } + + // Write CSV table for all the results + List<List<Long>> timers = options.skipSequential() ? + List.of(timersParallelStreams, timersHATNaiveF32, timersHATNaiveF16, timersHATTensors) : + List.of(timersJava, timersParallelStreams, timersHATNaiveF32, timersHATNaiveF16, timersHATTensors); + + List<String> headers = options.skipSequential() ? + List.of("Java-streams-fp32-" + size, "HAT-naive-fp32-" + size, "HAT-naive-fp16-" + size, "HAT-tensors-fp16-" + size) : + List.of("Java-fp32-" + size, "Java-streams-fp32-" + size, "HAT-naive-fp32-" + size, "HAT-naive-fp16-" + size, "HAT-tensors-fp16-" + size); + + final String tableName = "table-tensors-" + size + ".csv"; + dumpStatsToCSVFile(timers, headers, tableName); + } + + static void main(String[] args) { + IO.println("Example of Matmul with Tensors"); + + final int defaultSize = 1024; + int numIterations = 100; + ParseArgs parseArgs = new ParseArgs(args); + Options options = parseArgs.parseWithDefaults(defaultSize, numIterations); + + // check input size + if (options.size() % 16 != 0 || options.size() < 128) { + throw new RuntimeException("Input size must be a multiple of 16, and at least 128"); + } + runBenchmark(options); + } +} diff --git a/hat/hat.java b/hat/hat.java index bcc8c4a5b54..9a201be2073 100644 --- a/hat/hat.java +++ b/hat/hat.java @@ -167,6 +167,7 @@ public static void main(String[] argArr) throws IOException, InterruptedExceptio var example_dft = hat.jar("example{s}-dft", core, example_shared); var example_fft = hat.jar("example{s}-fft", core, example_shared); var example_matmul = 
hat.jar("example{s}-matmul", core, example_shared); + var example_tensors = hat.jar("example{s}-tensors", core, example_shared); // These examples use example_shared, so they are UI based var example_mandel = hat.jar("example{s}-mandel", example_shared); @@ -199,6 +200,7 @@ public static void main(String[] argArr) throws IOException, InterruptedExceptio example_squares, example_matmul, example_flash_attention, + example_tensors, example_blackscholes, example_view, example_normmap, diff --git a/hat/hat/bld.java b/hat/hat/bld.java index 90600d97d82..4a525286035 100644 --- a/hat/hat/bld.java +++ b/hat/hat/bld.java @@ -29,118 +29,118 @@ void main(String[] args) { var layout = """ - └──./ - ├──hat // All build scripts in each case 'foo' has java options for (and points to) 'foo.java' - │ ├──bld // --enable-preview --source 26 hat/bld.java - │ ├──bld.java - │ ├──run // --enable-preview --source 26 hat/run.java - │ ├──run.java - │ └──Script // Contains all the tools for building - ├──build/ // All jars, native libs and executables - │ ├── cmake-build-debug/ // All intermediate cmake artifacts - │ ├── hat-wrap-*-1.0.jar // Wrapper jars around extracted * (opencl, glwrap, opencl) - │ ├── hat-core-1.0.jar // Base hat jar - │ ├── hat-example-*-1.0.jar // Example jars (hat-example-nbody-1.0.jar, hat-example-life-1.0.jar) - │ ├── hat-extracted-opencl-1.0.jar // Raw extraction jars (hat-extracted-opencl-1.0.jar ....) 
- │ ├── lib*_backend.[dylib|so] // ffi library backends - │ └── *(no suffix) // various generated executables (opencl_info, cuda_info, cuda_squares) - ├──extractions/ - │ ├──CMakeFiles.txt - │ ├── opencl/ - │ │ └──CMakeFiles.txt - │ ├── cuda/ - │ │ └──CMakeFiles.txt - │ └── opengl/ - │ └──CMakeFiles.txt - ├──wraps/ - │ ├──shared/ - │ │ └──src/main/java - │ ├──cuda/ - │ │ └──src/main/java - │ ├──opencl/ - │ │ └──src/main/java - │ └──opengl/ - │ └──src/main/java - │ - ├──core - │ ├──src/main/java - │ └──src/main/test - │ - ├──tools : core - │ ├──src/main/java - │ └──src/main/test - │ - ├──backends - │ ├──java - │ │ ├──mt - │ │ │ ├──src/main/java - │ │ │ └──src/main/resources - │ │ └──seq - │ │ ├──src/main/java - │ │ └──src/main/resources - │ ├──jextracted - │ │ └──opencl - │ │ ├──src/main/java - │ │ ├──src/main/native - │ │ └──src/main/resources - │ └──ffi - │ ├──CMakeFiles.txt - │ ├──opencl - │ │ ├──CMakeFiles.txt - │ │ ├──src/main/java - │ │ ├──src/main/native - │ │ └──src/main/resources - │ ├──cuda - │ │ ├──CMakeFiles.txt - │ │ ├──src/main/java - │ │ ├──src/main/native - │ │ └──src/main/resources - │ ├──mock - │ │ ├──CMakeFiles.txt - │ │ ├──src/main/java - │ │ ├──src/main/native - │ │ └──src/main/resources - │ ├──spirv - │ │ ├──CMakeFiles.txt - │ │ ├──src/main/java - │ │ ├──src/main/native - │ │ └──src/main/resources - │ └──hip - │ ├──CMakeFiles.txt - │ ├──src/main/java - │ ├──src/main/native - │ └──src/main/resources - │ - └──examples - ├──shared + └──./ + ├──hat // All build scripts in each case 'foo' has java options for (and points to) 'foo.java' + │ ├──bld // --enable-preview --source 26 hat/bld.java + │ ├──bld.java + │ ├──run // --enable-preview --source 26 hat/run.java + │ ├──run.java + │ └──Script // Contains all the tools for building + ├──build/ // All jars, native libs and executables + │ ├── cmake-build-debug/ // All intermediate cmake artifacts + │ ├── hat-wrap-*-1.0.jar // Wrapper jars around extracted * (opencl, glwrap, opencl) + │ ├── 
hat-core-1.0.jar // Base hat jar + │ ├── hat-example-*-1.0.jar // Example jars (hat-example-nbody-1.0.jar, hat-example-life-1.0.jar) + │ ├── hat-extracted-opencl-1.0.jar // Raw extraction jars (hat-extracted-opencl-1.0.jar ....) + │ ├── lib*_backend.[dylib|so] // ffi library backends + │ └── *(no suffix) // various generated executables (opencl_info, cuda_info, cuda_squares) + ├──extractions/ + │ ├──CMakeFiles.txt + │ ├── opencl/ + │ │ └──CMakeFiles.txt + │ ├── cuda/ + │ │ └──CMakeFiles.txt + │ └── opengl/ + │ └──CMakeFiles.txt + ├──wraps/ + │ ├──shared/ + │ │ └──src/main/java + │ ├──cuda/ + │ │ └──src/main/java + │ ├──opencl/ + │ │ └──src/main/java + │ └──opengl/ + │ └──src/main/java + │ + ├──core │ ├──src/main/java - │ └──src/main/resources - ├──mandel + │ └──src/main/test + │ + ├──tools : core │ ├──src/main/java - │ └──src/main/resources - ├──squares - │ ├──src/main/java - │ └──src/main/resources - ├──heal - │ ├──src/main/java - │ └──src/main/resources - ├──life - │ ├──src/main/java - │ └──src/main/resources - ├──nbody - │ ├──src/main/java - │ └──src/main/resources - ├──experiments - │ ├──src/main/java - │ └──src/main/resources - ├──violajones - │ ├──src/main/java - │ └──src/main/resources - └──matmul - ├──src/main/java - └──src/main/resources - """; - class Artifacts{ + │ └──src/main/test + │ + ├──backends + │ ├──java + │ │ ├──mt + │ │ │ ├──src/main/java + │ │ │ └──src/main/resources + │ │ └──seq + │ │ ├──src/main/java + │ │ └──src/main/resources + │ ├──jextracted + │ │ └──opencl + │ │ ├──src/main/java + │ │ ├──src/main/native + │ │ └──src/main/resources + │ └──ffi + │ ├──CMakeFiles.txt + │ ├──opencl + │ │ ├──CMakeFiles.txt + │ │ ├──src/main/java + │ │ ├──src/main/native + │ │ └──src/main/resources + │ ├──cuda + │ │ ├──CMakeFiles.txt + │ │ ├──src/main/java + │ │ ├──src/main/native + │ │ └──src/main/resources + │ ├──mock + │ │ ├──CMakeFiles.txt + │ │ ├──src/main/java + │ │ ├──src/main/native + │ │ └──src/main/resources + │ ├──spirv + │ │ ├──CMakeFiles.txt + │ │ 
├──src/main/java + │ │ ├──src/main/native + │ │ └──src/main/resources + │ └──hip + │ ├──CMakeFiles.txt + │ ├──src/main/java + │ ├──src/main/native + │ └──src/main/resources + │ + └──examples + ├──shared + │ ├──src/main/java + │ └──src/main/resources + ├──mandel + │ ├──src/main/java + │ └──src/main/resources + ├──squares + │ ├──src/main/java + │ └──src/main/resources + ├──heal + │ ├──src/main/java + │ └──src/main/resources + ├──life + │ ├──src/main/java + │ └──src/main/resources + ├──nbody + │ ├──src/main/java + │ └──src/main/resources + ├──experiments + │ ├──src/main/java + │ └──src/main/resources + ├──violajones + │ ├──src/main/java + │ └──src/main/resources + └──matmul + ├──src/main/java + └──src/main/resources + """; + class Artifacts { static Script.MavenStyleProject core; static Script.MavenStyleProject tools; static Script.MavenStyleProject tests; @@ -182,16 +182,16 @@ class Artifacts{ var extractionsCmakeBuildDir = extractionsDir.buildDir("cmake-build-debug"); if (!extractionsCmakeBuildDir.exists()) { - Script.cmake($ -> $ .verbose(false) .source_dir(extractionsDir) .build_dir(extractionsCmakeBuildDir)); + Script.cmake($ -> $.verbose(false).source_dir(extractionsDir).build_dir(extractionsCmakeBuildDir)); } - Script.cmake($ -> $ .build(extractionsCmakeBuildDir) .target("extract")); + Script.cmake($ -> $.build(extractionsCmakeBuildDir).target("extract")); var extraction_opencl_dir = extractionsDir.dir("opencl"); if (extraction_opencl_dir.dir("src").exists()) { Artifacts.extraction_opencl = buildDir.mavenStyleBuild( extraction_opencl_dir, "hat-extracted-opencl-1.0.jar" ); - }else{ + } else { print("no src for extraction_opencl"); } @@ -200,7 +200,7 @@ class Artifacts{ Artifacts.extraction_opengl = buildDir.mavenStyleBuild( extraction_opengl_dir, "hat-extracted-opengl-1.0.jar" ); - }else{ + } else { print("no src for extraction_opengl"); } @@ -211,13 +211,12 @@ class Artifacts{ ); } - var wrapsDir = dir.existingDir("wraps"); - Artifacts.wrap_shared = 
buildDir.mavenStyleBuild( wrapsDir.existingDir("shared"), "hat-wrap-shared-1.0.jar"); + Artifacts.wrap_shared = buildDir.mavenStyleBuild(wrapsDir.existingDir("shared"), "hat-wrap-shared-1.0.jar"); - if (Artifacts.extraction_opencl != null){ - Artifacts.wrap_opencl = buildDir.mavenStyleBuild( wrapsDir.dir("opencl"), "hat-wrap-opencl-1.0.jar", Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opencl); + if (Artifacts.extraction_opencl != null) { + Artifacts.wrap_opencl = buildDir.mavenStyleBuild(wrapsDir.dir("opencl"), "hat-wrap-opencl-1.0.jar", Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opencl); } // on jetson // ls extractions/opengl/src/main/java/opengl/glutKeyboardFunc* @@ -234,21 +233,21 @@ class Artifacts{ String exclude = null; if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$func.class$")).isEmpty()) { exclude = "Callback"; - }else if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$callback.class$")).isEmpty()) { + } else if (!Artifacts.extraction_opengl.jarFile.select(Script.Regex.of("^.*glutKeyboardFunc\\$callback.class$")).isEmpty()) { exclude = "Func"; - }else { + } else { println("We can't build wrap_opengl unless exclude one of GLFuncEventHandler or GLCallbackEventHandler something"); } if (exclude != null) { - final var excludeMeSigh = "^.*/GL"+exclude+"EventHandler\\.java$"; - println("exclude ="+exclude+" "+excludeMeSigh); + final var excludeMeSigh = "^.*/GL" + exclude + "EventHandler\\.java$"; + println("exclude =" + exclude + " " + excludeMeSigh); Artifacts.wrap_opengl = Script.mavenStyleProject( buildDir, wrapsDir.dir("opengl"), buildDir.jarFile("hat-wrap-opengl-1.0.jar"), Artifacts.wrap_shared, Artifacts.core, Artifacts.extraction_opengl ).buildExcluding(javaSrc -> javaSrc.matches(excludeMeSigh)); } } - if (false && Artifacts.extraction_cuda != null ) { + if (false && Artifacts.extraction_cuda != null) { Artifacts.wrap_cuda = 
buildDir.mavenStyleBuild( wrapsDir.dir("cuda"), "hat-wrap-cuda-1.0.jar", Artifacts.extraction_cuda ); @@ -261,20 +260,20 @@ class Artifacts{ ffiBackendsDir.existingDir("shared"), "hat-backend-ffi-shared-1.0.jar", Artifacts.core ); - if (ffiBackendsDir.optionalDir("opencl") instanceof Script.DirEntry ffiBackendDir ) { + if (ffiBackendsDir.optionalDir("opencl") instanceof Script.DirEntry ffiBackendDir) { Artifacts.backend_ffi_opencl = buildDir.mavenStyleBuild( - ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared + ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared ); } if (ffiBackendsDir.optionalDir("mock") instanceof Script.DirEntry ffiBackendDir) { Artifacts.backend_ffi_mock = buildDir.mavenStyleBuild( - ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared + ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared ); } if (ffiBackendsDir.optionalDir("cuda") instanceof Script.DirEntry ffiBackendDir) { Artifacts.backend_ffi_cuda = buildDir.mavenStyleBuild( - ffiBackendDir, "hat-backend-ffi-"+ffiBackendDir.fileName()+ "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared + ffiBackendDir, "hat-backend-ffi-" + ffiBackendDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared ); } @@ -299,28 +298,27 @@ class Artifacts{ } var javaBackendsDir = backendsDir.existingDir("java"); - Artifacts.backend_java_mt = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("mt"), + Artifacts.backend_java_mt = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("mt"), "hat-backend-java-mt-1.0.jar", Artifacts.core ); - Artifacts.backend_java_seq = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("seq"), + Artifacts.backend_java_seq = buildDir.mavenStyleBuild(javaBackendsDir.existingDir("seq"), 
"hat-backend-java-seq-1.0.jar", Artifacts.core ); var examplesDir = dir.existingDir("examples"); - - Stream.of( "blackscholes", "squares", "matmul", "flashattention") + Stream.of("blackscholes", "squares", "matmul", "flashattention", "tensors") .parallel() .map(examplesDir::existingDir) - .forEach(exampleDir->buildDir.mavenStyleBuild( - exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar", Artifacts.core + .forEach(exampleDir -> buildDir.mavenStyleBuild( + exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar", Artifacts.core )); - Stream.of( "experiments") // this has hardcoded references to opencl backend + Stream.of("experiments") // this has hardcoded references to opencl backend .parallel() .map(examplesDir::existingDir) - .forEach(exampleDir->buildDir.mavenStyleBuild( - exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar", + .forEach(exampleDir -> buildDir.mavenStyleBuild( + exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.backend_ffi_shared, Artifacts.backend_ffi_opencl )); @@ -328,11 +326,11 @@ class Artifacts{ examplesDir.existingDir("shared"), "hat-example-shared-1.0.jar", Artifacts.core ); - Stream.of( "heal", "life", "mandel", "violajones") // these require example_shared ui stuff + Stream.of("heal", "life", "mandel", "violajones") // these require example_shared ui stuff .parallel() .map(examplesDir::existingDir) - .forEach(exampleDir->buildDir.mavenStyleBuild( - exampleDir, "hat-example-"+exampleDir.fileName()+"-1.0.jar", Artifacts.core, Artifacts.example_shared + .forEach(exampleDir -> buildDir.mavenStyleBuild( + exampleDir, "hat-example-" + exampleDir.fileName() + "-1.0.jar", Artifacts.core, Artifacts.example_shared )); var nbodyDependencies = new Script.MavenStyleProject[]{ @@ -347,14 +345,14 @@ class Artifacts{ boolean foundNull = false; - for (var o:nbodyDependencies){ - if (o == null){ + for (var o : nbodyDependencies) { + if (o == null) { foundNull = true; } } - if (foundNull){ + 
if (foundNull) { print("incomplete nbody dependencies "); - }else { + } else { Artifacts.example_nbody = buildDir.mavenStyleBuild( examplesDir.existingDir("nbody"), "hat-example-nbody-1.0.jar", nbodyDependencies ); @@ -362,9 +360,9 @@ class Artifacts{ var cmakeBuildDir = buildDir.buildDir("cmake-build-debug"); if (!cmakeBuildDir.exists()) { - Script.cmake($ -> $ .verbose(false) .source_dir(ffiBackendsDir) .build_dir(cmakeBuildDir) .copy_to(buildDir)); + Script.cmake($ -> $.verbose(false).source_dir(ffiBackendsDir).build_dir(cmakeBuildDir).copy_to(buildDir)); } - Script.cmake($ -> $ .build(cmakeBuildDir)); + Script.cmake($ -> $.build(cmakeBuildDir)); } diff --git a/hat/hat/job.jar b/hat/hat/job.jar index 911febffcb4..43aa878ad92 100644 Binary files a/hat/hat/job.jar and b/hat/hat/job.jar differ diff --git a/hat/pom.xml b/hat/pom.xml index ef1f45dc593..cdc7bf9bbb2 100644 --- a/hat/pom.xml +++ b/hat/pom.xml @@ -45,6 +45,7 @@ questions. extractions wraps tests + examples/tensors