rbrchen
diff --git a/hat/backends/ffi/shared/src/main/native/cpp/shared.cpp b/hat/backends/ffi/shared/src/main/native/cpp/shared.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/hat/core/src/main/java/hat/BufferTagger.java b/hat/core/src/main/java/hat/BufferTagger.java
Lines changed: 15 additions & 15 deletions
diff --git a/hat/core/src/main/java/hat/buffer/ArgArray.java b/hat/core/src/main/java/hat/buffer/ArgArray.java
Lines changed: 2 additions & 6 deletions
diff --git a/hat/examples/blackscholes/src/main/java/blackscholes/Main.java b/hat/examples/blackscholes/src/main/java/blackscholes/Main.java
Lines changed: 6 additions & 7 deletions
diff --git a/hat/examples/dft/src/main/java/dft/Main.java b/hat/examples/dft/src/main/java/dft/Main.java
Lines changed: 2 additions & 2 deletions
diff --git a/hat/examples/flashattention/src/main/java/flashattention/Main.java b/hat/examples/flashattention/src/main/java/flashattention/Main.java
Lines changed: 10 additions & 10 deletions
diff --git a/hat/examples/heal/src/main/java/heal/ComputeHeal.java b/hat/examples/heal/src/main/java/heal/ComputeHeal.java
Lines changed: 11 additions & 11 deletions
diff --git a/hat/examples/life/src/main/java/life/Main.java b/hat/examples/life/src/main/java/life/Main.java
Lines changed: 3 additions & 3 deletions
diff --git a/hat/examples/mandel/src/main/java/mandel/Main.java b/hat/examples/mandel/src/main/java/mandel/Main.java
Lines changed: 1 addition & 1 deletion
diff --git a/hat/examples/matmul/src/main/java/matmul/Main.java b/hat/examples/matmul/src/main/java/matmul/Main.java
Lines changed: 8 additions & 8 deletions
@@ -403,7 +403,7 @@ long Backend::CompilationUnit::Kernel::ndrange(void *argArray) {
             }
 
             auto *buffer = static_cast<Buffer *>(bufferState->vendorPtr);
-            if (kernelWroteToThisArg || compilationUnit->backend->config->alwaysCopy) {
+            if (kernelWroteToThisArg && compilationUnit->backend->config->alwaysCopy) {
                 compilationUnit->backend->queue->copyFromDevice(buffer);
                 bufferState->state = BufferState::HOST_OWNED;
                 if (compilationUnit->backend->config->traceCopies || compilationUnit->backend->config->traceEnqueues) {
 
@@ -25,6 +25,7 @@
 
 package hat;
 
+import hat.phases.HATPhaseUtils;
 import jdk.incubator.code.dialect.java.JavaOp;
 import optkl.IfaceValue;
 import optkl.OpHelper;
@@ -147,23 +148,22 @@ private static void mapBranch(MethodHandles.Lookup lookup, Block.Reference block
         }
     }
 
-    // retrieves "root" value of an op, the origin of the parameter (or value) used by the op
+    // retrieves "root" value of an op, which is how we track accesses
+    // we will map the return value of this method to the accessType
     private  static Value getRootValue(Op op) {
-        if (op.operands().isEmpty()) {
-            return op.result();
-        } else if (op.operands().getFirst() instanceof Block.Parameter param) {
-            return param;
-        }
+        // the op is a field load, an invoke, or something that reduces to one or the other
+        // first, check if we can retrieve a fieldloadop from the given op
+        Op fieldOp = HATPhaseUtils.findOpInResultFromFirstOperandsOrNull(op, JavaOp.FieldAccessOp.FieldLoadOp.class);
+        if (fieldOp != null) return fieldOp.operands().getFirst(); // if so, we use its first operand to map to accesses
 
-        while (op.operands().getFirst() instanceof Op.Result result) { // Only first?
-            op = result.op(); // we are changing our  par here I assume intended
-            if (op.operands().isEmpty()) { // if the "root op" is an invoke
-                return op.result();
-            }else{
-                // or else
-            }
+        // we then check if there's an invokeop that has no operands (meaning a shared or private buffer that was created)
+        // or if there's an invokeop with a parameter as its first operation (this is a global buffer)
+        Op invokeOp = HATPhaseUtils.findOpInResultFromFirstOperandsOrNull(op, JavaOp.InvokeOp.class);
+        while (invokeOp != null && !invokeOp.operands().isEmpty()) {
+            if (invokeOp.operands().getFirst() instanceof Block.Parameter p) return p; // return the parameter that is the global buffer
+            invokeOp = HATPhaseUtils.findOpInResultFromFirstOperandsOrNull(invokeOp.operands().getFirst().result().op(), JavaOp.InvokeOp.class);
         }
-        return op.operands().getFirst();
+        return (invokeOp == null) ? null : invokeOp.result(); // return the shared/private buffer invokeop that creates the buffer
     }
 
     // updates accessMap
@@ -175,7 +175,7 @@ private  static void updateAccessType(Value value, AccessType currentAccess) {
         } else if (currentAccess != storedAccess && storedAccess != AccessType.RW) {
             accessMap.put(remappedValue, AccessType.RW);
         } else {
-            // or else
+            // this is the same access type as what's already stored
         }
     }
 }
@@ -280,12 +280,8 @@ static void update(ArgArray argArray, KernelCallGraph kernelCallGraph, Object...
                 case Buffer buffer -> {
                     Annotation[] annotations = parameterAnnotations[i];
                     AccessType accessType = AccessType.NA;
-                    if (annotations.length > 0) {
-                        for (Annotation annotation : annotations) {
-                            accessType = AccessType.of(annotation);
-                        }
-                    } else {
-                        throw new IllegalArgumentException("Argument " + i + " has no access annotations");
+                    for (Annotation annotation : annotations) {
+                        accessType = AccessType.of(annotation);
                     }
                     MemorySegment segment = MappableIface.getMemorySegment(buffer);
                     arg.variant((byte) '&');
 
@@ -37,7 +37,6 @@
 import java.util.Random;
 
 import optkl.ifacemapper.MappableIface.RO;
-import optkl.ifacemapper.MappableIface.RW;
 import optkl.ifacemapper.MappableIface.WO;
 
 import jdk.incubator.code.Reflect;
@@ -46,12 +45,12 @@ public class Main {
     static Random rand;
 
     @Reflect
-    public static void blackScholesKernel(@RO KernelContext kc,
-                                          @WO F32Array call,
-                                          @WO F32Array put,
-                                          @RO F32Array sArray,
-                                          @RO F32Array xArray,
-                                          @RO F32Array tArray,
+    public static void blackScholesKernel(KernelContext kc,
+                                          F32Array call,
+                                          F32Array put,
+                                          F32Array sArray,
+                                          F32Array xArray,
+                                          F32Array tArray,
                                           float r,
                                           float v) {
         if (kc.gix < kc.gsx){
 
@@ -103,7 +103,7 @@ static ComplexArray create(Accelerator accelerator, int length) {
     }
 
     @Reflect
-    private static void dftKernel(@RW KernelContext kc, @RO ComplexArray input, @WO ComplexArray output) {
+    private static void dftKernel(KernelContext kc, ComplexArray input, ComplexArray output) {
         int size = input.length();
         int idx = kc.gix;
         if (idx < kc.gsx) {
@@ -130,7 +130,7 @@ private static void dftCompute(@RW ComputeContext cc, @RO ComplexArray input, @W
     }
 
     @Reflect
-    private static void dftPlainKernel(@RW KernelContext kc, @RO F32Array inReal, @RO F32Array inImag, @WO F32Array outReal, @WO F32Array outImag) {
+    private static void dftPlainKernel(KernelContext kc, F32Array inReal, F32Array inImag, F32Array outReal, F32Array outImag) {
         int size = inReal.length();
         int idx = kc.gix;
         if (idx < kc.gsx) {
 
@@ -105,10 +105,10 @@ public class Main {
      * @param softMaxScale
      */
     @Reflect
-    public static void selfAttentionV2HAT(@RO KernelContext kernelContext,
-                                          @RO F32Array Q, @RO F32Array K, @RO F32Array V,
-                                          @WO F32Array attentionMatrix, @WO F32Array O,
-                                          @RO final int N, @RO final int d, @RO final float softMaxScale) {
+    public static void selfAttentionV2HAT(KernelContext kernelContext,
+                                          F32Array Q, F32Array K, F32Array V,
+                                          F32Array attentionMatrix, F32Array O,
+                                          final int N, final int d, final float softMaxScale) {
         int idx = kernelContext.gix;
         if (idx < N) {
             // Compute the attention scores: Q * K^T and scale it to sqrt(d) => softMaxScale
@@ -382,9 +382,9 @@ public static int ceilFunction(int N, int blockN) {
      * @param softmaxScale
      */
     @Reflect
-    public static void flashAttention(@RO KernelContext kernelContext,
-                                      @RO F32Array Q, @RO F32Array K, @RO F32Array V,
-                                      @WO F32Array O, @RW F32Array m, @RW F32Array l,
+    public static void flashAttention(KernelContext kernelContext,
+                                      F32Array Q, F32Array K, F32Array V,
+                                      F32Array O, F32Array m, F32Array l,
                                       final int N, final int d, final float softmaxScale) {
         int bx = kernelContext.bix;
         int tid = kernelContext.lix;
@@ -525,9 +525,9 @@ static PrivateF16Array createPrivate() {
     }
 
     @Reflect
-    public static void flashAttentionF16(@RO KernelContext kernelContext,
-                                      @RO F16Array Q, @RO F16Array K, @RO F16Array V,
-                                      @WO F16Array O, @RW F16Array m, @RW F16Array l,
+    public static void flashAttentionF16(KernelContext kernelContext,
+                                      F16Array Q, F16Array K, F16Array V,
+                                      F16Array O, F16Array m, F16Array l,
                                       final int N, final int d, final float softmaxScale) {
         int bx = kernelContext.bix;
         int tid = kernelContext.lix;
 
@@ -194,11 +194,11 @@ static int blue(int rgb) {
 
     @Reflect
     public static void bestFitCore(int id,
-                                   @RO S32Array2D s32Array2D,
-                                   @RO Box searchArea,
-                                   @RO Box selBox,
-                                   @RO XYRGBList xyrgbList,
-                                   @RW F32Array sumArray) {
+                                   S32Array2D s32Array2D,
+                                   Box searchArea,
+                                   Box selBox,
+                                   XYRGBList xyrgbList,
+                                   F32Array sumArray) {
         int x = searchArea.x1() + id % searchArea.width();
         int y = searchArea.y1() + id / searchArea.width();
         float sum = 0;
@@ -233,12 +233,12 @@ public static void bestFitCore(int id,
     }
 
     @Reflect
-    public static void bestFitKernel(@RO KernelContext kc,
-                                     @RO S32Array2D s32Array2D,
-                                     @RO Box searchArea,
-                                     @RO Box selectionBox,
-                                     @RO XYRGBList xyrgbList,
-                                     @RO F32Array sumArray) {
+    public static void bestFitKernel(KernelContext kc,
+                                     S32Array2D s32Array2D,
+                                     Box searchArea,
+                                     Box selectionBox,
+                                     XYRGBList xyrgbList,
+                                     F32Array sumArray) {
         bestFitCore(kc.gix, s32Array2D, searchArea, selectionBox, xyrgbList, sumArray);
     }
 
 
@@ -144,7 +144,7 @@ int val(__global cellGrid_t *CLWrapCellGrid, int from, int w, int x, int y) {
 
 
         @Reflect
-        public static int val(@RO CellGrid grid, int from, int w, int x, int y) {
+        public static int val(CellGrid grid, int from, int w, int x, int y) {
             return grid.cell(((long) y * w) + x + from) & 1;
         }
 
@@ -175,7 +175,7 @@ __kernel void life( __global  cellGrid_t *CLWrapCellGrid ,__global control_t *CL
                 """;
 
         @Reflect
-        public static void lifePerIdx(int idx, @RW Control control, @RW CellGrid cellGrid) {
+        public static void lifePerIdx(int idx, Control control, CellGrid cellGrid) {
             int w = cellGrid.width();
             int h = cellGrid.height();
             int from = control.from();
@@ -199,7 +199,7 @@ public static void lifePerIdx(int idx, @RW Control control, @RW CellGrid cellGri
         }
 
         @Reflect
-        public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) {
+        public static void life(KernelContext kc, Control control, CellGrid cellGrid) {
             if (kc.gix < kc.gsx) {
                 ComputeLife.lifePerIdx(kc.gix, control, cellGrid);
             }
 
@@ -43,7 +43,7 @@
 
 public class Main {
     @Reflect
-    public static void mandel(@RO KernelContext kc, @RW S32Array2D s32Array2D, @RO S32Array pallette, float offsetx, float offsety, float scale) {
+    public static void mandel(KernelContext kc, S32Array2D s32Array2D, S32Array pallette, float offsetx, float offsety, float scale) {
         if (kc.gix < kc.gsx) {
             float width = s32Array2D.width();
             float height = s32Array2D.height();
 
@@ -84,7 +84,7 @@ public class Main {
      * @param size
      */
     @Reflect
-    public static void matrixMultiplyKernel2D(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel2D(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
         if (kc.gix < kc.gsx) {
             if (kc.giy < kc.gsy) {
                 float acc = 0.0f;
@@ -106,7 +106,7 @@ public static void matrixMultiplyKernel2D(@RO KernelContext kc, @RO F32Array mat
      * @param size
      */
     @Reflect
-    public static void matrixMultiplyKernel2DLI(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel2DLI(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
         if (kc.gix < kc.gsx) {
             if (kc.giy < kc.gsy) {
                 float acc = 0.0f;
@@ -141,7 +141,7 @@ static MyLocalArrayFixedSize createLocal() {
     }
 
     @Reflect
-    public static void matrixMultiplyKernel2DTiling(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel2DTiling(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
 
         final int tileSize = 16;
         MyLocalArrayFixedSize tileA = MyLocalArrayFixedSize.createLocal();
@@ -254,7 +254,7 @@ static FlatPrivate createPrivate() {
      * @param size
      */
     @Reflect
-    public static void matrixMultiplyKernel2DRegisterTiling(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel2DRegisterTiling(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
 
         // Configuration for the kernel: Keep in mind that if you change the following parameters,
         // also change the scheduling (global and local work sizes).
@@ -376,7 +376,7 @@ public static void matrixMultiplyKernel2DRegisterTiling(@RO KernelContext kc, @R
      * @param size
      */
     @Reflect
-    public static void matrixMultiplyKernel2DRegisterTilingVectorized(@RO KernelContext kc, @RO F32ArrayPadded matrixA, @RO F32ArrayPadded matrixB, @WO F32ArrayPadded matrixC, int size) {
+    public static void matrixMultiplyKernel2DRegisterTilingVectorized(KernelContext kc, F32ArrayPadded matrixA, F32ArrayPadded matrixB, F32ArrayPadded matrixC, int size) {
 
         // Configuration for the kernel: Keep in mind that if you change the following parameters,
         // also change the scheduling (global and local work sizes).
@@ -524,7 +524,7 @@ static FlatPrivateHalf createPrivate() {
     }
 
     @Reflect
-    public static void matrixMultiplyKernel2DRegisterTilingHalf(@RO KernelContext kc, @RO F16Array matrixA, @RO F16Array matrixB, @WO F16Array matrixC, int size) {
+    public static void matrixMultiplyKernel2DRegisterTilingHalf(KernelContext kc, F16Array matrixA, F16Array matrixB, F16Array matrixC, int size) {
 
         // Configuration for the kernel: Keep in mind that if you change the following parameters,
         // also change the scheduling (global and local work sizes).
@@ -648,7 +648,7 @@ public static float compute(@RO KernelContext kc, @RO F32Array matrixA, @RO F32A
      * @param size
      */
     @Reflect
-    public static void matrixMultiplyKernel1D(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel1D(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
         if (kc.gix < kc.gsx) {
             for (int j = 0; j < size; j++) {
                 float acc = 0.0f;
@@ -664,7 +664,7 @@ public static void matrixMultiplyKernel1D(@RO KernelContext kc, @RO F32Array mat
      * 1D Matrix Multiply with function calls passing the kernel context ID. This is just for testing purposes.
      */
     @Reflect
-    public static void matrixMultiplyKernel1DWithFunctionCalls(@RO KernelContext kc, @RO F32Array matrixA, @RO F32Array matrixB, @WO F32Array matrixC, int size) {
+    public static void matrixMultiplyKernel1DWithFunctionCalls(KernelContext kc, F32Array matrixA, F32Array matrixB, F32Array matrixC, int size) {
         if (kc.gix < kc.gsx) {
             for (int j = 0; j < size; j++) {
                 float acc = compute(kc, matrixA, matrixB, size, j);
Original file line number	Diff line number	Diff line change
`@@ -403,7 +403,7 @@ long Backend::CompilationUnit::Kernel::ndrange(void *argArray) {`
`403`	`403`	`}`
`404`	`404`
`405`	`405`	`auto *buffer = static_cast<Buffer *>(bufferState->vendorPtr);`
`406`		`- if (kernelWroteToThisArg \|\| compilationUnit->backend->config->alwaysCopy) {`
	`406`	`+ if (kernelWroteToThisArg && compilationUnit->backend->config->alwaysCopy) {`
`407`	`407`	`compilationUnit->backend->queue->copyFromDevice(buffer);`
`408`	`408`	`bufferState->state = BufferState::HOST_OWNED;`
`409`	`409`	`if (compilationUnit->backend->config->traceCopies \|\| compilationUnit->backend->config->traceEnqueues) {`