rbrchen
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎cr-examples/onnx/README.md‎
Lines changed: 1 addition & 1 deletion b/‎cr-examples/onnx/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cr-examples/onnx/src/test/java/oracle/code/onnx/llm/LlamaModel.java‎
Lines changed: 30 additions & 23 deletions b/‎cr-examples/onnx/src/test/java/oracle/code/onnx/llm/LlamaModel.java‎
Lines changed: 30 additions & 23 deletions
diff --git a/‎cr-examples/onnx/src/test/resources/oracle/code/onnx/llm/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎cr-examples/onnx/src/test/resources/oracle/code/onnx/llm/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎hat/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎hat/.gitignore‎
Lines changed: 1 addition & 0 deletions
@@ -31,12 +31,12 @@ jtreg -jdk:./build/macosx-x86_64-server-release/jdk/ -ea -esa -avm -va test/lang
 Specific runtime tests can be executed using `jtreg`, for example:
 
 ```
-jtreg -jdk:./build/macosx-x86_64-server-release/jdk/ -ea -esa -avm -va test/jdk/java/lang/reflect/code/
+jtreg -jdk:./build/macosx-x86_64-server-release/jdk/ -ea -esa -avm -va test/jdk/jdk/incubator/code/
 ```
 
 In addition, the runtime tests can be executed using make with the test group
-`jdk_lang_reflect_code` as follows:
+`jdk_incubator_code` as follows:
 
 ```
-make test TEST=jdk_lang_reflect_code
+make test TEST=jdk_incubator_code
 ```
@@ -47,7 +47,7 @@ sh setup.sh path/to/cloned/onnxruntime
 
 Setup:
 - Download the [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai/releases) native library corresponding to your system/architecture, unzip it, and put it into the `cr-examples/onnx/lib` folder.
-- Download `model.onnx.data`, `tokenizer.json` and `tokenizer_config.json` data files from [Llama-3.2-1B-Instruct-ONNX](https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-ONNX/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) and put them into `cr-examples/onnx/src/test/resources/oracle/code/onnx/llm` folder.
+- Download `model_q4.onnx_data`, `tokenizer.json` and `tokenizer_config.json` data files from [Llama-3.2-1B-Instruct-ONNX](https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-ONNX/tree/main) and put them into `cr-examples/onnx/src/test/resources/oracle/code/onnx/llm` folder.
 
 Running the Llama demo:
 ```
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -43,32 +43,36 @@ public final class LlamaModel {
                              VOCAB_SIZE = 128256,
                              HEAD_SIZE = 64,
                              HIDEN_SIZE = 2048,
+                             KV_HIDEN_SIZE = 512,
                              CONTEXT_SIZE = 131072,
                              INTERMEDIATE_SIZE = 8192,
                              ATTN_WEIGHTS_SIZE = 3072;
     public static final float EPSILON = 1.0E-5f,
                               SCALE = 0.125f;
 
     public final Tensor<Long> flat1, scalar1;
-    public final Tensor<Float> tokensWeights, initWeight, cosCache, sinCache, headScales;
+    public final Tensor<Float> tokensWeights, initWeight, cosCache, sinCache;
     public final Tensor<Float>[] postAttentionWeights = new Tensor[LAYERS],
                                  inputWeights = new Tensor[LAYERS],
-                                 attnQkvScales = new Tensor[LAYERS],
+                                 attnQScales = new Tensor[LAYERS],
+                                 attnKScales = new Tensor[LAYERS],
+                                 attnVScales = new Tensor[LAYERS],
                                  attnOScales = new Tensor[LAYERS],
                                  mlpGateScales = new Tensor[LAYERS],
                                  mlpUpScales = new Tensor[LAYERS],
                                  mlpDownScales = new Tensor[LAYERS];
-    public final Tensor<Byte>[] attnQkvWeight = new Tensor[LAYERS],
+    public final Tensor<Byte>[] attnQWeight = new Tensor[LAYERS],
+                                attnKWeight = new Tensor[LAYERS],
+                                attnVWeight = new Tensor[LAYERS],
                                 attnOWeight = new Tensor[LAYERS],
                                 mlpGateWeight = new Tensor[LAYERS],
                                 mlpUpWeight = new Tensor[LAYERS],
                                 mlpDownWeight = new Tensor[LAYERS];
-    public final Tensor<Byte> headWeight;
 
     public LlamaModel(Arena arena) throws IOException {
         flat1 = Tensor.ofFlat(arena, 1l);
         scalar1 = Tensor.ofScalar(arena, 1l);
-        var modelData = new TensorDataStream(arena, LlamaModel.class.getResource("model.onnx.data").getPath());
+        var modelData = new TensorDataStream(arena, LlamaModel.class.getResource("model_q4.onnx_data").getPath());
         tokensWeights = modelData.nextTensor(FLOAT, VOCAB_SIZE, HIDEN_SIZE);
         initWeight = modelData.nextTensor(FLOAT, HIDEN_SIZE);
         cosCache = modelData.nextTensor(FLOAT, CONTEXT_SIZE, HEAD_SIZE / 2);
@@ -78,19 +82,21 @@ public LlamaModel(Arena arena) throws IOException {
             inputWeights[i] = modelData.nextTensor(FLOAT, HIDEN_SIZE);
         }
         for (int i = 0; i < LAYERS; i++) {
-            attnQkvWeight[i] = modelData.nextTensor(UINT8, ATTN_WEIGHTS_SIZE, HEAD_SIZE, 16);
-            attnQkvScales[i] = modelData.nextTensor(FLOAT, ATTN_WEIGHTS_SIZE * HEAD_SIZE);
+            attnQWeight[i] = modelData.nextTensor(UINT8, HIDEN_SIZE, HEAD_SIZE, 16);
+            attnQScales[i] = modelData.nextTensor(FLOAT, HIDEN_SIZE, HEAD_SIZE);
+            attnKWeight[i] = modelData.nextTensor(UINT8, KV_HIDEN_SIZE, HEAD_SIZE, 16);
+            attnKScales[i] = modelData.nextTensor(FLOAT, KV_HIDEN_SIZE, HEAD_SIZE);
+            attnVWeight[i] = modelData.nextTensor(UINT8, KV_HIDEN_SIZE, HEAD_SIZE, 16);
+            attnVScales[i] = modelData.nextTensor(FLOAT, KV_HIDEN_SIZE, HEAD_SIZE);
             attnOWeight[i] = modelData.nextTensor(UINT8, HIDEN_SIZE, HEAD_SIZE, 16);
-            attnOScales[i] = modelData.nextTensor(FLOAT, HIDEN_SIZE * HEAD_SIZE);
+            attnOScales[i] = modelData.nextTensor(FLOAT, HIDEN_SIZE, HEAD_SIZE);
             mlpGateWeight[i] = modelData.nextTensor(UINT8, INTERMEDIATE_SIZE, HEAD_SIZE, 16);
-            mlpGateScales[i] = modelData.nextTensor(FLOAT, INTERMEDIATE_SIZE * HEAD_SIZE);
+            mlpGateScales[i] = modelData.nextTensor(FLOAT, INTERMEDIATE_SIZE, HEAD_SIZE);
             mlpUpWeight[i] = modelData.nextTensor(UINT8, INTERMEDIATE_SIZE, HEAD_SIZE, 16);
-            mlpUpScales[i] = modelData.nextTensor(FLOAT, INTERMEDIATE_SIZE * HEAD_SIZE);
+            mlpUpScales[i] = modelData.nextTensor(FLOAT, INTERMEDIATE_SIZE, HEAD_SIZE);
             mlpDownWeight[i] = modelData.nextTensor(UINT8, HIDEN_SIZE, 256, 16);
-            mlpDownScales[i] = modelData.nextTensor(FLOAT, INTERMEDIATE_SIZE * HEAD_SIZE);
+            mlpDownScales[i] = modelData.nextTensor(FLOAT, HIDEN_SIZE, 256);
         }
-        headWeight = modelData.nextTensor(UINT8, VOCAB_SIZE, HEAD_SIZE, 16);
-        headScales = modelData.nextTensor(FLOAT, VOCAB_SIZE * HEAD_SIZE);
     }
 
     public record ForwardResponse(Tensor<Float> logits,
@@ -110,12 +116,15 @@ public ForwardResponse forward(Tensor<Long> inputIds, Tensor<Long> attentionMask
         Tensor<Float>[] presentValues = new Tensor[LAYERS];
 
         for (int i = 0; i < LAYERS; i++) {
-            GroupQueryAttention<Float> attn = GroupQueryAttention(
-                    MatMulNBits(input,
-                                attnQkvWeight[i],
-                                attnQkvScales[i], empty(), empty(), empty(), HIDEN_SIZE, ATTN_WEIGHTS_SIZE, of(ACCURACY_LEVEL), BITS, BLOCK_SIZE),
-                    empty(),
-                    empty(),
+            GroupQueryAttention<Float> attn = GroupQueryAttention(MatMulNBits(input,
+                                attnQWeight[i],
+                                attnQScales[i], empty(), empty(), empty(), HIDEN_SIZE, HIDEN_SIZE, of(ACCURACY_LEVEL), BITS, BLOCK_SIZE),
+                    of(MatMulNBits(input,
+                                attnKWeight[i],
+                                attnKScales[i], empty(), empty(), empty(), HIDEN_SIZE, KV_HIDEN_SIZE, of(ACCURACY_LEVEL), BITS, BLOCK_SIZE)),
+                    of(MatMulNBits(input,
+                                attnVWeight[i],
+                                attnVScales[i], empty(), empty(), empty(), HIDEN_SIZE, KV_HIDEN_SIZE, of(ACCURACY_LEVEL), BITS, BLOCK_SIZE)),
                     of(pastKey[i]),
                     of(pastValue[i]),
                     amSL,
@@ -150,9 +159,7 @@ mlpDownScales[i], empty(), empty(), empty(), INTERMEDIATE_SIZE, HIDEN_SIZE, of(A
             presentValues[i] = attn.present_value();
         }
 
-        Tensor<Float> logits = MatMulNBits(input,
-                                           headWeight,
-                                           headScales, empty(), empty(), empty(), HIDEN_SIZE, VOCAB_SIZE, of(ACCURACY_LEVEL), BITS, BLOCK_SIZE);
+        Tensor<Float> logits = MatMul(input, Transpose(tokensWeights, of(new long[] {1L, 0L})));
 
         return new ForwardResponse(logits, presentKeys, presentValues);
     }
 
@@ -1,4 +1,5 @@
 /model.onnx.data
+/model_q4.onnx_data
 /tokenizer_config.json
 /tokenizer.json
 
@@ -27,3 +27,4 @@ stage/
 compile_flags.txt
 remoteTesting.conf
 test_report.txt
+var/*