From d4eb571b18e04c7fe19d8b1dd8662b99d140372c Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Mon, 21 Oct 2024 11:22:48 +0900 Subject: [PATCH 1/7] Add the ops of AoT --- experimental/kernels/gpt2_webgpu_aot.cpp | 799 +++++++++++++++++++++++ experimental/kernels/ops_aot.cpp | 356 ++++++++++ experimental/kernels/ops_aot.hpp | 108 +++ 3 files changed, 1263 insertions(+) create mode 100644 experimental/kernels/gpt2_webgpu_aot.cpp create mode 100644 experimental/kernels/ops_aot.cpp create mode 100644 experimental/kernels/ops_aot.hpp diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp new file mode 100644 index 0000000..0c136f7 --- /dev/null +++ b/experimental/kernels/gpt2_webgpu_aot.cpp @@ -0,0 +1,799 @@ +#include "gpu.hpp" +#include "ops.hpp" +/* +This file trains the GPT-2 model. +This version is the clean, minimal, reference. As such: +- it runs on CPU. +- it does not make the code too complex; it is readable. +- it does not use any processor-specific instructions, intrinsics and such. +- it _does_ use a few OpenMP pragmas because this is a large speedup at very low cost +There will be other versions of this code that specialize it and make it fast. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OMP +#include +#endif +// our own utilities +// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck +#include "llmc/utils.h" +// defines: tokenizer_init, tokenizer_decode, tokenizer_free +#include "llmc/tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "llmc/dataloader.h" + +using namespace gpu; + +// ---------------------------------------------------------------------------- +// GPT-2 model definition + +typedef struct { + int max_seq_len; // max sequence length, e.g. 1024 + int vocab_size; // vocab size, e.g. 50257 + int padded_vocab_size; // padded to e.g. %128==0, 50304 + int num_layers; // number of layers, e.g. 12 + int num_heads; // number of heads in attention, e.g. 12 + int channels; // number of channels, e.g. 
768
+} GPT2Config;
+
+// the parameters of the model
+#define NUM_PARAMETER_TENSORS 16
+#define NUM_PARAMETER_LAYERS 12
+typedef struct {
+    Tensor wte; // (V, C)
+    Tensor wpe; // (maxT, C)
+    std::vector<Tensor> ln1w; // (L, C)
+    std::vector<Tensor> ln1b; // (L, C)
+    std::vector<Tensor> qkvw; // (L, 3*C, C)
+    std::vector<Tensor> qkvb; // (L, 3*C)
+    std::vector<Tensor> attprojw; // (L, C, C)
+    std::vector<Tensor> attprojb; // (L, C)
+    std::vector<Tensor> ln2w; // (L, C)
+    std::vector<Tensor> ln2b; // (L, C)
+    std::vector<Tensor> fcw; // (L, 4*C, C)
+    std::vector<Tensor> fcb; // (L, 4*C)
+    std::vector<Tensor> fcprojw; // (L, C, 4*C)
+    std::vector<Tensor> fcprojb; // (L, C)
+    Tensor lnfw; // (C)
+    Tensor lnfb; // (C)
+} ParameterTensors;
+
+void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
+    size_t Vp = config.padded_vocab_size;
+    size_t C = config.channels;
+    size_t maxT = config.max_seq_len;
+    size_t L = config.num_layers;
+    param_sizes[0] = Vp * C; // wte
+    param_sizes[1] = maxT * C; // wpe
+    param_sizes[2] = L * C; // ln1w
+    param_sizes[3] = L * C; // ln1b
+    param_sizes[4] = L * (3 * C) * C; // qkvw
+    param_sizes[5] = L * (3 * C); // qkvb
+    param_sizes[6] = L * C * C; // attprojw
+    param_sizes[7] = L * C; // attprojb
+    param_sizes[8] = L * C; // ln2w
+    param_sizes[9] = L * C; // ln2b
+    param_sizes[10] = L * (4 * C) * C; // fcw
+    param_sizes[11] = L * (4 * C); // fcb
+    param_sizes[12] = L * C * (4 * C); // fcprojw
+    param_sizes[13] = L * C; // fcprojb
+    param_sizes[14] = C; // lnfw
+    param_sizes[15] = C; // lnfb
+}
+
+// allocate memory for the parameters and point the individual tensors to the right places
+float* malloc_and_point_parameters(ParameterTensors* params, size_t* param_sizes) {
+    size_t num_parameters = 0;
+    for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+        num_parameters += param_sizes[i];
+    }
+    // malloc all parameters all at once
+    float* params_memory = (float*)mallocCheck(num_parameters * sizeof(float));
+    // assign all the tensors
+    float** ptrs[] = {
+        &params->wte, &params->wpe, &params->ln1w, &params->ln1b, &params->qkvw, &params->qkvb,
+        &params->attprojw, &params->attprojb, &params->ln2w, &params->ln2b, &params->fcw, &params->fcb,
+        &params->fcprojw, &params->fcprojb, &params->lnfw, &params->lnfb
+    };
+    float* params_memory_iterator = params_memory;
+    for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+        *(ptrs[i]) = params_memory_iterator;
+        params_memory_iterator += param_sizes[i];
+    }
+    return params_memory;
+}
+
+
+#define NUM_ACTIVATION_TENSORS 23
+typedef struct {
+    Tensor encoded; // (B, T, C)
+    std::vector<Tensor> ln1; // (L, B, T, C)
+    std::vector<Tensor> ln1_mean; // (L, B, T)
+    std::vector<Tensor> ln1_rstd; // (L, B, T)
+    std::vector<Tensor> qkv; // (L, B, T, 3*C)
+    std::vector<Tensor> atty; // (L, B, T, C)
+    std::vector<Tensor> preatt; // (L, B, NH, T, T)
+    std::vector<Tensor> att; // (L, B, NH, T, T)
+    std::vector<Tensor> attproj; // (L, B, T, C)
+    std::vector<Tensor> residual2; // (L, B, T, C)
+    std::vector<Tensor> ln2; // (L, B, T, C)
+    std::vector<Tensor> ln2_mean; // (L, B, T)
+    std::vector<Tensor> ln2_rstd; // (L, B, T)
+    std::vector<Tensor> fch; // (L, B, T, 4*C)
+    std::vector<Tensor> fch_gelu; // (L, B, T, 4*C)
+    std::vector<Tensor> fcproj; // (L, B, T, C)
+    std::vector<Tensor> residual3; // (L, B, T, C)
+    Tensor lnf; // (B, T, C)
+    Tensor lnf_mean; // (B, T)
+    Tensor lnf_rstd; // (B, T)
+    Tensor logits; // (B, T, V)
+    Tensor probs; // (B, T, V)
+    Tensor losses; // (B, T)
+} ActivationTensors;
+
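+// Pre-built GPU kernels for every op in the network. With ahead-of-time (AoT)
+// kernel creation, each kernel is constructed once (bindings, workgroup counts
+// and uniform parameters fixed at build time) and only dispatched during the
+// forward/backward passes; the per-layer ops are stored as one Kernel per
+// transformer layer.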
+typedef struct {
+    Kernel encoder_forward;
+    std::vector<Kernel> layernorm_forward;
+    std::vector<Kernel> qkv_projection_forward;
+    std::vector<Kernel> attention_forward;
+    std::vector<Kernel> attention_projection_forward;
+    std::vector<Kernel> residual_forward;
+    std::vector<Kernel> ff_up_forward;
+    std::vector<Kernel> gelu_forward;
+    std::vector<Kernel> ff_down_forward;
+    std::vector<Kernel> residual2_forward;
+    Kernel layernorm_final_forward;
+    Kernel matmul_final_forward;
+    Kernel softmax_final_forward;
+    std::vector<Kernel> crossentropy_forward;
+
+    Kernel crossentropy_softmax_backward;
+    Kernel matmul_final_backward;
+    Kernel layernorm_final_backward;
+    std::vector<Kernel> residual2_backward;
+    std::vector<Kernel> ff_down_backward;
+    std::vector<Kernel> gelu_backward;
+    std::vector<Kernel> ff_up_backward;
+    std::vector<Kernel> layernorm2_backward;
+    std::vector<Kernel> attention_projection_backward;
+    std::vector<Kernel> attention_backward;
+    std::vector<Kernel> qkv_projection_backward;
+    std::vector<Kernel> layernorm1_backward;
+    Kernel encoder_backward;
+} Kernels;
+
+void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T) {
+    size_t C = config.channels;
+    size_t NH = config.num_heads;
+    size_t L = config.num_layers;
+    size_t Vp = config.padded_vocab_size;
+    act_sizes[0] = B * T * C; // encoded
+    act_sizes[1] = L * B * T * C; // ln1
+    act_sizes[2] = L * B * T; // ln1_mean
+    act_sizes[3] = L * B * T; // ln1_rstd
+    act_sizes[4] = L * B * T * 3 * C; // qkv
+    act_sizes[5] = L * B * T * C; // atty
+    act_sizes[6] = L * B * NH * T * T; // preatt
+    act_sizes[7] = L * B * NH * T * T; // att
+    act_sizes[8] = L * B * T * C; // attproj
+    act_sizes[9] = L * B * T * C; // residual2
+    act_sizes[10] = L * B * T * C; // ln2
+    act_sizes[11] = L * B * T; // ln2_mean
+    act_sizes[12] = L * B * T; // ln2_rstd
+    act_sizes[13] = L * B * T * 4 * C; // fch
+    act_sizes[14] = L * B * T * 4 * C; // fch_gelu
+    act_sizes[15] = L * B * T * C; // fcproj
+    act_sizes[16] = L * B * T * C; // residual3
+    act_sizes[17] = B * T * C; // lnf
+    act_sizes[18] = B * T; // lnf_mean
+    act_sizes[19] = B * T; // lnf_rstd
+    act_sizes[20] = B * T * Vp; // logits
+    act_sizes[21] = B * T * Vp; // probs
+    act_sizes[22] = B * T; // losses
+}
+
+float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes) {
+    size_t num_activations = 0;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+        num_activations += act_sizes[i];
+    }
+    float* acts_memory = (float*)mallocCheck(num_activations * sizeof(float));
+    float** ptrs[] = {
+        &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->qkv, &acts->atty,
+        &acts->preatt, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean,
+        &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf,
+        &acts->lnf_mean, &acts->lnf_rstd, &acts->logits, &acts->probs, &acts->losses
+    };
+    float* acts_memory_iterator = acts_memory;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+        *(ptrs[i]) = acts_memory_iterator;
+        acts_memory_iterator += act_sizes[i];
+    }
+    return acts_memory;
+}
+
+struct GPUParameters {
+    Tensor data[NUM_PARAMETER_TENSORS];
+};
+
+struct GPUActivations {
+    Tensor data[NUM_ACTIVATION_TENSORS];
+};
+
+
+void gpu_alloc(Context& ctx, Tensor* tensors, size_t* sizes, size_t n) {
+    for (size_t i = 0; i < n; i++) {
+        tensors[i] = createTensor(ctx, Shape{sizes[i]}, kf32);
+    }
+}
+
+typedef struct {
+    GPT2Config config;
+    // the weights (parameters) of the model, and their sizes
+    ParameterTensors params;
+    GPUParameters params_; // TODO(avh): eventually this replaces params
+    size_t param_sizes[NUM_PARAMETER_TENSORS];
+    float* params_memory;
+    size_t num_parameters;
+    // gradients of the weights
+    ParameterTensors grads;
+    float* grads_memory;
+    // buffers for the AdamW optimizer
+    float* m_memory;
+    float* v_memory;
+    // the activations of the model, and their sizes
+    ActivationTensors acts;
+    GPUActivations acts_; // TODO(avh): 
eventually this replaces params + size_t act_sizes[NUM_ACTIVATION_TENSORS]; + float* acts_memory; + size_t num_activations; + // gradients of the activations + ActivationTensors grads_acts; + float* grads_acts_memory; + // other run state configuration + int batch_size; // the batch size (B) of current forward pass + int seq_len; // the sequence length (T) of current forward pass + int* inputs; // the input tokens for the current forward pass + int* targets; // the target tokens for the current forward pass + float mean_loss; // after a forward pass with targets, will be populated with the mean loss +} GPT2; + +void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoint_path) { + + // read in model from a checkpoint file + FILE *model_file = fopenCheck(checkpoint_path, "rb"); + int model_header[256]; + freadCheck(model_header, sizeof(int), 256, model_file); + if (model_header[0] != 20240326) { printf("Bad magic model file\n"); exit(1); } + if (model_header[1] != 3) { + printf("Bad version in model file\n"); + printf("---> HINT: try to re-run `python train_gpt2.py`\n"); + exit(1); + } + + // read in hyperparameters + size_t maxT, V, Vp, L, NH, C; // size_t to prevent int overflow + model->config.max_seq_len = maxT = model_header[2]; + model->config.vocab_size = V = model_header[3]; +#ifdef __EMSCRIPTEN__ + model->config.num_layers = L = 12; // TODO(avh): Debugging only hack - revert this +#else + model->config.num_layers = L = model_header[4]; +#endif + model->config.num_heads = NH = model_header[5]; + model->config.channels = C = model_header[6]; + model->config.padded_vocab_size = Vp = model_header[7]; + printf("[GPT-2]\n"); + printf("max_seq_len: %zu\n", maxT); + printf("vocab_size: %zu\n", V); + printf("padded_vocab_size: %zu\n", Vp); + printf("num_layers: %zu\n", L); + printf("num_heads: %zu\n", NH); + printf("channels: %zu\n", C); + + // allocate space for all the parameters and read them in + fill_in_parameter_sizes(model->param_sizes, model->config); + + // count the number of parameters + size_t num_parameters = 0; + for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) { + num_parameters += model->param_sizes[i]; + } + printf("num_parameters: %zu\n", num_parameters); + model->num_parameters = num_parameters; + + // read in all the parameters from file + model->params_memory = malloc_and_point_parameters(&model->params, model->param_sizes); + freadCheck(model->params_memory, sizeof(float), num_parameters, model_file); + fcloseCheck(model_file); + + // other inits + model->acts_memory = NULL; + model->grads_memory = NULL; + model->m_memory = NULL; + model->v_memory = NULL; + model->grads_acts_memory = NULL; + model->inputs = NULL; + model->targets = NULL; + model->batch_size = 0; + model->seq_len = 0; + model->mean_loss = -1.0f; // -1.0f will designate no loss + + // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations + gpu_alloc(ctx, model->params_.data, model->param_sizes, NUM_PARAMETER_TENSORS); + +} + + +void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B, size_t T) { + // targets are optional and could be NULL + + // ensure the model was initialized or error out + if (model->params_memory == NULL) { + printf("Error: model was not initialized properly.\n"); + exit(1); + } + + // convenience parameters (size_t to help prevent int overflow) + size_t V = model->config.vocab_size; + size_t Vp = model->config.padded_vocab_size; + size_t L = model->config.num_layers; + size_t NH = 
model->config.num_heads; + size_t C = model->config.channels; + + // validate inputs, all indices must be in the range [0, V) + for(int i = 0; i < B * T; i++) { + assert(0 <= inputs[i] && inputs[i] < V); + if (targets != NULL) { + assert(0 <= targets[i] && targets[i] < V); + } + } + + // allocate space for all the activations if needed (done here, lazily) + if(model->acts_memory == NULL) { + // record the current B,T as well + model->batch_size = B; + model->seq_len = T; + // and now allocate the space + fill_in_activation_sizes(model->act_sizes, model->config, B, T); + // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations + gpu_alloc(ctx, model->acts_.data, model->act_sizes, NUM_PARAMETER_TENSORS); + size_t num_activations = 0; + for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { + num_activations += model->act_sizes[i]; + } + printf("num_activations: %zu\n", num_activations); + model->num_activations = num_activations; + printf("Allocating %.2f MB for activations\n", num_activations * sizeof(float) / (1024.0f * 1024.0f)); + model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes); + // also create memory for caching inputs and targets + model->inputs = (int*)mallocCheck(B * T * sizeof(int)); + model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small + } else { + // validate B,T is consistent with how we've allocated the memory before + // in principle we could get more clever here in the future, for now this is safest + if (B != model->batch_size || T != model->seq_len) { + printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T); + exit(EXIT_FAILURE); + } + } + + printf("Cache inputs/targets\n"); + // cache the inputs/targets + memcpy(model->inputs, inputs, B * T * sizeof(int)); + if (targets != NULL) { + memcpy(model->targets, targets, B * T * sizeof(int)); + } + + printf("Forward pass\n"); + // forward pass + ParameterTensors params = model->params; // for brevity + ActivationTensors acts = model->acts; + float* residual; + printf("Encoding\n"); + printf("inputs[0] = %d\n", inputs[0]); + encoder_forward(ctx, acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + for (int l = 0; l < L; l++) { + printf("Forward Pass Layer %d\n", l); + + residual = l == 0 ? 
acts.encoded : acts.residual3 + (l-1) * B * T * C; + + // get the pointers of the weights for this layer + float* l_ln1w = params.ln1w + l * C; + float* l_ln1b = params.ln1b + l * C; + float* l_qkvw = params.qkvw + l * 3*C * C; + float* l_qkvb = params.qkvb + l * 3*C; + float* l_attprojw = params.attprojw + l * C * C; + float* l_attprojb = params.attprojb + l * C; + float* l_ln2w = params.ln2w + l * C; + float* l_ln2b = params.ln2b + l * C; + float* l_fcw = params.fcw + l * 4*C * C; + float* l_fcb = params.fcb + l * 4*C; + float* l_fcprojw = params.fcprojw + l * C * 4*C; + float* l_fcprojb = params.fcprojb + l * C; + + // get the pointers of the activations for this layer + float* l_ln1 = acts.ln1 + l * B * T * C; + float* l_ln1_mean = acts.ln1_mean + l * B * T; + float* l_ln1_rstd = acts.ln1_rstd + l * B * T; + float* l_qkv = acts.qkv + l * B * T * 3*C; + float* l_atty = acts.atty + l * B * T * C; + float* l_preatt = acts.preatt + l * B * NH * T * T; + float* l_att = acts.att + l * B * NH * T * T; + float* l_attproj = acts.attproj + l * B * T * C; + float* l_residual2 = acts.residual2 + l * B * T * C; + float* l_ln2 = acts.ln2 + l * B * T * C; + float* l_ln2_mean = acts.ln2_mean + l * B * T; + float* l_ln2_rstd = acts.ln2_rstd + l * B * T; + float* l_fch = acts.fch + l * B * T * 4*C; + float* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + float* l_fcproj = acts.fcproj + l * B * T * C; + float* l_residual3 = acts.residual3 + l * B * T * C; + + // now do the forward pass + printf(" [Forward] : LayerNorm1\n"); + layernorm_forward(ctx, l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); + printf(" [Forward] : QKV Projection\n"); + matmul_forward(ctx, l_qkv, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); + printf(" [Forward] : Attention\n"); + attention_forward(ctx, l_atty, l_preatt, l_att, l_qkv, B, T, C, NH); + printf(" [Forward] : Attention Projection\n"); + matmul_forward(ctx, l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); + printf(" [Forward] : Residual1\n"); + residual_forward(ctx, l_residual2, residual, l_attproj, B*T*C); + printf(" [Forward] : LayerNorm2\n"); + layernorm_forward(ctx, l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); + printf(" [Forward] : FF Up\n"); + matmul_forward(ctx, l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); + printf(" [Forward] : GELU\n"); + gelu_forward(ctx, l_fch_gelu, l_fch, B*T*4*C); + printf(" [Forward] : FF Down\n"); + matmul_forward(ctx, l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); + printf(" [Forward] : Residual2\n"); + residual_forward(ctx, l_residual3, l_residual2, l_fcproj, B*T*C); + } + residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 + layernorm_forward(ctx, acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); + matmul_forward(ctx, acts.logits, acts.lnf, params.wte, NULL, B, T, C, Vp); + softmax_forward(ctx, acts.probs, acts.logits, B, T, V, Vp); + + printf("Crossentropy\n"); + // also forward the cross-entropy loss function if we have the targets + if (targets != NULL) { + crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp); + // for convenience also evaluate the mean loss + float mean_loss = 0.0f; + for (int i=0; iacts.losses[i]; } + mean_loss /= B*T; + model->mean_loss = mean_loss; + } else { + // if we don't have targets, we don't have a loss + model->mean_loss = -1.0f; + } + printf("Forward pass done\n"); +} + +void gpt2_zero_grad(GPT2 *model) { + if(model->grads_memory != NULL) { 
memset(model->grads_memory, 0, model->num_parameters * sizeof(float)); } + if(model->grads_acts_memory != NULL) { memset(model->grads_acts_memory, 0, model->num_activations * sizeof(float)); } +} + +void gpt2_backward(Context& ctx, GPT2 *model) { + printf("Backward pass\n"); + + // double check we forwarded previously, with targets + if (model->mean_loss == -1.0f) { + printf("Error: must forward with targets before backward\n"); + exit(1); + } + + // lazily allocate the memory for gradients of the weights and activations, if needed + if (model->grads_memory == NULL) { + printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f)); + model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_sizes); + model->grads_acts_memory = malloc_and_point_activations(&model->grads_acts, model->act_sizes); + gpt2_zero_grad(model); + } + + // convenience shortcuts (and size_t to help prevent int overflow) + size_t B = model->batch_size; + size_t T = model->seq_len; + size_t V = model->config.vocab_size; + size_t Vp = model->config.padded_vocab_size; + size_t L = model->config.num_layers; + size_t NH = model->config.num_heads; + size_t C = model->config.channels; + + // backward pass: go in the reverse order of the forward pass, and call backward() functions + ParameterTensors params = model->params; // for brevity + ParameterTensors grads = model->grads; + ActivationTensors acts = model->acts; + ActivationTensors grads_acts = model->grads_acts; + + // we kick off the chain rule by filling in dlosses with 1.0f/(B*T) + // technically this is a small, inline backward() pass of calculating + // total, final loss as the mean over all losses over all (B,T) positions in the batch + float dloss_mean = 1.0f / (B*T); + for (int i = 0; i < B*T; i++) { grads_acts.losses[i] = dloss_mean; } + + crossentropy_softmax_backward(ctx, grads_acts.logits, grads_acts.losses, acts.probs, model->targets, B, T, V, Vp); + matmul_backward(ctx, grads_acts.lnf, grads.wte, NULL, grads_acts.logits, acts.lnf, params.wte, B, T, C, Vp); + float* residual = acts.residual3 + (L-1) * B * T * C; // last layer's residual + float* dresidual = grads_acts.residual3 + (L-1) * B * T * C; // write to last layer's residual + layernorm_backward(ctx, dresidual, grads.lnfw, grads.lnfb, grads_acts.lnf, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C); + + for (int l = L-1; l >= 0; l--) { + printf("Backward Pass Layer %d\n", l); + + residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; + dresidual = l == 0 ? 
grads_acts.encoded : grads_acts.residual3 + (l-1) * B * T * C; + + // get the pointers of the weights for this layer + float* l_ln1w = params.ln1w + l * C; + float* l_qkvw = params.qkvw + l * 3*C * C; + float* l_attprojw = params.attprojw + l * C * C; + float* l_ln2w = params.ln2w + l * C; + float* l_fcw = params.fcw + l * 4*C * C; + float* l_fcprojw = params.fcprojw + l * C * 4*C; + // get the pointers of the gradients of the weights for this layer + float* dl_ln1w = grads.ln1w + l * C; + float* dl_ln1b = grads.ln1b + l * C; + float* dl_qkvw = grads.qkvw + l * 3*C * C; + float* dl_qkvb = grads.qkvb + l * 3*C; + float* dl_attprojw = grads.attprojw + l * C * C; + float* dl_attprojb = grads.attprojb + l * C; + float* dl_ln2w = grads.ln2w + l * C; + float* dl_ln2b = grads.ln2b + l * C; + float* dl_fcw = grads.fcw + l * 4*C * C; + float* dl_fcb = grads.fcb + l * 4*C; + float* dl_fcprojw = grads.fcprojw + l * C * 4*C; + float* dl_fcprojb = grads.fcprojb + l * C; + // get the pointers of the activations for this layer + float* l_ln1 = acts.ln1 + l * B * T * C; + float* l_ln1_mean = acts.ln1_mean + l * B * T; + float* l_ln1_rstd = acts.ln1_rstd + l * B * T; + float* l_qkv = acts.qkv + l * B * T * 3*C; + float* l_atty = acts.atty + l * B * T * C; + float* l_att = acts.att + l * B * NH * T * T; + float* l_residual2 = acts.residual2 + l * B * T * C; + float* l_ln2 = acts.ln2 + l * B * T * C; + float* l_ln2_mean = acts.ln2_mean + l * B * T; + float* l_ln2_rstd = acts.ln2_rstd + l * B * T; + float* l_fch = acts.fch + l * B * T * 4*C; + float* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + // get the pointers of the gradients of the activations for this layer + float* dl_ln1 = grads_acts.ln1 + l * B * T * C; + float* dl_qkv = grads_acts.qkv + l * B * T * 3*C; + float* dl_atty = grads_acts.atty + l * B * T * C; + float* dl_preatt = grads_acts.preatt + l * B * NH * T * T; + float* dl_att = grads_acts.att + l * B * NH * T * T; + float* dl_attproj = grads_acts.attproj + l * B * T * C; + float* dl_residual2 = grads_acts.residual2 + l * B * T * C; + float* dl_ln2 = grads_acts.ln2 + l * B * T * C; + float* dl_fch = grads_acts.fch + l * B * T * 4*C; + float* dl_fch_gelu = grads_acts.fch_gelu + l * B * T * 4*C; + float* dl_fcproj = grads_acts.fcproj + l * B * T * C; + float* dl_residual3 = grads_acts.residual3 + l * B * T * C; + + // backprop this layer + printf(" [Backward] : Residual2\n"); + residual_backward(ctx, dl_residual2, dl_fcproj, dl_residual3, B*T*C); + printf(" [Backward] : FF Down \n"); + matmul_backward(ctx, dl_fch_gelu, dl_fcprojw, dl_fcprojb, dl_fcproj, l_fch_gelu, l_fcprojw, B, T, 4*C, C); + printf(" [Backward] : GELU\n"); + gelu_backward(ctx, dl_fch, l_fch, dl_fch_gelu, B*T*4*C); + printf(" [Backward] : FF Up\n"); + matmul_backward(ctx, dl_ln2, dl_fcw, dl_fcb, dl_fch, l_ln2, l_fcw, B, T, C, 4*C); + printf(" [Backward] : LayerNorm2\n"); + layernorm_backward(ctx, dl_residual2, dl_ln2w, dl_ln2b, dl_ln2, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C); + printf(" [Backward] : Residual1\n"); + residual_backward(ctx, dresidual, dl_attproj, dl_residual2, B*T*C); + printf(" [Backward] : Attention Projection\n"); + matmul_backward(ctx, dl_atty, dl_attprojw, dl_attprojb, dl_attproj, l_atty, l_attprojw, B, T, C, C); + printf(" [Backward] : Attention\n"); + attention_backward(ctx, dl_qkv, dl_preatt, dl_att, dl_atty, l_qkv, l_att, B, T, C, NH); + printf(" [Backward] : QKV Projection\n"); + matmul_backward(ctx, dl_ln1, dl_qkvw, dl_qkvb, dl_qkv, l_ln1, l_qkvw, B, T, C, 3*C); + printf(" [Backward] : 
LayerNorm1\n"); + layernorm_backward(ctx, dresidual, dl_ln1w, dl_ln1b, dl_ln1, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); + } + encoder_backward(ctx, grads.wte, grads.wpe, grads_acts.encoded, model->inputs, B, T, C); +} + +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { + // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html + + // lazily allocate the memory for m_memory and v_memory + if (model->m_memory == NULL) { + model->m_memory = (float*)calloc(model->num_parameters, sizeof(float)); + model->v_memory = (float*)calloc(model->num_parameters, sizeof(float)); + } + + for (size_t i = 0; i < model->num_parameters; i++) { + float param = model->params_memory[i]; + float grad = model->grads_memory[i]; + + // update the first moment (momentum) + float m = beta1 * model->m_memory[i] + (1.0f - beta1) * grad; + // update the second moment (RMSprop) + float v = beta2 * model->v_memory[i] + (1.0f - beta2) * grad * grad; + // bias-correct both moments + float m_hat = m / (1.0f - powf(beta1, t)); + float v_hat = v / (1.0f - powf(beta2, t)); + + // update + model->m_memory[i] = m; + model->v_memory[i] = v; + model->params_memory[i] -= learning_rate * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param); + } +} + +void gpt2_free(GPT2 *model) { + free(model->params_memory); + free(model->grads_memory); + free(model->m_memory); + free(model->v_memory); + free(model->acts_memory); + free(model->grads_acts_memory); + free(model->inputs); + free(model->targets); +} + +#ifndef TESTING +// if we are TESTING (see test_gpt2.c), we'll skip the int main below +// ---------------------------------------------------------------------------- +// sampler + +unsigned int random_u32(uint64_t *state) { + // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A + *state ^= *state >> 12; + *state ^= *state << 25; + *state ^= *state >> 27; + return (*state * 0x2545F4914F6CDD1Dull) >> 32; +} +float random_f32(uint64_t *state) { // random float32 in [0,1) + return (random_u32(state) >> 8) / 16777216.0f; +} + +int sample_mult(float* probabilities, int n, float coin) { + // sample index from probabilities (they must sum to 1!) + // coin is a random number in [0, 1), usually from random_f32() + float cdf = 0.0f; + for (int i = 0; i < n; i++) { + cdf += probabilities[i]; + if (coin < cdf) { + return i; + } + } + return n - 1; // in case of rounding errors +} + +// ---------------------------------------------------------------------------- +// main training loop +int main() { + + setLogLevel(kWarn); + + printf("Creating GPU context\n"); + WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB; + gpu::Context ctx = gpu::createContext({}, {}, { + .requiredLimits = &requiredLimits + }); + // gpu::Context ctx = gpu::createContext(); + + // build the GPT-2 model from a checkpoint + GPT2 model; + gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin"); + + // build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories + const char* tiny_stories_train = "dev/data/tinystories/TinyStories_train.bin"; + const char* tiny_stories_val = "dev/data/tinystories/TinyStories_val.bin"; + const char* tiny_shakespeare_train = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* tiny_shakespeare_val = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; + const char* train_tokens = access(tiny_shakespeare_train, F_OK) != -1 ? 
tiny_shakespeare_train : tiny_stories_train; + const char* val_tokens = access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val; + constexpr int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) + constexpr int T = 64; // sequence length 64 (i.e. each sequence is 64 tokens long). must be <= maxT, which is 1024 for GPT-2 + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_tokens, B, T, 0, 1, 1); + dataloader_init(&val_loader, val_tokens, B, T, 0, 1, 0); + printf("train dataset num_batches: %zu\n", train_loader.num_tokens / (B*T)); + printf("val dataset num_batches: %zu\n", val_loader.num_tokens / (B*T)); + int val_num_batches = 5; + + // build the Tokenizer + Tokenizer tokenizer; + tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); + + // some memory for generating samples from the model + uint64_t rng_state = 1337; + int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); + const int genT = 64; // number of steps of inference we will do + + + // train + struct timespec start, end; + printf("Starting training\n"); + for (int step = 0; step <= 40; step++) { + printf("Step %d\n", step); + + // once in a while estimate the validation loss + if (step % 10 == 0) { + float val_loss = 0.0f; + dataloader_reset(&val_loader); + for (int i = 0; i < val_num_batches; i++) { + dataloader_next_batch(&val_loader); + gpt2_forward(ctx, &model, val_loader.inputs, val_loader.targets, B, T); + val_loss += model.mean_loss; + } + val_loss /= val_num_batches; + printf("val loss %f\n", val_loss); + } + + // once in a while do model inference to print generated text + if (step > 0 && step % 20 == 0) { + // fill up gen_tokens with the GPT2_EOT, which kicks off the generation + for(int i = 0; i < B * T; ++i) { + gen_tokens[i] = tokenizer.eot_token; + } + // now sample from the model autoregressively + printf("generating:\n---\n"); + for (int t = 1; t < genT; t++) { + // note that inference is very wasteful here because for each token + // we re-calculate the forward pass for all of (B,T) positions from scratch + // but the inference here is just for sanity checking anyway + // and we can maybe optimize a bit more later, with careful tests + gpt2_forward(ctx, &model, gen_tokens, NULL, B, T); + // furthermore, below we're only using b=0 (i.e. 
the first row) of all B rows
+                // we're in principle running B "inference streams" in parallel here
+                // but only using position 0
+                // get the Vp-dimensional vector probs[0, t-1, :]
+                float* probs = model.acts.probs + (t-1) * model.config.padded_vocab_size;
+                float coin = random_f32(&rng_state);
+                // note we're only sampling from the first V elements, ignoring padding
+                // (the probabilities in the padded region should be zero anyway)
+                int next_token = sample_mult(probs, model.config.vocab_size, coin);
+                gen_tokens[t] = next_token;
+                // print the generated token, either using the Tokenizer or a fallback
+                if (tokenizer.init_ok) {
+                    const char* token_str = tokenizer_decode(&tokenizer, next_token);
+                    safe_printf(token_str);
+                } else {
+                    // fall back to printing the token id
+                    printf("%d ", next_token);
+                }
+                fflush(stdout);
+            }
+            printf("\n---\n");
+        }
+
+        // do a training step
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        dataloader_next_batch(&train_loader);
+        gpt2_forward(ctx, &model, train_loader.inputs, train_loader.targets, B, T);
+        gpt2_zero_grad(&model);
+        gpt2_backward(ctx, &model);
+        gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+        printf("step %d: train loss %f (took %f ms)\n", step, model.mean_loss, time_elapsed_s * 1000);
+    }
+
+    // free
+    dataloader_free(&train_loader);
+    dataloader_free(&val_loader);
+    tokenizer_free(&tokenizer);
+    gpt2_free(&model);
+    free(gen_tokens);
+    return 0;
+}
+#endif
diff --git a/experimental/kernels/ops_aot.cpp b/experimental/kernels/ops_aot.cpp
new file mode 100644
index 0000000..f4ce9c0
--- /dev/null
+++ b/experimental/kernels/ops_aot.cpp
@@ -0,0 +1,356 @@
+#include "gpu.hpp"
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <string>
+
+#include "kernels.h"
+#include "ops_aot.hpp"
+#include "experimental/wgsl.h"      // loopUnrolling
+
+using namespace gpu;
+
+Kernel encoder_forward(Context& ctx, Tensor& out,
+                       Tensor& inp, Tensor& wte, Tensor& wpe,
+                       int B, int T, int C){
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long v = VOCAB_SIZE;
+  struct EncoderParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+  };
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderEncoder, 256, kf32},
+                      Bindings{inp, wte, wpe, out},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      EncoderParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c)
+                      });
+}
+
+Kernel encoder_backward(Context& ctx, Tensor& dwte, Tensor& dwpe,
+                        Tensor& dout, Tensor& inp,
+                        int B, int T, int C) {
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long v = VOCAB_SIZE;
+  struct EncoderParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+  };
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderEncoderBackward, 256, kf32},
+                      Bindings{dwte, dwpe, dout, inp},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      EncoderParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c)
+                      });
+}
+
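+// The ops below all follow the same ahead-of-time pattern: each factory builds a
+// Kernel with its bindings, workgroup count (typically cdiv over the number of
+// elements or B*T positions, with a workgroup size of 256) and a small uniform
+// params struct, and returns it for later dispatch by the model.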
+Kernel layernorm_forward(Context& ctx, Tensor& out, Tensor& mean, Tensor& rstd,
+                         Tensor& inp, Tensor& weight, Tensor& bias,
+                         int B, int T, int C){
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  struct LayerNormParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+  };
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderLayerNorm, 256, kf32},
+                      Bindings{inp, weight, bias, out, mean, rstd},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      LayerNormParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c)
+                      });
+}
+
+Kernel layernorm_backward(Context& ctx, Tensor& dinp, Tensor& dweight, Tensor& dbias,
+                          Tensor& dout, Tensor& inp, Tensor& weight, Tensor& mean, Tensor& rstd,
+                          int B, int T, int C){
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  struct LayerNormParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+  };
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderLayerNormBackward, 256, kf32},
+                      Bindings{dinp, dweight, dbias, dout, inp, weight, mean, rstd},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      LayerNormParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c)
+                      });
+}
+
+struct DurationTime {
+  std::chrono::high_resolution_clock::time_point start;
+  std::chrono::high_resolution_clock::time_point end;
+  std::chrono::microseconds duration;
+  std::string src;
+  bool verbose;
+
+  inline DurationTime(const std::string& src, bool verbose = true) {
+    this->src = src;
+    this->verbose = verbose;
+    start = std::chrono::high_resolution_clock::now();
+  }
+
+  inline ~DurationTime() {
+    end = std::chrono::high_resolution_clock::now();
+    duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    if (this->verbose) {
+      printf("Duration(%s): %.1f microseconds\n", src.c_str(), static_cast<double>(duration.count()));
+    }
+  }
+};
+
+
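+// Tiling configuration for the matmul below. Judging from the template
+// parameters: each workgroup covers a BT x BOC (64 x 64) tile of the output,
+// the inner dimension is walked in BC (8)-wide steps, and each of the
+// num_threads = 64 invocations accumulates a TT x TOC (8 x 8) sub-tile, so
+// NUM_TILEI / NUM_TILEW work out to 8 loads per thread per step. The values
+// are substituted into the WGSL template and the loops unrolled
+// (loopUnrolling) before the kernel is created.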
+Kernel matmul_forward(Context& ctx, Tensor& out,
+                      const Tensor& inp, const Tensor& weight, const Tensor& bias,
+                      int B, int T, int C, int OC){
+  bool verbose = false;
+  DurationTime duration("matmul_forward_gpu", verbose);
+  struct MatmulParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+    uint32_t OC;
+  };
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long oc = static_cast<unsigned long>(OC);
+  setLogLevel(kError);
+
+  constexpr size_t BT = 64;
+  constexpr size_t BC = 8;
+  constexpr size_t BOC = 64;
+  constexpr size_t TT = BT / BC;
+  constexpr size_t TOC = BOC / BC;
+  size_t num_threads = BT * BOC / (TT * TOC);
+  Shape wgSize = {num_threads, 1, 1};
+  Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
+
+  std::string kShaderMatmul2DTiling_(kShaderMatmul2DTiling);
+  std::string kShaderMatmul2D(loopUnrolling(
+                                replaceAll(kShaderMatmul2DTiling_,
+                                           {{"{{precision}}", toString(kf32)},
+                                            {"{{BT}}", toString(BT)},
+                                            {"{{BC}}", toString(BC)},
+                                            {"{{BOC}}", toString(BOC)},
+                                            {"{{TT}}", toString(TT)},
+                                            {"{{TOC}}", toString(TOC)},
+                                            {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
+                                            {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
+                                           })
+                                )
+                              );
+
+  return createKernel(ctx, {kShaderMatmul2D, wgSize, kf32},
+                      Bindings{inp, weight, bias, out},
+                      nWorkgroups,
+                      /* params */
+                      MatmulParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c),
+                        static_cast<uint32_t>(oc)
+                      });
+}
+
+Kernel matmul_backward(Context& ctx, Tensor& dinp, Tensor& dweight, Tensor& dbias,
+                       const Tensor& dout, const Tensor& inp, const Tensor& weight,
+                       int B, int T, int C, int OC){
+  struct MatmulParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+    uint32_t OC;
+  };
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long oc = static_cast<unsigned long>(OC);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderMatmulBackward, 256, kf32},
+                      Bindings{dinp, dweight, dbias, dout, inp, weight},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      MatmulParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c),
+                        static_cast<uint32_t>(oc)
+                      });
+}
+
+Kernel attention_forward(Context& ctx, Tensor& out, Tensor& preatt, Tensor& att,
+                         Tensor& inp,
+                         int B, int T, int C, int NH){
+  struct AttentionParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+    uint32_t NH;
+  };
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long nh = static_cast<unsigned long>(NH);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderAttention, 256, kf32},
+                      Bindings{inp, preatt, att, out},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      AttentionParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c),
+                        static_cast<uint32_t>(nh)
+                      });
+}
+
+Kernel attention_backward(Context& ctx, Tensor& dinp, Tensor& dpreatt, Tensor& datt,
+                          Tensor& dout, Tensor& inp, Tensor& att,
+                          int B, int T, int C, int NH){
+  struct AttentionParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+    uint32_t NH;
+  };
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long c = static_cast<unsigned long>(C);
+  unsigned long nh = static_cast<unsigned long>(NH);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderAttentionBackward, 256, kf32},
+                      Bindings{dinp, dpreatt, datt, dout, inp, att},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      AttentionParams{
+                        static_cast<uint32_t>(b),
+                        static_cast<uint32_t>(t),
+                        static_cast<uint32_t>(c),
+                        static_cast<uint32_t>(nh)
+                      });
+}
+
+Kernel gelu_forward(Context& ctx, Tensor& out, Tensor& inp, int n) {
+  unsigned long N = static_cast<unsigned long>(n);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderGelu, 256, kf32},
+                      Bindings{inp, out},
+                      /* nWorkgroups */ {cdiv(N, 256), 1, 1});
+}
+
+Kernel gelu_backward(Context& ctx, Tensor& dinp, Tensor& inp, Tensor& dout, int N){
+  unsigned long n = static_cast<unsigned long>(N);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderGeluBackward, 256, kf32},
+                      Bindings{inp, dout, dinp},
+                      /* nWorkgroups */ {cdiv(n, 256), 1, 1});
+}
+
+Kernel residual_forward(Context& ctx, Tensor& out, Tensor& inp1, Tensor& inp2, int N){
+  unsigned long n = static_cast<unsigned long>(N);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderResidual, 256, kf32},
+                      Bindings{inp1, inp2, out},
+                      /* nWorkgroups */ {cdiv(n, 256), 1, 1});
+}
+
+Kernel residual_backward(Context& ctx, Tensor& dinp1, Tensor& dinp2, Tensor& dout, int N){
+  unsigned long n = static_cast<unsigned long>(N);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderResidualBackward, 256, kf32},
+                      Bindings{dout, dinp1, dinp2},
+                      /* nWorkgroups */ {cdiv(n, 256), 1, 1});
+}
+
+Kernel softmax_forward(Context& ctx, Tensor& probs, Tensor& logits, int B, int T, int V, int Vp) {
+  struct SoftmaxParam {
+    uint32_t N;
+    uint32_t C;
+    uint32_t Cp;
+  };
+  uint32_t b = static_cast<uint32_t>(B);
+  uint32_t t = static_cast<uint32_t>(T);
+  uint32_t c = static_cast<uint32_t>(V);
+  uint32_t cp = static_cast<uint32_t>(Vp);
+  assert( (B*T) % 256 == 0);
+  return createKernel(
+      ctx, {kShaderSoftmax1, 256, kf32}, Bindings{logits, probs},
+      Shape{cdiv(B * T, 256), 1, 1}, SoftmaxParam{b * t, c, cp});
+}
+
+Kernel crossentropy_forward(Context& ctx, Tensor& losses,
+                            Tensor& probs, Tensor& targets,
+                            int B, int T, int Vp){
+  struct CrossEntropyParams {
+    uint32_t B;
+    uint32_t T;
+    uint32_t VP;
+  };
+  unsigned long b = static_cast<unsigned long>(B);
+  unsigned long t = static_cast<unsigned long>(T);
+  unsigned long vp = static_cast<unsigned long>(Vp);
+  setLogLevel(kError);
+  return createKernel(ctx, {kShaderCrossEntropyForward, 256, kf32},
+                      Bindings{losses, probs, targets},
+                      /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                      /* params */
+                      CrossEntropyParams{
+                        
static_cast(b), + static_cast(t), + static_cast(vp) + }); +} + +Kernel crossentropy_softmax_backward(Context& ctx, Tensor& dlogits, + Tensor& dlosses, Tensor& probs, Tensor& targets, + int B, int T, int V, int Vp){ + struct CrossEntropySoftmaxBackwardParams { + uint32_t B; + uint32_t T; + uint32_t V; + uint32_t VP; + }; + unsigned long b = static_cast(B); + unsigned long t = static_cast(T); + unsigned long v = static_cast(V); + unsigned long vp = static_cast(Vp); + setLogLevel(kError); + return createKernel(ctx, {kShaderCrossEntropySoftmaxBackward, 256, kf32}, + Bindings{dlogits, dlosses, probs, targets}, + /* nWorkgroups */ {cdiv(b * t, 256), 1, 1}, + /* params */ + CrossEntropySoftmaxBackwardParams{ + static_cast(b), + static_cast(t), + static_cast(v), + static_cast(vp) + }); +} diff --git a/experimental/kernels/ops_aot.hpp b/experimental/kernels/ops_aot.hpp new file mode 100644 index 0000000..8ec6d8e --- /dev/null +++ b/experimental/kernels/ops_aot.hpp @@ -0,0 +1,108 @@ +#ifndef OPS_H +#define OPS_H + +#include "gpu.hpp" + +using namespace gpu; + +#ifdef __cplusplus +extern "C" { +#endif + +#define VOCAB_SIZE 50257 + +// See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp +#define LIMITS_BUFFER_SIZE_1GB { \ + .nextInChain = nullptr, \ + .limits = { \ + .maxTextureDimension1D=8192, \ + .maxTextureDimension2D=8192, \ + .maxTextureDimension3D=2048, \ + .maxTextureArrayLayers=256, \ + .maxBindGroups=4, \ + .maxBindGroupsPlusVertexBuffers=24, \ + .maxBindingsPerBindGroup=1000, \ + .maxDynamicUniformBuffersPerPipelineLayout=8, \ + .maxDynamicStorageBuffersPerPipelineLayout=4, \ + .maxSampledTexturesPerShaderStage=16, \ + .maxSamplersPerShaderStage=16, \ + .maxStorageBuffersPerShaderStage=8, \ + .maxStorageTexturesPerShaderStage=4, \ + .maxUniformBuffersPerShaderStage=12, \ + .maxUniformBufferBindingSize=65536, \ + .maxStorageBufferBindingSize=1073741824, \ + .minUniformBufferOffsetAlignment=256, \ + .minStorageBufferOffsetAlignment=256, \ + .maxVertexBuffers=8, \ + .maxBufferSize=0x80000000, \ + .maxVertexAttributes=16, \ + .maxVertexBufferArrayStride=2048, \ + .maxInterStageShaderComponents=64, \ + .maxInterStageShaderVariables=16, \ + .maxColorAttachments=8, \ + .maxColorAttachmentBytesPerSample=32, \ + .maxComputeWorkgroupStorageSize=16384, \ + .maxComputeInvocationsPerWorkgroup=256, \ + .maxComputeWorkgroupSizeX=256, \ + .maxComputeWorkgroupSizeY=256, \ + .maxComputeWorkgroupSizeZ=64, \ + .maxComputeWorkgroupsPerDimension=65535 \ + } \ + } + + +Kernel encoder_forward(Context& ctx, Tensor& out, + Tensor& inp, Tensor& wte, Tensor& wpe, + int B, int T, int C); + +Kernel encoder_backward(Context& ctx, Tensor& dwte, Tensor& dwpe, + Tensor& dout, Tensor& inp, + int B, int T, int C); + +Kernel layernorm_forward(Context& ctx, Tensor& out, Tensor& mean, Tensor& rstd, + Tensor& inp, Tensor& weight, Tensor& bias, + int B, int T, int C); + +Kernel layernorm_backward(Context& ctx, Tensor& dinp, Tensor& dweight, Tensor& dbias, + Tensor& dout, Tensor& inp, Tensor& weight, Tensor& mean, Tensor& rstd, + int B, int T, int C); + +Kernel matmul_forward(Context& ctx, Tensor& out, + const Tensor& inp, const Tensor& weight, const Tensor& bias, + int B, int T, int C, int OC); + +Kernel matmul_backward(Context& ctx, Tensor& dinp, Tensor& dweight, Tensor& dbias, + const Tensor& dout, const Tensor& inp, const Tensor& weight, + int B, int T, int C, int OC); + +Kernel attention_forward(Context& ctx, Tensor& out, Tensor& preatt, Tensor& att, + Tensor& inp, + int 
B, int T, int C, int NH); + +Kernel attention_backward(Context& ctx, Tensor& dinp, Tensor& dpreatt, Tensor& datt, + Tensor& dout, Tensor& inp, Tensor& att, + int B, int T, int C, int NH); + +Kernel gelu_forward(Context& ctx, Tensor& out, Tensor& inp, int N); + +Kernel gelu_backward(Context& ctx, Tensor& dinp, Tensor& inp, Tensor& dout, int N); + +Kernel residual_forward(Context& ctx, Tensor& out, Tensor& inp1, Tensor& inp2, int N); + +Kernel residual_backward(Context& ctx, Tensor& dinp1, Tensor& dinp2, Tensor& dout, int N); + +Kernel softmax_forward(Context& ctx, Tensor& probs, Tensor& logits, int B, int T, int V, int Vp); + +Kernel crossentropy_forward(Context& ctx, Tensor& losses, + Tensor& probs, Tensor& targets, + int B, int T, int Vp); + +Kernel crossentropy_softmax_backward(Context& ctx, Tensor& dlogits, + Tensor& dlosses, Tensor& probs, Tensor& targets, + int B, int T, int V, int Vp); + +#ifdef __cplusplus +} +#endif + +#endif // OPS_H From 43e4ac0e594cdb00a71c5ced3170441b94b73ba2 Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Tue, 22 Oct 2024 01:31:35 +0900 Subject: [PATCH 2/7] Update --- experimental/kernels/Makefile | 8 +- experimental/kernels/gpt2_webgpu_aot.cpp | 586 +++++++++++++++-------- experimental/kernels/ops_aot.hpp | 8 - 3 files changed, 393 insertions(+), 209 deletions(-) diff --git a/experimental/kernels/Makefile b/experimental/kernels/Makefile index 5817e23..7430a71 100644 --- a/experimental/kernels/Makefile +++ b/experimental/kernels/Makefile @@ -99,6 +99,10 @@ build/gpt2_webgpu: llm.c gpt2_124M.bin llm.c gpt2_webgpu.cpp ops.cpp mkdir -p build $(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu.cpp ops.cpp +build/gpt2_webgpu_aot: llm.c gpt2_124M.bin llm.c gpt2_webgpu_aot.cpp ops_aot.cpp + mkdir -p build + $(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu_aot.cpp ops_aot.cpp + build/gpt2_webgpu.html: check-emsdk gpt2_webgpu.cpp term.html llm.c em++ gpt2_webgpu.cpp ops.cpp \ --preload-file gpt2_tokenizer.bin@/gpt2_tokenizer.bin \ @@ -116,8 +120,8 @@ watch-web: watch-native: ls *.cpp *.c *.hpp *.h | entr -s "rm -f build/gpt2_webgpu && rm -f build/ops.o && make build/gpt2_webgpu" -run-native: build/gpt2_webgpu - . $(GPUCPP)/source && ./build/gpt2_webgpu +run-native: build/gpt2_webgpu_aot + . $(GPUCPP)/source && ./build/gpt2_webgpu_aot # server: build/train_gpt2.html build/test_gpt2.html build/gpt2_gpucpp.html server: build/gpt2_webgpu.html diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp index 0c136f7..e0a1d54 100644 --- a/experimental/kernels/gpt2_webgpu_aot.cpp +++ b/experimental/kernels/gpt2_webgpu_aot.cpp @@ -1,5 +1,5 @@ #include "gpu.hpp" -#include "ops.hpp" +#include "ops_aot.hpp" /* This file trains the GPT-2 model. This version is the clean, minimal, reference. 
As such: @@ -91,25 +91,25 @@ void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) { } // allocate memory for the parameters and point the individual tensors to the right places -float* malloc_and_point_parameters(ParameterTensors* params, size_t* param_sizes) { - size_t num_parameters = 0; - for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) { - num_parameters += param_sizes[i]; - } - // malloc all parameters all at once - float* params_memory = (float*)mallocCheck(num_parameters * sizeof(float)); - // assign all the tensors - float** ptrs[] = { - ¶ms->wte, ¶ms->wpe, ¶ms->ln1w, ¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, - ¶ms->attprojw, ¶ms->attprojb, ¶ms->ln2w, ¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, - ¶ms->fcprojw, ¶ms->fcprojb, ¶ms->lnfw, ¶ms->lnfb - }; - float* params_memory_iterator = params_memory; - for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) { - *(ptrs[i]) = params_memory_iterator; - params_memory_iterator += param_sizes[i]; +void malloc_and_point_parameters(Context& ctx, ParameterTensors* params, size_t* param_sizes) { + params->wte = createTensor(ctx, Shape{param_sizes[0]}, kf32); + params->wpe = createTensor(ctx, Shape{param_sizes[1]}, kf32); + for(int l = 0; l < NUM_PARAMETER_LAYERS; l++) { + params->ln1w.push_back(createTensor(ctx, Shape{param_sizes[2]/NUM_PARAMETER_LAYERS}, kf32)); + params->ln1b.push_back(createTensor(ctx, Shape{param_sizes[3]/NUM_PARAMETER_LAYERS}, kf32)); + params->qkvw.push_back(createTensor(ctx, Shape{param_sizes[4]/NUM_PARAMETER_LAYERS}, kf32)); + params->qkvb.push_back(createTensor(ctx, Shape{param_sizes[5]/NUM_PARAMETER_LAYERS}, kf32)); + params->attprojw.push_back(createTensor(ctx, Shape{param_sizes[6]/NUM_PARAMETER_LAYERS}, kf32)); + params->attprojb.push_back(createTensor(ctx, Shape{param_sizes[7]/NUM_PARAMETER_LAYERS}, kf32)); + params->ln2w.push_back(createTensor(ctx, Shape{param_sizes[8]/NUM_PARAMETER_LAYERS}, kf32)); + params->ln2b.push_back(createTensor(ctx, Shape{param_sizes[9]/NUM_PARAMETER_LAYERS}, kf32)); + params->fcw.push_back(createTensor(ctx, Shape{param_sizes[10]/NUM_PARAMETER_LAYERS}, kf32)); + params->fcb.push_back(createTensor(ctx, Shape{param_sizes[11]/NUM_PARAMETER_LAYERS}, kf32)); + params->fcprojw.push_back(createTensor(ctx, Shape{param_sizes[12]/NUM_PARAMETER_LAYERS}, kf32)); + params->fcprojb.push_back(createTensor(ctx, Shape{param_sizes[13]/NUM_PARAMETER_LAYERS}, kf32)); } - return params_memory; + params->lnfw = createTensor(ctx, Shape{param_sizes[14]}, kf32); + params->lnfb = createTensor(ctx, Shape{param_sizes[15]}, kf32); } @@ -154,7 +154,7 @@ typedef struct { Kernel layernorm_final_forward; Kernel matmul_final_forward; Kernel softmax_final_forward; - std::vector crossentropy_forward; + Kernel crossentropy_forward; Kernel crossentropy_softmax_backward; Kernel matmul_final_backward; @@ -201,24 +201,32 @@ void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T act_sizes[22] = B * T; // losses } -float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes) { - size_t num_activations = 0; - for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { - num_activations += act_sizes[i]; +void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t* act_sizes) { + acts->encoded = createTensor(ctx, Shape{act_sizes[0]}, kf32); + for (int l = 0; l < NUM_PARAMETER_LAYERS; l++) { + acts->ln1.push_back(createTensor(ctx, Shape{act_sizes[1]/NUM_PARAMETER_LAYERS}, kf32)); + acts->ln1_mean.push_back(createTensor(ctx, Shape{act_sizes[2]/NUM_PARAMETER_LAYERS}, kf32)); + 
acts->ln1_rstd.push_back(createTensor(ctx, Shape{act_sizes[3]/NUM_PARAMETER_LAYERS}, kf32)); + acts->qkv.push_back(createTensor(ctx, Shape{act_sizes[4]/NUM_PARAMETER_LAYERS}, kf32)); + acts->atty.push_back(createTensor(ctx, Shape{act_sizes[5]/NUM_PARAMETER_LAYERS}, kf32)); + acts->preatt.push_back(createTensor(ctx, Shape{act_sizes[6]/NUM_PARAMETER_LAYERS}, kf32)); + acts->att.push_back(createTensor(ctx, Shape{act_sizes[7]/NUM_PARAMETER_LAYERS}, kf32)); + acts->attproj.push_back(createTensor(ctx, Shape{act_sizes[8]/NUM_PARAMETER_LAYERS}, kf32)); + acts->residual2.push_back(createTensor(ctx, Shape{act_sizes[9]/NUM_PARAMETER_LAYERS}, kf32)); + acts->ln2.push_back(createTensor(ctx, Shape{act_sizes[10]/NUM_PARAMETER_LAYERS}, kf32)); + acts->ln2_mean.push_back(createTensor(ctx, Shape{act_sizes[11]/NUM_PARAMETER_LAYERS}, kf32)); + acts->ln2_rstd.push_back(createTensor(ctx, Shape{act_sizes[12]/NUM_PARAMETER_LAYERS}, kf32)); + acts->fch.push_back(createTensor(ctx, Shape{act_sizes[13]/NUM_PARAMETER_LAYERS}, kf32)); + acts->fch_gelu.push_back(createTensor(ctx, Shape{act_sizes[14]/NUM_PARAMETER_LAYERS}, kf32)); + acts->fcproj.push_back(createTensor(ctx, Shape{act_sizes[15]/NUM_PARAMETER_LAYERS}, kf32)); + acts->residual3.push_back(createTensor(ctx, Shape{act_sizes[16]/NUM_PARAMETER_LAYERS}, kf32)); } - float* acts_memory = (float*)mallocCheck(num_activations * sizeof(float)); - float** ptrs[] = { - &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->qkv, &acts->atty, - &acts->preatt, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, - &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->logits, &acts->probs, &acts->losses - }; - float* acts_memory_iterator = acts_memory; - for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { - *(ptrs[i]) = acts_memory_iterator; - acts_memory_iterator += act_sizes[i]; - } - return acts_memory; + acts->lnf = createTensor(ctx, Shape{act_sizes[17]}, kf32); + acts->lnf_mean = createTensor(ctx, Shape{act_sizes[18]}, kf32); + acts->lnf_rstd = createTensor(ctx, Shape{act_sizes[19]}, kf32); + acts->logits = createTensor(ctx, Shape{act_sizes[20]}, kf32); + acts->probs = createTensor(ctx, Shape{act_sizes[21]}, kf32); + acts->losses = createTensor(ctx, Shape{act_sizes[22]}, kf32); } struct GPUParameters { @@ -240,7 +248,6 @@ typedef struct { GPT2Config config; // the weights (parameters) of the model, and their sizes ParameterTensors params; - GPUParameters params_; // TODO(avh): eventually this replaces params size_t param_sizes[NUM_PARAMETER_TENSORS]; float* params_memory; size_t num_parameters; @@ -252,7 +259,6 @@ typedef struct { float* v_memory; // the activations of the model, and their sizes ActivationTensors acts; - GPUActivations acts_; // TODO(avh): eventually this replaces params size_t act_sizes[NUM_ACTIVATION_TENSORS]; float* acts_memory; size_t num_activations; @@ -262,13 +268,16 @@ typedef struct { // other run state configuration int batch_size; // the batch size (B) of current forward pass int seq_len; // the sequence length (T) of current forward pass - int* inputs; // the input tokens for the current forward pass - int* targets; // the target tokens for the current forward pass + Tensor inputs; // the input tokens for the current forward pass + Tensor targets; // the target tokens for the current forward pass float mean_loss; // after a forward pass with targets, will be populated with the mean loss + + // kernels + Kernels kernels; } 
GPT2; void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoint_path) { - + printf("Building GPT-2 model from checkpoint '%s'\n", checkpoint_path); // read in model from a checkpoint file FILE *model_file = fopenCheck(checkpoint_path, "rb"); int model_header[256]; @@ -302,7 +311,6 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin // allocate space for all the parameters and read them in fill_in_parameter_sizes(model->param_sizes, model->config); - // count the number of parameters size_t num_parameters = 0; for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) { @@ -312,29 +320,65 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin model->num_parameters = num_parameters; // read in all the parameters from file - model->params_memory = malloc_and_point_parameters(&model->params, model->param_sizes); + malloc_and_point_parameters(ctx, &model->params, model->param_sizes); + model->params_memory = (float*)mallocCheck(num_parameters * sizeof(float)); freadCheck(model->params_memory, sizeof(float), num_parameters, model_file); fcloseCheck(model_file); + // transfer to GPU memory + float* iter = model->params_memory; + toGPU(ctx, iter, model->params.wte); + iter += model->param_sizes[0]; + toGPU(ctx, iter, model->params.wpe); + iter += model->param_sizes[1]; + for (int l = 0; l < L; l++) { + toGPU(ctx, iter, model->params.ln1w[l]); + iter += model->param_sizes[2]/L; + toGPU(ctx, iter, model->params.ln1b[l]); + iter += model->param_sizes[3]/L; + toGPU(ctx, iter, model->params.qkvw[l]); + iter += model->param_sizes[4]/L; + toGPU(ctx, iter, model->params.qkvb[l]); + iter += model->param_sizes[5]/L; + toGPU(ctx, iter, model->params.attprojw[l]); + iter += model->param_sizes[6]/L; + toGPU(ctx, iter, model->params.attprojb[l]); + iter += model->param_sizes[7]/L; + toGPU(ctx, iter, model->params.ln2w[l]); + iter += model->param_sizes[8]/L; + toGPU(ctx, iter, model->params.ln2b[l]); + iter += model->param_sizes[9]/L; + toGPU(ctx, iter, model->params.fcw[l]); + iter += model->param_sizes[10]/L; + toGPU(ctx, iter, model->params.fcb[l]); + iter += model->param_sizes[11]/L; + toGPU(ctx, iter, model->params.fcprojw[l]); + iter += model->param_sizes[12]/L; + toGPU(ctx, iter, model->params.fcprojb[l]); + iter += model->param_sizes[13]/L; + } + toGPU(ctx, iter, model->params.lnfw); + iter += model->param_sizes[14]; + toGPU(ctx, iter, model->params.lnfb); + iter += model->param_sizes[15]; + + // other inits model->acts_memory = NULL; model->grads_memory = NULL; model->m_memory = NULL; model->v_memory = NULL; model->grads_acts_memory = NULL; - model->inputs = NULL; - model->targets = NULL; model->batch_size = 0; model->seq_len = 0; model->mean_loss = -1.0f; // -1.0f will designate no loss - // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations - gpu_alloc(ctx, model->params_.data, model->param_sizes, NUM_PARAMETER_TENSORS); + printf("Model build complete\n"); } -void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B, size_t T) { +void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, size_t B, size_t T) { // targets are optional and could be NULL // ensure the model was initialized or error out @@ -350,13 +394,13 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B size_t NH = model->config.num_heads; size_t C = model->config.channels; - // validate inputs, all indices must be in the range [0, V) - for(int i = 0; i 
< B * T; i++) { - assert(0 <= inputs[i] && inputs[i] < V); - if (targets != NULL) { - assert(0 <= targets[i] && targets[i] < V); - } - } + // // validate inputs, all indices must be in the range [0, V) + // for(int i = 0; i < B * T; i++) { + // assert(0 <= inputs[i] && inputs[i] < V); + // if (targets != NULL) { + // assert(0 <= targets[i] && targets[i] < V); + // } + // } // allocate space for all the activations if needed (done here, lazily) if(model->acts_memory == NULL) { @@ -365,8 +409,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B model->seq_len = T; // and now allocate the space fill_in_activation_sizes(model->act_sizes, model->config, B, T); + // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations - gpu_alloc(ctx, model->acts_.data, model->act_sizes, NUM_PARAMETER_TENSORS); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -374,10 +418,12 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B printf("num_activations: %zu\n", num_activations); model->num_activations = num_activations; printf("Allocating %.2f MB for activations\n", num_activations * sizeof(float) / (1024.0f * 1024.0f)); - model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes); + malloc_and_point_activations(ctx, &model->acts, model->act_sizes); // also create memory for caching inputs and targets - model->inputs = (int*)mallocCheck(B * T * sizeof(int)); - model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small + //model->inputs = (int*)mallocCheck(B * T * sizeof(int)); + //model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small + model->inputs = createTensor(ctx, Shape{B * T}, ki32); + model->targets = createTensor(ctx, Shape{B * T}, ki32); } else { // validate B,T is consistent with how we've allocated the memory before // in principle we could get more clever here in the future, for now this is safest @@ -386,99 +432,202 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B exit(EXIT_FAILURE); } } - - printf("Cache inputs/targets\n"); - // cache the inputs/targets - memcpy(model->inputs, inputs, B * T * sizeof(int)); - if (targets != NULL) { - memcpy(model->targets, targets, B * T * sizeof(int)); + // create all kernels ahead of time + if (model->kernels.encoder_forward == nullptr) { + printf("Creating Kernels\n"); + Kernels& kernels = model->kernels; + kernels.layernorm_forward.resize(L); + kernels.layernorm1_backward.resize(L); + kernels.qkv_projection_forward.resize(L); + kernels.qkv_projection_backward.resize(L); + kernels.attention_forward.resize(L); + kernels.attention_backward.resize(L); + kernels.attention_projection_forward.resize(L); + kernels.attention_projection_backward.resize(L); + kernels.residual_forward.resize(L); + kernels.residual2_forward.resize(L); + kernels.residual2_backward.resize(L); + kernels.ff_up_forward.resize(L); + kernels.ff_up_backward.resize(L); + kernels.gelu_forward.resize(L); + kernels.gelu_backward.resize(L); + kernels.ff_down_forward.resize(L); + kernels.ff_down_backward.resize(L); + for (int l = 0; l < L; ++l) { + kernels.layernorm_forward[l] = layernorm_forward(ctx, model->acts.ln1[l], model->acts.ln1_mean[l], model->acts.ln1_rstd[l], + /*input=*/ model->acts.residual3[l], /*weight=*/ model->params.ln1w[l], /*bias=*/ 
model->params.ln1b[l], + B, T, C); + kernels.qkv_projection_forward[l] = matmul_forward(ctx, model->acts.qkv[l], model->acts.ln1[l], model->params.qkvw[l], model->params.qkvb[l], B, T, C, 3*C); + kernels.attention_forward[l] = attention_forward(ctx, model->acts.atty[l], model->acts.preatt[l], model->acts.att[l], model->acts.qkv[l], B, T, C, NH); + kernels.attention_projection_forward[l] = matmul_forward(ctx, model->acts.attproj[l], model->acts.atty[l], model->params.attprojw[l], model->params.attprojb[l], B, T, C, C); + kernels.residual_forward[l] = residual_forward(ctx, model->acts.residual2[l], model->acts.residual3[l], model->acts.attproj[l], B*T*C); + kernels.ff_up_forward[l] = matmul_forward(ctx, model->acts.fch[l], model->acts.ln2[l], model->params.fcw[l], model->params.fcb[l], B, T, C, 4*C); + kernels.gelu_forward[l] = gelu_forward(ctx, model->acts.fch_gelu[l], model->acts.fch[l], B*T*4*C); + kernels.ff_down_forward[l] = matmul_forward(ctx, model->acts.fcproj[l], model->acts.fch_gelu[l], model->params.fcw[l], model->params.fcb[l], B, T, 4*C, C); + kernels.residual2_forward[l] = residual_forward(ctx, model->acts.residual3[l], model->acts.residual2[l], model->acts.fcproj[l], B*T*C); + } + kernels.crossentropy_forward = crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp); + + kernels.encoder_forward = encoder_forward(ctx, model->acts.encoded, inputs, model->params.wte, model->params.wpe, B, T, C); // encoding goes into residual[0] + kernels.encoder_backward = encoder_backward(ctx, model->params.wte, model->params.wpe, model->acts.encoded, inputs, B, T, C); + kernels.layernorm_final_forward = layernorm_forward(ctx, model->acts.lnf, model->acts.lnf_mean, model->acts.lnf_rstd, + /*input=*/ model->acts.residual3[L-1], /*weight=*/ model->params.lnfw, /*bias=*/ model->params.lnfb, + B, T, C); + Tensor nullTensor = createTensor(ctx, Shape{1}, kf32); + kernels.matmul_final_forward = matmul_forward(ctx, model->acts.logits, model->acts.lnf, model->params.wte, nullTensor, B, T, C, Vp); + kernels.softmax_final_forward = softmax_forward(ctx, model->acts.probs, model->acts.logits, B, T, V, Vp); + kernels.crossentropy_softmax_backward = crossentropy_softmax_backward(ctx, model->acts.logits, model->acts.losses, model->acts.probs, targets, B, T, V, Vp); + kernels.matmul_final_backward = matmul_backward(ctx, model->acts.lnf, model->params.wte, nullTensor, model->acts.logits, + model->acts.lnf, model->params.wte, B, T, C, Vp); + kernels.layernorm_final_backward = layernorm_backward(ctx, model->acts.residual3[L-1], model->params.lnfw, model->params.lnfb, + model->acts.lnf, model->acts.residual3[L-1], model->params.lnfw, + model->acts.lnf_mean, model->acts.lnf_rstd, B, T, C); + printf("Created Kernels\n"); } + printf("Cache inputs/targets\n"); printf("Forward pass\n"); // forward pass ParameterTensors params = model->params; // for brevity ActivationTensors acts = model->acts; float* residual; printf("Encoding\n"); - printf("inputs[0] = %d\n", inputs[0]); - encoder_forward(ctx, acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + //printf("inputs[0] = %d\n", inputs[0]); + // encoder_forward(ctx, acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.encoder_forward, promise); + wait(ctx, future); + } for (int l = 0; l < L; l++) { printf("Forward Pass Layer %d\n", l); - residual = l == 0 ? 
acts.encoded : acts.residual3 + (l-1) * B * T * C; - - // get the pointers of the weights for this layer - float* l_ln1w = params.ln1w + l * C; - float* l_ln1b = params.ln1b + l * C; - float* l_qkvw = params.qkvw + l * 3*C * C; - float* l_qkvb = params.qkvb + l * 3*C; - float* l_attprojw = params.attprojw + l * C * C; - float* l_attprojb = params.attprojb + l * C; - float* l_ln2w = params.ln2w + l * C; - float* l_ln2b = params.ln2b + l * C; - float* l_fcw = params.fcw + l * 4*C * C; - float* l_fcb = params.fcb + l * 4*C; - float* l_fcprojw = params.fcprojw + l * C * 4*C; - float* l_fcprojb = params.fcprojb + l * C; - - // get the pointers of the activations for this layer - float* l_ln1 = acts.ln1 + l * B * T * C; - float* l_ln1_mean = acts.ln1_mean + l * B * T; - float* l_ln1_rstd = acts.ln1_rstd + l * B * T; - float* l_qkv = acts.qkv + l * B * T * 3*C; - float* l_atty = acts.atty + l * B * T * C; - float* l_preatt = acts.preatt + l * B * NH * T * T; - float* l_att = acts.att + l * B * NH * T * T; - float* l_attproj = acts.attproj + l * B * T * C; - float* l_residual2 = acts.residual2 + l * B * T * C; - float* l_ln2 = acts.ln2 + l * B * T * C; - float* l_ln2_mean = acts.ln2_mean + l * B * T; - float* l_ln2_rstd = acts.ln2_rstd + l * B * T; - float* l_fch = acts.fch + l * B * T * 4*C; - float* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; - float* l_fcproj = acts.fcproj + l * B * T * C; - float* l_residual3 = acts.residual3 + l * B * T * C; - // now do the forward pass printf(" [Forward] : LayerNorm1\n"); - layernorm_forward(ctx, l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); + // layernorm_forward(ctx, l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : QKV Projection\n"); - matmul_forward(ctx, l_qkv, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); + // matmul_forward(ctx, l_qkv, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.qkv_projection_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : Attention\n"); - attention_forward(ctx, l_atty, l_preatt, l_att, l_qkv, B, T, C, NH); + // attention_forward(ctx, l_atty, l_preatt, l_att, l_qkv, B, T, C, NH); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.attention_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : Attention Projection\n"); - matmul_forward(ctx, l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); + // matmul_forward(ctx, l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.attention_projection_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : Residual1\n"); - residual_forward(ctx, l_residual2, residual, l_attproj, B*T*C); + // residual_forward(ctx, l_residual2, residual, l_attproj, B*T*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.residual_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : LayerNorm2\n"); - layernorm_forward(ctx, l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); + // layernorm_forward(ctx, l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); + { + std::promise 
promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm2_backward[l], promise); + wait(ctx, future); + } printf(" [Forward] : FF Up\n"); - matmul_forward(ctx, l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); + // matmul_forward(ctx, l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.ff_up_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : GELU\n"); - gelu_forward(ctx, l_fch_gelu, l_fch, B*T*4*C); + // gelu_forward(ctx, l_fch_gelu, l_fch, B*T*4*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.gelu_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : FF Down\n"); - matmul_forward(ctx, l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); + // matmul_forward(ctx, l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.ff_down_forward[l], promise); + wait(ctx, future); + } printf(" [Forward] : Residual2\n"); - residual_forward(ctx, l_residual3, l_residual2, l_fcproj, B*T*C); + // residual_forward(ctx, l_residual3, l_residual2, l_fcproj, B*T*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.residual2_forward[l], promise); + wait(ctx, future); + } + } + // residual = acts.residual3.data() + (L-1) * B * T * C; // last residual is in residual3 + // layernorm_forward(ctx, acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm_final_forward, promise); + wait(ctx, future); + } + // matmul_forward(ctx, acts.logits, acts.lnf, params.wte, NULL, B, T, C, Vp); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.matmul_final_forward, promise); + wait(ctx, future); + } + // softmax_forward(ctx, acts.probs, acts.logits, B, T, V, Vp); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.softmax_final_forward, promise); + wait(ctx, future); } - residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 - layernorm_forward(ctx, acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); - matmul_forward(ctx, acts.logits, acts.lnf, params.wte, NULL, B, T, C, Vp); - softmax_forward(ctx, acts.probs, acts.logits, B, T, V, Vp); printf("Crossentropy\n"); // also forward the cross-entropy loss function if we have the targets - if (targets != NULL) { - crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp); + // if (targets != NULL) { + // crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.crossentropy_forward, promise); + wait(ctx, future); + } // for convenience also evaluate the mean loss float mean_loss = 0.0f; - for (int i=0; iacts.losses[i]; } + //toCPU(ctx, model->acts_.data[22], model->acts.losses.data, model->act_sizes[22] * sizeof(float)); + for (int i=0; iacts.losses.data[i]; } mean_loss /= B*T; model->mean_loss = mean_loss; - } else { - // if we don't have targets, we don't have a loss - model->mean_loss = -1.0f; - } + 
// } else { + // // if we don't have targets, we don't have a loss + // model->mean_loss = -1.0f; + // } printf("Forward pass done\n"); } @@ -499,8 +648,8 @@ void gpt2_backward(Context& ctx, GPT2 *model) { // lazily allocate the memory for gradients of the weights and activations, if needed if (model->grads_memory == NULL) { printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f)); - model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_sizes); - model->grads_acts_memory = malloc_and_point_activations(&model->grads_acts, model->act_sizes); + malloc_and_point_parameters(&model->grads, model->param_sizes); + malloc_and_point_activations(&model->grads_acts, model->act_sizes); gpt2_zero_grad(model); } @@ -523,90 +672,124 @@ void gpt2_backward(Context& ctx, GPT2 *model) { // technically this is a small, inline backward() pass of calculating // total, final loss as the mean over all losses over all (B,T) positions in the batch float dloss_mean = 1.0f / (B*T); - for (int i = 0; i < B*T; i++) { grads_acts.losses[i] = dloss_mean; } - - crossentropy_softmax_backward(ctx, grads_acts.logits, grads_acts.losses, acts.probs, model->targets, B, T, V, Vp); - matmul_backward(ctx, grads_acts.lnf, grads.wte, NULL, grads_acts.logits, acts.lnf, params.wte, B, T, C, Vp); - float* residual = acts.residual3 + (L-1) * B * T * C; // last layer's residual - float* dresidual = grads_acts.residual3 + (L-1) * B * T * C; // write to last layer's residual - layernorm_backward(ctx, dresidual, grads.lnfw, grads.lnfb, grads_acts.lnf, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C); + for (int i = 0; i < B*T; i++) { grads_acts.losses.data[i] = dloss_mean; } + toGPU(ctx, grads_acts.losses.data, model->acts_.data[22]); + + // crossentropy_softmax_backward(ctx, grads_acts.logits, grads_acts.losses, acts.probs, model->targets, B, T, V, Vp); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.crossentropy_softmax_backward, promise); + wait(ctx, future); + } + // matmul_backward(ctx, grads_acts.lnf, grads.wte, NULL, grads_acts.logits, acts.lnf, params.wte, B, T, C, Vp); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.matmul_final_backward, promise); + wait(ctx, future); + } + // layernorm_backward(ctx, dresidual, grads.lnfw, grads.lnfb, grads_acts.lnf, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm_final_backward, promise); + wait(ctx, future); + } for (int l = L-1; l >= 0; l--) { printf("Backward Pass Layer %d\n", l); - - residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; - dresidual = l == 0 ? 
grads_acts.encoded : grads_acts.residual3 + (l-1) * B * T * C; - - // get the pointers of the weights for this layer - float* l_ln1w = params.ln1w + l * C; - float* l_qkvw = params.qkvw + l * 3*C * C; - float* l_attprojw = params.attprojw + l * C * C; - float* l_ln2w = params.ln2w + l * C; - float* l_fcw = params.fcw + l * 4*C * C; - float* l_fcprojw = params.fcprojw + l * C * 4*C; - // get the pointers of the gradients of the weights for this layer - float* dl_ln1w = grads.ln1w + l * C; - float* dl_ln1b = grads.ln1b + l * C; - float* dl_qkvw = grads.qkvw + l * 3*C * C; - float* dl_qkvb = grads.qkvb + l * 3*C; - float* dl_attprojw = grads.attprojw + l * C * C; - float* dl_attprojb = grads.attprojb + l * C; - float* dl_ln2w = grads.ln2w + l * C; - float* dl_ln2b = grads.ln2b + l * C; - float* dl_fcw = grads.fcw + l * 4*C * C; - float* dl_fcb = grads.fcb + l * 4*C; - float* dl_fcprojw = grads.fcprojw + l * C * 4*C; - float* dl_fcprojb = grads.fcprojb + l * C; - // get the pointers of the activations for this layer - float* l_ln1 = acts.ln1 + l * B * T * C; - float* l_ln1_mean = acts.ln1_mean + l * B * T; - float* l_ln1_rstd = acts.ln1_rstd + l * B * T; - float* l_qkv = acts.qkv + l * B * T * 3*C; - float* l_atty = acts.atty + l * B * T * C; - float* l_att = acts.att + l * B * NH * T * T; - float* l_residual2 = acts.residual2 + l * B * T * C; - float* l_ln2 = acts.ln2 + l * B * T * C; - float* l_ln2_mean = acts.ln2_mean + l * B * T; - float* l_ln2_rstd = acts.ln2_rstd + l * B * T; - float* l_fch = acts.fch + l * B * T * 4*C; - float* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; - // get the pointers of the gradients of the activations for this layer - float* dl_ln1 = grads_acts.ln1 + l * B * T * C; - float* dl_qkv = grads_acts.qkv + l * B * T * 3*C; - float* dl_atty = grads_acts.atty + l * B * T * C; - float* dl_preatt = grads_acts.preatt + l * B * NH * T * T; - float* dl_att = grads_acts.att + l * B * NH * T * T; - float* dl_attproj = grads_acts.attproj + l * B * T * C; - float* dl_residual2 = grads_acts.residual2 + l * B * T * C; - float* dl_ln2 = grads_acts.ln2 + l * B * T * C; - float* dl_fch = grads_acts.fch + l * B * T * 4*C; - float* dl_fch_gelu = grads_acts.fch_gelu + l * B * T * 4*C; - float* dl_fcproj = grads_acts.fcproj + l * B * T * C; - float* dl_residual3 = grads_acts.residual3 + l * B * T * C; - // backprop this layer printf(" [Backward] : Residual2\n"); - residual_backward(ctx, dl_residual2, dl_fcproj, dl_residual3, B*T*C); + // residual_backward(ctx, dl_residual2, dl_fcproj, dl_residual3, B*T*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.residual2_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : FF Down \n"); - matmul_backward(ctx, dl_fch_gelu, dl_fcprojw, dl_fcprojb, dl_fcproj, l_fch_gelu, l_fcprojw, B, T, 4*C, C); + // matmul_backward(ctx, dl_fch_gelu, dl_fcprojw, dl_fcprojb, dl_fcproj, l_fch_gelu, l_fcprojw, B, T, 4*C, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.ff_down_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : GELU\n"); - gelu_backward(ctx, dl_fch, l_fch, dl_fch_gelu, B*T*4*C); + // gelu_backward(ctx, dl_fch, l_fch, dl_fch_gelu, B*T*4*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.gelu_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : FF Up\n"); - matmul_backward(ctx, dl_ln2, dl_fcw, dl_fcb, dl_fch, 
l_ln2, l_fcw, B, T, C, 4*C); + // matmul_backward(ctx, dl_ln2, dl_fcw, dl_fcb, dl_fch, l_ln2, l_fcw, B, T, C, 4*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.ff_up_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : LayerNorm2\n"); - layernorm_backward(ctx, dl_residual2, dl_ln2w, dl_ln2b, dl_ln2, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C); + // layernorm_backward(ctx, dl_residual2, dl_ln2w, dl_ln2b, dl_ln2, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm2_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : Residual1\n"); - residual_backward(ctx, dresidual, dl_attproj, dl_residual2, B*T*C); + // residual_backward(ctx, dresidual, dl_attproj, dl_residual2, B*T*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.residual_forward[l], promise); + wait(ctx, future); + } printf(" [Backward] : Attention Projection\n"); - matmul_backward(ctx, dl_atty, dl_attprojw, dl_attprojb, dl_attproj, l_atty, l_attprojw, B, T, C, C); + // matmul_backward(ctx, dl_atty, dl_attprojw, dl_attprojb, dl_attproj, l_atty, l_attprojw, B, T, C, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.attention_projection_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : Attention\n"); - attention_backward(ctx, dl_qkv, dl_preatt, dl_att, dl_atty, l_qkv, l_att, B, T, C, NH); + // attention_backward(ctx, dl_qkv, dl_preatt, dl_att, dl_atty, l_qkv, l_att, B, T, C, NH); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.attention_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : QKV Projection\n"); - matmul_backward(ctx, dl_ln1, dl_qkvw, dl_qkvb, dl_qkv, l_ln1, l_qkvw, B, T, C, 3*C); + // matmul_backward(ctx, dl_ln1, dl_qkvw, dl_qkvb, dl_qkv, l_ln1, l_qkvw, B, T, C, 3*C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.qkv_projection_backward[l], promise); + wait(ctx, future); + } printf(" [Backward] : LayerNorm1\n"); - layernorm_backward(ctx, dresidual, dl_ln1w, dl_ln1b, dl_ln1, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); + // layernorm_backward(ctx, dresidual, dl_ln1w, dl_ln1b, dl_ln1, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.layernorm1_backward[l], promise); + wait(ctx, future); + } + } + // encoder_backward(ctx, grads.wte, grads.wpe, grads_acts.encoded, model->inputs, B, T, C); + { + std::promise promise; + std::future future = promise.get_future(); + dispatchKernel(ctx, model->kernels.encoder_backward, promise); + wait(ctx, future); } - encoder_backward(ctx, grads.wte, grads.wpe, grads_acts.encoded, model->inputs, B, T, C); + toCPU(ctx, model->params_.data[0], model->grads.wte.data, model->param_sizes[0] * sizeof(float)); + toCPU(ctx, model->params_.data[1], model->grads.wpe.data, model->param_sizes[1] * sizeof(float)); } void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { @@ -635,6 +818,8 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo model->v_memory[i] = v; model->params_memory[i] -= 
learning_rate * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param); } + toGPU(ctx, model->params_memory, model->params_.data[0]); + toGPU(ctx, model->params_memory + model->param_sizes[0], model->params_.data[1]); } void gpt2_free(GPT2 *model) { @@ -688,9 +873,11 @@ int main() { gpu::Context ctx = gpu::createContext({}, {}, { .requiredLimits = &requiredLimits }); - // gpu::Context ctx = gpu::createContext(); + +Continue! - // build the GPT-2 model from a checkpoint +```cpp + // build the GPT-2 model from a checkpoint GPT2 model; gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin"); @@ -719,12 +906,11 @@ int main() { int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); const int genT = 64; // number of steps of inference we will do - // train struct timespec start, end; printf("Starting training\n"); for (int step = 0; step <= 40; step++) { - printf("Step %d\n", step); + printf("Step %d\n", step); // once in a while estimate the validation loss if (step % 10 == 0) { @@ -757,7 +943,9 @@ int main() { // we're in principle running B "inference streams" in parallel here // but only using position 0 // get the Vp-dimensional vector probs[0, t-1, :] - float* probs = model.acts.probs + (t-1) * model.config.padded_vocab_size; + float* probs = model.acts.probs.data + (t-1) * model.config.padded_vocab_size; + toCPU(ctx, model.acts_.data[21], probs, (t-1) * model.config.padded_vocab_size * sizeof(float)); + float coin = random_f32(&rng_state); // note we're only sampling from the first V elements, ignoring padding // (the probabilities in the padded region should be zero anyway) diff --git a/experimental/kernels/ops_aot.hpp b/experimental/kernels/ops_aot.hpp index 8ec6d8e..5db9ff7 100644 --- a/experimental/kernels/ops_aot.hpp +++ b/experimental/kernels/ops_aot.hpp @@ -5,10 +5,6 @@ using namespace gpu; -#ifdef __cplusplus -extern "C" { -#endif - #define VOCAB_SIZE 50257 // See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp @@ -101,8 +97,4 @@ Kernel crossentropy_softmax_backward(Context& ctx, Tensor& dlogits, Tensor& dlosses, Tensor& probs, Tensor& targets, int B, int T, int V, int Vp); -#ifdef __cplusplus -} -#endif - #endif // OPS_H From 1d8e43577deda96675dee68960b3dc315670f4ca Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Tue, 22 Oct 2024 13:08:06 +0900 Subject: [PATCH 3/7] Update --- experimental/kernels/gpt2_webgpu_aot.cpp | 31 +++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp index e0a1d54..c95ad3d 100644 --- a/experimental/kernels/gpt2_webgpu_aot.cpp +++ b/experimental/kernels/gpt2_webgpu_aot.cpp @@ -271,6 +271,9 @@ typedef struct { Tensor inputs; // the input tokens for the current forward pass Tensor targets; // the target tokens for the current forward pass float mean_loss; // after a forward pass with targets, will be populated with the mean loss + float* mean_loss_buffer; + + Tensor nullTensor; // kernels Kernels kernels; @@ -372,6 +375,8 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin model->batch_size = 0; model->seq_len = 0; model->mean_loss = -1.0f; // -1.0f will designate no loss + // Allocate B * C buffer for mean loss + model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len); printf("Model build complete\n"); @@ -474,6 +479,7 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si 
/*input=*/ model->acts.residual3[L-1], /*weight=*/ model->params.lnfw, /*bias=*/ model->params.lnfb, B, T, C); Tensor nullTensor = createTensor(ctx, Shape{1}, kf32); + model->nullTensor = nullTensor; kernels.matmul_final_forward = matmul_forward(ctx, model->acts.logits, model->acts.lnf, model->params.wte, nullTensor, B, T, C, Vp); kernels.softmax_final_forward = softmax_forward(ctx, model->acts.probs, model->acts.logits, B, T, V, Vp); kernels.crossentropy_softmax_backward = crossentropy_softmax_backward(ctx, model->acts.logits, model->acts.losses, model->acts.probs, targets, B, T, V, Vp); @@ -829,8 +835,9 @@ void gpt2_free(GPT2 *model) { free(model->v_memory); free(model->acts_memory); free(model->grads_acts_memory); - free(model->inputs); - free(model->targets); + // free(model->inputs); + // free(model->targets); + free(model->mean_loss_buffer); } #ifndef TESTING @@ -874,9 +881,6 @@ int main() { .requiredLimits = &requiredLimits }); -Continue! - -```cpp // build the GPT-2 model from a checkpoint GPT2 model; gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin"); @@ -903,11 +907,14 @@ Continue! // some memory for generating samples from the model uint64_t rng_state = 1337; - int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); + // int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); const int genT = 64; // number of steps of inference we will do // train struct timespec start, end; + Tensor inputs = createTensor(ctx, Shape{B, T}, ki32); + Tensor targets = createTensor(ctx, Shape{B, T}, ki32); + Tensor gen_tokens = createTensor(ctx, Shape{B, T}, ki32); printf("Starting training\n"); for (int step = 0; step <= 40; step++) { printf("Step %d\n", step); @@ -918,7 +925,9 @@ Continue! dataloader_reset(&val_loader); for (int i = 0; i < val_num_batches; i++) { dataloader_next_batch(&val_loader); - gpt2_forward(ctx, &model, val_loader.inputs, val_loader.targets, B, T); + toGPU(ctx, val_loader.inputs, inputs); + toGPU(ctx, val_loader.targets, targets); + gpt2_forward(ctx, &model, inputs, targets, B, T); val_loss += model.mean_loss; } val_loss /= val_num_batches; @@ -928,9 +937,7 @@ Continue! // once in a while do model inference to print generated text if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation - for(int i = 0; i < B * T; ++i) { - gen_tokens[i] = tokenizer.eot_token; - } + toGPU(ctx, tokenizer.eot_token, gen_tokens); // now sample from the model autoregressively printf("generating:\n---\n"); for (int t = 1; t < genT; t++) { @@ -938,7 +945,7 @@ Continue! // we re-calculate the forward pass for all of (B,T) positions from scratch // but the inference here is just for sanity checking anyway // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(ctx, &model, gen_tokens, NULL, B, T); + gpt2_forward(ctx, &model, gen_tokens, model.nullTensor, B, T); // furthermore, below we're only using b=0 (i.e. the first row) of all B rows // we're in principle running B "inference streams" in parallel here // but only using position 0 @@ -981,7 +988,7 @@ Continue! 
    dataloader_free(&val_loader);
    tokenizer_free(&tokenizer);
    gpt2_free(&model);
-   free(gen_tokens);
+   // free(gen_tokens);
    return 0;
 }
 #endif

From 49859306cf44d4db408f949b1ea61068d91557f7 Mon Sep 17 00:00:00 2001
From: Junji Hashimoto
Date: Wed, 23 Oct 2024 02:46:41 +0900
Subject: [PATCH 4/7] Update

---
 experimental/kernels/gpt2_webgpu_aot.cpp | 128 +++++++++++++++++++----
 1 file changed, 107 insertions(+), 21 deletions(-)

diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp
index c95ad3d..cd474f9 100644
--- a/experimental/kernels/gpt2_webgpu_aot.cpp
+++ b/experimental/kernels/gpt2_webgpu_aot.cpp
@@ -272,6 +272,7 @@ typedef struct {
   Tensor targets; // the target tokens for the current forward pass
   float mean_loss; // after a forward pass with targets, will be populated with the mean loss
   float* mean_loss_buffer;
+  float* probs_buffer;
 
   Tensor nullTensor;
 
@@ -377,6 +378,7 @@ void gpt2_build_from_checkpoin
   model->mean_loss = -1.0f; // -1.0f will designate no loss
   // Allocate B * C buffer for mean loss
   model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len);
+  model->probs_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len * Vp);
 
   printf("Model build complete\n");
 
@@ -616,7 +618,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
   printf("Crossentropy\n");
   // also forward the cross-entropy loss function if we have the targets
-  // if (targets != NULL) {
+  // When the targets tensor's shape is (1), it means we don't have targets
+  if (targets.shape[0] != 1) {
     // crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp);
     {
       std::promise<void> promise;
@@ -627,13 +630,14 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
     // for convenience also evaluate the mean loss
     float mean_loss = 0.0f;
     //toCPU(ctx, model->acts_.data[22], model->acts.losses.data, model->act_sizes[22] * sizeof(float));
-    for (int i=0; i<B*T; i++) { mean_loss += model->acts.losses.data[i]; }
+    toCPU(ctx, model->acts.losses, model->mean_loss_buffer, B*T * sizeof(float));
+    for (int i=0; i<B*T; i++) { mean_loss += model->mean_loss_buffer[i]; }
     mean_loss /= B*T;
     model->mean_loss = mean_loss;
-    // } else {
-    //   // if we don't have targets, we don't have a loss
-    //   model->mean_loss = -1.0f;
-    // }
+  } else {
+    // if we don't have targets, we don't have a loss
+    model->mean_loss = -1.0f;
+  }
   printf("Forward pass done\n");
 }
 
@@ -654,8 +658,8 @@ void gpt2_backward(Context& ctx, GPT2 *model) {
   // lazily allocate the memory for gradients of the weights and activations, if needed
   if (model->grads_memory == NULL) {
     printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f));
-    malloc_and_point_parameters(&model->grads, model->param_sizes);
-    malloc_and_point_activations(&model->grads_acts, model->act_sizes);
+    malloc_and_point_parameters(ctx, &model->grads, model->param_sizes);
+    malloc_and_point_activations(ctx, &model->grads_acts, model->act_sizes);
     gpt2_zero_grad(model);
   }
 
@@ -678,8 +682,9 @@ void gpt2_backward(Context& ctx, GPT2 *model) {
   // technically this is a small, inline backward() pass of calculating
   // total, final loss as the mean over all losses over all (B,T) positions in the batch
   float dloss_mean = 1.0f / (B*T);
-  for (int i = 0; i < B*T; i++) { grads_acts.losses.data[i] = dloss_mean; }
-  toGPU(ctx, grads_acts.losses.data, model->acts_.data[22]);
+  for (int i = 0; i < B*T; i++) { model->mean_loss_buffer[i] =
dloss_mean; } + toGPU(ctx, model->mean_loss_buffer, model->acts.losses); + //toGPU(ctx, grads_acts.losses.data, model->acts_.data[22]); // crossentropy_softmax_backward(ctx, grads_acts.logits, grads_acts.losses, acts.probs, model->targets, B, T, V, Vp); { @@ -794,11 +799,11 @@ void gpt2_backward(Context& ctx, GPT2 *model) { dispatchKernel(ctx, model->kernels.encoder_backward, promise); wait(ctx, future); } - toCPU(ctx, model->params_.data[0], model->grads.wte.data, model->param_sizes[0] * sizeof(float)); - toCPU(ctx, model->params_.data[1], model->grads.wpe.data, model->param_sizes[1] * sizeof(float)); + // toCPU(ctx, model->params_.data[0], model->grads.wte.data, model->param_sizes[0] * sizeof(float)); + // toCPU(ctx, model->params_.data[1], model->grads.wpe.data, model->param_sizes[1] * sizeof(float)); } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { +void gpt2_update(Context& ctx, GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html // lazily allocate the memory for m_memory and v_memory @@ -807,6 +812,45 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo model->v_memory = (float*)calloc(model->num_parameters, sizeof(float)); } + // Copy the parameters to the CPU + float* iter = model->params_memory; + toCPU(ctx, model->params.wte, iter, model->param_sizes[0] * sizeof(float)); + iter += model->param_sizes[0]; + toCPU(ctx, model->params.wpe, iter, model->param_sizes[1] * sizeof(float)); + iter += model->param_sizes[1]; + size_t L = model->config.num_layers; + for (int l = 0; l < L; l++) { + toCPU(ctx, model->params.ln1w[l], iter, model->param_sizes[2]/L * sizeof(float)); + iter += model->param_sizes[2]/L; + toCPU(ctx, model->params.ln1b[l], iter, model->param_sizes[3]/L * sizeof(float)); + iter += model->param_sizes[3]/L; + toCPU(ctx, model->params.qkvw[l], iter, model->param_sizes[4]/L * sizeof(float)); + iter += model->param_sizes[4]/L; + toCPU(ctx, model->params.qkvb[l], iter, model->param_sizes[5]/L * sizeof(float)); + iter += model->param_sizes[5]/L; + toCPU(ctx, model->params.attprojw[l], iter, model->param_sizes[6]/L * sizeof(float)); + iter += model->param_sizes[6]/L; + toCPU(ctx, model->params.attprojb[l], iter, model->param_sizes[7]/L * sizeof(float)); + iter += model->param_sizes[7]/L; + toCPU(ctx, model->params.ln2w[l], iter, model->param_sizes[8]/L * sizeof(float)); + iter += model->param_sizes[8]/L; + toCPU(ctx, model->params.ln2b[l], iter, model->param_sizes[9]/L * sizeof(float)); + iter += model->param_sizes[9]/L; + toCPU(ctx, model->params.fcw[l], iter, model->param_sizes[10]/L * sizeof(float)); + iter += model->param_sizes[10]/L; + toCPU(ctx, model->params.fcb[l], iter, model->param_sizes[11]/L * sizeof(float)); + iter += model->param_sizes[11]/L; + toCPU(ctx, model->params.fcprojw[l], iter, model->param_sizes[12]/L * sizeof(float)); + iter += model->param_sizes[12]/L; + toCPU(ctx, model->params.fcprojb[l], iter, model->param_sizes[13]/L * sizeof(float)); + iter += model->param_sizes[13]/L; + } + toCPU(ctx, model->params.lnfw, iter, model->param_sizes[14] * sizeof(float)); + iter += model->param_sizes[14]; + toCPU(ctx, model->params.lnfb, iter, model->param_sizes[15] * sizeof(float)); + iter += model->param_sizes[15]; + + for (size_t i = 0; i < model->num_parameters; i++) { float param = model->params_memory[i]; float grad = 
model->grads_memory[i]; @@ -824,8 +868,43 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo model->v_memory[i] = v; model->params_memory[i] -= learning_rate * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param); } - toGPU(ctx, model->params_memory, model->params_.data[0]); - toGPU(ctx, model->params_memory + model->param_sizes[0], model->params_.data[1]); + // toGPU(ctx, model->params_memory, model->params_.data[0]); + // toGPU(ctx, model->params_memory + model->param_sizes[0], model->params_.data[1]); + iter = model->params_memory; + toGPU(ctx, iter, model->params.wte); + iter += model->param_sizes[0]; + toGPU(ctx, iter, model->params.wpe); + iter += model->param_sizes[1]; + for (int l = 0; l < L; l++) { + toGPU(ctx, iter, model->params.ln1w[l]); + iter += model->param_sizes[2]/L; + toGPU(ctx, iter, model->params.ln1b[l]); + iter += model->param_sizes[3]/L; + toGPU(ctx, iter, model->params.qkvw[l]); + iter += model->param_sizes[4]/L; + toGPU(ctx, iter, model->params.qkvb[l]); + iter += model->param_sizes[5]/L; + toGPU(ctx, iter, model->params.attprojw[l]); + iter += model->param_sizes[6]/L; + toGPU(ctx, iter, model->params.attprojb[l]); + iter += model->param_sizes[7]/L; + toGPU(ctx, iter, model->params.ln2w[l]); + iter += model->param_sizes[8]/L; + toGPU(ctx, iter, model->params.ln2b[l]); + iter += model->param_sizes[9]/L; + toGPU(ctx, iter, model->params.fcw[l]); + iter += model->param_sizes[10]/L; + toGPU(ctx, iter, model->params.fcb[l]); + iter += model->param_sizes[11]/L; + toGPU(ctx, iter, model->params.fcprojw[l]); + iter += model->param_sizes[12]/L; + toGPU(ctx, iter, model->params.fcprojb[l]); + iter += model->param_sizes[13]/L; + } + toGPU(ctx, iter, model->params.lnfw); + iter += model->param_sizes[14]; + toGPU(ctx, iter, model->params.lnfb); + iter += model->param_sizes[15]; } void gpt2_free(GPT2 *model) { @@ -915,6 +994,7 @@ int main() { Tensor inputs = createTensor(ctx, Shape{B, T}, ki32); Tensor targets = createTensor(ctx, Shape{B, T}, ki32); Tensor gen_tokens = createTensor(ctx, Shape{B, T}, ki32); + int* gen_tokens_cpu = (int*)mallocCheck(B * T * sizeof(int)); printf("Starting training\n"); for (int step = 0; step <= 40; step++) { printf("Step %d\n", step); @@ -937,7 +1017,10 @@ int main() { // once in a while do model inference to print generated text if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation - toGPU(ctx, tokenizer.eot_token, gen_tokens); + for(int i = 0; i < B * T; ++i) { + gen_tokens_cpu[i] = tokenizer.eot_token; + } + toGPU(ctx, gen_tokens_cpu, gen_tokens); // now sample from the model autoregressively printf("generating:\n---\n"); for (int t = 1; t < genT; t++) { @@ -950,14 +1033,15 @@ int main() { // we're in principle running B "inference streams" in parallel here // but only using position 0 // get the Vp-dimensional vector probs[0, t-1, :] - float* probs = model.acts.probs.data + (t-1) * model.config.padded_vocab_size; - toCPU(ctx, model.acts_.data[21], probs, (t-1) * model.config.padded_vocab_size * sizeof(float)); + toCPU(ctx, model.acts.probs, model.probs_buffer, B * T * model.config.padded_vocab_size * sizeof(float)); + float* probs = model.probs_buffer + (t-1) * model.config.padded_vocab_size; float coin = random_f32(&rng_state); // note we're only sampling from the first V elements, ignoring padding // (the probabilities in the padded region should be zero anyway) int next_token = sample_mult(probs, model.config.vocab_size, coin); - gen_tokens[t] = 
next_token;
+      gen_tokens_cpu[t] = next_token;
+      toGPU(ctx, gen_tokens_cpu, gen_tokens);
       // print the generated token, either using the Tokenizer or a fallback
       if (tokenizer.init_ok) {
         const char* token_str = tokenizer_decode(&tokenizer, next_token);
@@ -974,10 +1058,12 @@
     // do a training step
     clock_gettime(CLOCK_MONOTONIC, &start);
     dataloader_next_batch(&train_loader);
-    gpt2_forward(ctx, &model, train_loader.inputs, train_loader.targets, B, T);
+    toGPU(ctx, train_loader.inputs, inputs);
+    toGPU(ctx, train_loader.targets, targets);
+    gpt2_forward(ctx, &model, inputs, targets, B, T);
     gpt2_zero_grad(&model);
     gpt2_backward(ctx, &model);
-    gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
+    gpt2_update(ctx, &model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
     clock_gettime(CLOCK_MONOTONIC, &end);
     double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
     printf("step %d: train loss %f (took %f ms)\n", step, model.mean_loss, time_elapsed_s * 1000);

From 7ef40b097cd7c942f195d9894ca6ae8a7b5bc4b7 Mon Sep 17 00:00:00 2001
From: Junji Hashimoto
Date: Tue, 5 Nov 2024 01:41:44 +0900
Subject: [PATCH 5/7] Add a flag to disable backward-pass

---
 experimental/kernels/gpt2_webgpu_aot.cpp | 30 +++++++++++++++---------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp
index cd474f9..2190a7a 100644
--- a/experimental/kernels/gpt2_webgpu_aot.cpp
+++ b/experimental/kernels/gpt2_webgpu_aot.cpp
@@ -278,6 +278,7 @@ typedef struct {
 
   // kernels
   Kernels kernels;
+  bool backward_enabled;
 } GPT2;
 
 void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoint_path) {
@@ -379,6 +380,7 @@ void gpt2_build_from_checkpoin
   // Allocate B * C buffer for mean loss
   model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len);
   model->probs_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len * Vp);
+  model->backward_enabled = false;
 
   printf("Model build complete\n");
 
@@ -476,7 +478,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
     kernels.crossentropy_forward = crossentropy_forward(ctx, model->acts.losses, model->acts.probs, targets, B, T, Vp);
 
     kernels.encoder_forward = encoder_forward(ctx, model->acts.encoded, inputs, model->params.wte, model->params.wpe, B, T, C); // encoding goes into residual[0]
-    kernels.encoder_backward = encoder_backward(ctx, model->params.wte, model->params.wpe, model->acts.encoded, inputs, B, T, C);
+    if(model->backward_enabled)
+      kernels.encoder_backward = encoder_backward(ctx, model->params.wte, model->params.wpe, model->acts.encoded, inputs, B, T, C);
     kernels.layernorm_final_forward = layernorm_forward(ctx, model->acts.lnf, model->acts.lnf_mean, model->acts.lnf_rstd,
                                                         /*input=*/ model->acts.residual3[L-1], /*weight=*/ model->params.lnfw, /*bias=*/ model->params.lnfb,
                                                         B, T, C);
@@ -484,12 +487,15 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
     Tensor nullTensor = createTensor(ctx, Shape{1}, kf32);
     model->nullTensor = nullTensor;
     kernels.matmul_final_forward = matmul_forward(ctx, model->acts.logits, model->acts.lnf, model->params.wte, nullTensor, B, T, C, Vp);
     kernels.softmax_final_forward = softmax_forward(ctx, model->acts.probs, model->acts.logits, B, T, V, Vp);
-    kernels.crossentropy_softmax_backward = crossentropy_softmax_backward(ctx, model->acts.logits, model->acts.losses, model->acts.probs, targets,
B, T, V, Vp); - kernels.matmul_final_backward = matmul_backward(ctx, model->acts.lnf, model->params.wte, nullTensor, model->acts.logits, - model->acts.lnf, model->params.wte, B, T, C, Vp); - kernels.layernorm_final_backward = layernorm_backward(ctx, model->acts.residual3[L-1], model->params.lnfw, model->params.lnfb, - model->acts.lnf, model->acts.residual3[L-1], model->params.lnfw, - model->acts.lnf_mean, model->acts.lnf_rstd, B, T, C); + if(model->backward_enabled) + kernels.crossentropy_softmax_backward = crossentropy_softmax_backward(ctx, model->acts.logits, model->acts.losses, model->acts.probs, targets, B, T, V, Vp); + if(model->backward_enabled) + kernels.matmul_final_backward = matmul_backward(ctx, model->acts.lnf, model->params.wte, nullTensor, model->acts.logits, + model->acts.lnf, model->params.wte, B, T, C, Vp); + if(model->backward_enabled) + kernels.layernorm_final_backward = layernorm_backward(ctx, model->acts.residual3[L-1], model->params.lnfw, model->params.lnfb, + model->acts.lnf, model->acts.residual3[L-1], model->params.lnfw, + model->acts.lnf_mean, model->acts.lnf_rstd, B, T, C); printf("Created Kernels\n"); } @@ -557,7 +563,7 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si { std::promise promise; std::future future = promise.get_future(); - dispatchKernel(ctx, model->kernels.layernorm2_backward[l], promise); + dispatchKernel(ctx, model->kernels.layernorm_forward[l], promise); wait(ctx, future); } printf(" [Forward] : FF Up\n"); @@ -1061,9 +1067,11 @@ int main() { toGPU(ctx, train_loader.inputs, inputs); toGPU(ctx, train_loader.targets, targets); gpt2_forward(ctx, &model, inputs, targets, B, T); - gpt2_zero_grad(&model); - gpt2_backward(ctx, &model); - gpt2_update(ctx, &model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); + if (model.backward_enabled) { + gpt2_zero_grad(&model); + gpt2_backward(ctx, &model); + gpt2_update(ctx, &model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); + } clock_gettime(CLOCK_MONOTONIC, &end); double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; printf("step %d: train loss %f (took %f ms)\n", step, model.mean_loss, time_elapsed_s * 1000); From 6be7e1e0462d819712bf43570f55854a54fbc8f0 Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Sat, 16 Nov 2024 14:17:59 +0900 Subject: [PATCH 6/7] Fix the bug of memory allocation --- experimental/kernels/Makefile | 4 ++-- experimental/kernels/gpt2_webgpu_aot.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/experimental/kernels/Makefile b/experimental/kernels/Makefile index 7430a71..aa34e97 100644 --- a/experimental/kernels/Makefile +++ b/experimental/kernels/Makefile @@ -16,7 +16,7 @@ CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_l CFLAGS=-Ofast -march=native -I. -Iunittest_llmc # CFLAGS=-O2 -march=native -I. 
-Iunittest_llmc -LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -ldawn +LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -ldawn -fsanitize=address FLAGS=$(CXXFLAGS) $(LDFLAGS) ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists) @@ -101,7 +101,7 @@ build/gpt2_webgpu: llm.c gpt2_124M.bin llm.c gpt2_webgpu.cpp ops.cpp build/gpt2_webgpu_aot: llm.c gpt2_124M.bin llm.c gpt2_webgpu_aot.cpp ops_aot.cpp mkdir -p build - $(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu_aot.cpp ops_aot.cpp + $(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu_aot.cpp ops_aot.cpp -g build/gpt2_webgpu.html: check-emsdk gpt2_webgpu.cpp term.html llm.c em++ gpt2_webgpu.cpp ops.cpp \ diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp index 2190a7a..1e7043f 100644 --- a/experimental/kernels/gpt2_webgpu_aot.cpp +++ b/experimental/kernels/gpt2_webgpu_aot.cpp @@ -377,9 +377,8 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin model->batch_size = 0; model->seq_len = 0; model->mean_loss = -1.0f; // -1.0f will designate no loss - // Allocate B * C buffer for mean loss - model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len); - model->probs_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len * Vp); + model->mean_loss_buffer = NULL; + model->probs_buffer = NULL; model->backward_enabled = false; printf("Model build complete\n"); @@ -418,6 +417,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si model->seq_len = T; // and now allocate the space fill_in_activation_sizes(model->act_sizes, model->config, B, T); + model->mean_loss_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len); + model->probs_buffer = (float*)mallocCheck(sizeof(float) * model->batch_size * model->seq_len * Vp); // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations size_t num_activations = 0; @@ -635,7 +636,6 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si } // for convenience also evaluate the mean loss float mean_loss = 0.0f; - //toCPU(ctx, model->acts_.data[22], model->acts.losses.data, model->act_sizes[22] * sizeof(float)); toCPU(ctx, model->acts.losses, model->mean_loss_buffer, B*T * sizeof(float)); for (int i=0; imean_loss_buffer[i]; } mean_loss /= B*T; From f629a335aef7f8f27cc27d1811f3c09679e3bfb5 Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Sun, 17 Nov 2024 04:36:43 +0900 Subject: [PATCH 7/7] Remove NUM_PARAMETER_LAYERS --- experimental/kernels/gpt2_webgpu_aot.cpp | 113 ++++++++++++++--------- 1 file changed, 67 insertions(+), 46 deletions(-) diff --git a/experimental/kernels/gpt2_webgpu_aot.cpp b/experimental/kernels/gpt2_webgpu_aot.cpp index 1e7043f..966fb7a 100644 --- a/experimental/kernels/gpt2_webgpu_aot.cpp +++ b/experimental/kernels/gpt2_webgpu_aot.cpp @@ -47,7 +47,6 @@ typedef struct { // the parameters of the model #define NUM_PARAMETER_TENSORS 16 -#define NUM_PARAMETER_LAYERS 12 typedef struct { Tensor wte; // (V, C) Tensor wpe; // (maxT, C) @@ -91,22 +90,36 @@ void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) { } // allocate memory for the parameters and point the individual tensors to the right places -void malloc_and_point_parameters(Context& ctx, ParameterTensors* params, size_t* param_sizes) { +void malloc_and_point_parameters(Context& ctx, GPT2Config config, ParameterTensors* params, size_t* 
param_sizes) { + size_t L = config.num_layers; params->wte = createTensor(ctx, Shape{param_sizes[0]}, kf32); params->wpe = createTensor(ctx, Shape{param_sizes[1]}, kf32); - for(int l = 0; l < NUM_PARAMETER_LAYERS; l++) { - params->ln1w.push_back(createTensor(ctx, Shape{param_sizes[2]/NUM_PARAMETER_LAYERS}, kf32)); - params->ln1b.push_back(createTensor(ctx, Shape{param_sizes[3]/NUM_PARAMETER_LAYERS}, kf32)); - params->qkvw.push_back(createTensor(ctx, Shape{param_sizes[4]/NUM_PARAMETER_LAYERS}, kf32)); - params->qkvb.push_back(createTensor(ctx, Shape{param_sizes[5]/NUM_PARAMETER_LAYERS}, kf32)); - params->attprojw.push_back(createTensor(ctx, Shape{param_sizes[6]/NUM_PARAMETER_LAYERS}, kf32)); - params->attprojb.push_back(createTensor(ctx, Shape{param_sizes[7]/NUM_PARAMETER_LAYERS}, kf32)); - params->ln2w.push_back(createTensor(ctx, Shape{param_sizes[8]/NUM_PARAMETER_LAYERS}, kf32)); - params->ln2b.push_back(createTensor(ctx, Shape{param_sizes[9]/NUM_PARAMETER_LAYERS}, kf32)); - params->fcw.push_back(createTensor(ctx, Shape{param_sizes[10]/NUM_PARAMETER_LAYERS}, kf32)); - params->fcb.push_back(createTensor(ctx, Shape{param_sizes[11]/NUM_PARAMETER_LAYERS}, kf32)); - params->fcprojw.push_back(createTensor(ctx, Shape{param_sizes[12]/NUM_PARAMETER_LAYERS}, kf32)); - params->fcprojb.push_back(createTensor(ctx, Shape{param_sizes[13]/NUM_PARAMETER_LAYERS}, kf32)); + + params->ln1w.resize(L); + params->ln1b.resize(L); + params->qkvw.resize(L); + params->qkvb.resize(L); + params->attprojw.resize(L); + params->attprojb.resize(L); + params->ln2w.resize(L); + params->ln2b.resize(L); + params->fcw.resize(L); + params->fcb.resize(L); + params->fcprojw.resize(L); + params->fcprojb.resize(L); + for(int l = 0; l < L ; l++) { + params->ln1w[l] = createTensor(ctx, Shape{param_sizes[2]/config.num_layers}, kf32); + params->ln1b[l] = createTensor(ctx, Shape{param_sizes[3]/config.num_layers}, kf32); + params->qkvw[l] = createTensor(ctx, Shape{param_sizes[4]/config.num_layers}, kf32); + params->qkvb[l] = createTensor(ctx, Shape{param_sizes[5]/config.num_layers}, kf32); + params->attprojw[l] = createTensor(ctx, Shape{param_sizes[6]/config.num_layers}, kf32); + params->attprojb[l] = createTensor(ctx, Shape{param_sizes[7]/config.num_layers}, kf32); + params->ln2w[l] = createTensor(ctx, Shape{param_sizes[8]/config.num_layers}, kf32); + params->ln2b[l] = createTensor(ctx, Shape{param_sizes[9]/config.num_layers}, kf32); + params->fcw[l] = createTensor(ctx, Shape{param_sizes[10]/config.num_layers}, kf32); + params->fcb[l] = createTensor(ctx, Shape{param_sizes[11]/config.num_layers}, kf32); + params->fcprojw[l] = createTensor(ctx, Shape{param_sizes[12]/config.num_layers}, kf32); + params->fcprojb[l] = createTensor(ctx, Shape{param_sizes[13]/config.num_layers}, kf32); } params->lnfw = createTensor(ctx, Shape{param_sizes[14]}, kf32); params->lnfb = createTensor(ctx, Shape{param_sizes[15]}, kf32); @@ -201,25 +214,42 @@ void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T act_sizes[22] = B * T; // losses } -void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t* act_sizes) { +void malloc_and_point_activations(Context& ctx, GPT2Config config, ActivationTensors* acts, size_t* act_sizes) { + size_t L = config.num_layers; acts->encoded = createTensor(ctx, Shape{act_sizes[0]}, kf32); - for (int l = 0; l < NUM_PARAMETER_LAYERS; l++) { - acts->ln1.push_back(createTensor(ctx, Shape{act_sizes[1]/NUM_PARAMETER_LAYERS}, kf32)); - acts->ln1_mean.push_back(createTensor(ctx, 
Shape{act_sizes[2]/NUM_PARAMETER_LAYERS}, kf32)); - acts->ln1_rstd.push_back(createTensor(ctx, Shape{act_sizes[3]/NUM_PARAMETER_LAYERS}, kf32)); - acts->qkv.push_back(createTensor(ctx, Shape{act_sizes[4]/NUM_PARAMETER_LAYERS}, kf32)); - acts->atty.push_back(createTensor(ctx, Shape{act_sizes[5]/NUM_PARAMETER_LAYERS}, kf32)); - acts->preatt.push_back(createTensor(ctx, Shape{act_sizes[6]/NUM_PARAMETER_LAYERS}, kf32)); - acts->att.push_back(createTensor(ctx, Shape{act_sizes[7]/NUM_PARAMETER_LAYERS}, kf32)); - acts->attproj.push_back(createTensor(ctx, Shape{act_sizes[8]/NUM_PARAMETER_LAYERS}, kf32)); - acts->residual2.push_back(createTensor(ctx, Shape{act_sizes[9]/NUM_PARAMETER_LAYERS}, kf32)); - acts->ln2.push_back(createTensor(ctx, Shape{act_sizes[10]/NUM_PARAMETER_LAYERS}, kf32)); - acts->ln2_mean.push_back(createTensor(ctx, Shape{act_sizes[11]/NUM_PARAMETER_LAYERS}, kf32)); - acts->ln2_rstd.push_back(createTensor(ctx, Shape{act_sizes[12]/NUM_PARAMETER_LAYERS}, kf32)); - acts->fch.push_back(createTensor(ctx, Shape{act_sizes[13]/NUM_PARAMETER_LAYERS}, kf32)); - acts->fch_gelu.push_back(createTensor(ctx, Shape{act_sizes[14]/NUM_PARAMETER_LAYERS}, kf32)); - acts->fcproj.push_back(createTensor(ctx, Shape{act_sizes[15]/NUM_PARAMETER_LAYERS}, kf32)); - acts->residual3.push_back(createTensor(ctx, Shape{act_sizes[16]/NUM_PARAMETER_LAYERS}, kf32)); + acts->ln1.resize(L); + acts->ln1_mean.resize(L); + acts->ln1_rstd.resize(L); + acts->qkv.resize(L); + acts->atty.resize(L); + acts->preatt.resize(L); + acts->att.resize(L); + acts->attproj.resize(L); + acts->residual2.resize(L); + acts->ln2.resize(L); + acts->ln2_mean.resize(L); + acts->ln2_rstd.resize(L); + acts->fch.resize(L); + acts->fch_gelu.resize(L); + acts->fcproj.resize(L); + acts->residual3.resize(L); + for (int l = 0; l < L; l++) { + acts->ln1[l] = createTensor(ctx, Shape{act_sizes[1]/config.num_layers}, kf32); + acts->ln1_mean[l] = createTensor(ctx, Shape{act_sizes[2]/config.num_layers}, kf32); + acts->ln1_rstd[l] = createTensor(ctx, Shape{act_sizes[3]/config.num_layers}, kf32); + acts->qkv[l] = createTensor(ctx, Shape{act_sizes[4]/config.num_layers}, kf32); + acts->atty[l] = createTensor(ctx, Shape{act_sizes[5]/config.num_layers}, kf32); + acts->preatt[l] = createTensor(ctx, Shape{act_sizes[6]/config.num_layers}, kf32); + acts->att[l] = createTensor(ctx, Shape{act_sizes[7]/config.num_layers}, kf32); + acts->attproj[l] = createTensor(ctx, Shape{act_sizes[8]/config.num_layers}, kf32); + acts->residual2[l] = createTensor(ctx, Shape{act_sizes[9]/config.num_layers}, kf32); + acts->ln2[l] = createTensor(ctx, Shape{act_sizes[10]/config.num_layers}, kf32); + acts->ln2_mean[l] = createTensor(ctx, Shape{act_sizes[11]/config.num_layers}, kf32); + acts->ln2_rstd[l] = createTensor(ctx, Shape{act_sizes[12]/config.num_layers}, kf32); + acts->fch[l] = createTensor(ctx, Shape{act_sizes[13]/config.num_layers}, kf32); + acts->fch_gelu[l] = createTensor(ctx, Shape{act_sizes[14]/config.num_layers}, kf32); + acts->fcproj[l] = createTensor(ctx, Shape{act_sizes[15]/config.num_layers}, kf32); + acts->residual3[l] = createTensor(ctx, Shape{act_sizes[16]/config.num_layers}, kf32); } acts->lnf = createTensor(ctx, Shape{act_sizes[17]}, kf32); acts->lnf_mean = createTensor(ctx, Shape{act_sizes[18]}, kf32); @@ -229,15 +259,6 @@ void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t* acts->losses = createTensor(ctx, Shape{act_sizes[22]}, kf32); } -struct GPUParameters { - Tensor data[NUM_PARAMETER_TENSORS]; -}; - -struct GPUActivations { - Tensor 
data[NUM_ACTIVATION_TENSORS]; -}; - - void gpu_alloc(Context& ctx, Tensor* tensors, size_t* sizes, size_t n) { for (size_t i = 0; i < n; i++) { tensors[i] = createTensor(ctx, Shape{sizes[i]}, kf32); @@ -325,7 +346,7 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin model->num_parameters = num_parameters; // read in all the parameters from file - malloc_and_point_parameters(ctx, &model->params, model->param_sizes); + malloc_and_point_parameters(ctx, model->config, &model->params, model->param_sizes); model->params_memory = (float*)mallocCheck(num_parameters * sizeof(float)); freadCheck(model->params_memory, sizeof(float), num_parameters, model_file); fcloseCheck(model_file); @@ -428,7 +449,7 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si printf("num_activations: %zu\n", num_activations); model->num_activations = num_activations; printf("Allocating %.2f MB for activations\n", num_activations * sizeof(float) / (1024.0f * 1024.0f)); - malloc_and_point_activations(ctx, &model->acts, model->act_sizes); + malloc_and_point_activations(ctx, model->config, &model->acts, model->act_sizes); // also create memory for caching inputs and targets //model->inputs = (int*)mallocCheck(B * T * sizeof(int)); //model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small @@ -664,8 +685,8 @@ void gpt2_backward(Context& ctx, GPT2 *model) { // lazily allocate the memory for gradients of the weights and activations, if needed if (model->grads_memory == NULL) { printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f)); - malloc_and_point_parameters(ctx, &model->grads, model->param_sizes); - malloc_and_point_activations(ctx, &model->grads_acts, model->act_sizes); + malloc_and_point_parameters(ctx, model->config, &model->grads, model->param_sizes); + malloc_and_point_activations(ctx, model->config, &model->grads_acts, model->act_sizes); gpt2_zero_grad(model); }