Skip to content

Commit d2af304

Browse files
committed
GPU allocation test in preparation for model preallocation implementation
1 parent df2ad57 commit d2af304

File tree

3 files changed

+43
-73
lines changed

3 files changed

+43
-73
lines changed

experimental/kernels/Makefile

+8-13
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ else
1212
endif
1313

1414
# ASYNCIFY allows emscripten to sleep
15-
EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -Illm.c -s USE_WEBGPU=1 -s ASYNCIFY=1 -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
15+
EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -I$(GPUCPP)/third_party/llm.c -s USE_WEBGPU=1 -s ASYNCIFY=1 -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
1616
CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_llmc
1717
CFLAGS=-Ofast -march=native -I. -Iunittest_llmc
1818
# CFLAGS=-O2 -march=native -I. -Iunittest_llmc
@@ -35,12 +35,13 @@ run_llm.c: ./build/test_gpt2 dawnlib
3535
$(LIBSPEC) && $<
3636

3737
run_llm.c_train: ./build/train_gpt2 dawnlib
38-
if [ ! -d dev ] ; then ln -s ./llm.c/dev ; fi
39-
if [ ! -f gpt2_tokenizer.bin ] ; then ln -s ./llm.c/gpt2_tokenizer.bin ; fi
38+
if [ ! -d dev ] ; then ln -s $(GPUCPP)/third_party/llm.c/dev ; fi
39+
if [ ! -f gpt2_tokenizer.bin ] ; then ln -s $(GPUCPP)/third_party/llm.c/gpt2_tokenizer.bin ; fi
4040
$(LIBSPEC) && $<
4141

4242
llm.c:
43-
if [ ! -d llm.c ]; then git clone [email protected]:karpathy/llm.c.git ; fi
43+
# if [ ! -d llm.c ]; then git clone [email protected]:karpathy/llm.c.git ; fi
44+
ln -s $(GPUCPP)/third_party/llm.c
4445

4546
gpt2_124M.bin: llm.c
4647
if [ ! -f $@ ]; then ./llm.c/dev/download_starter_pack.sh ; \
@@ -90,14 +91,14 @@ build/train_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
9091
grep -q "^#include \"unittest_kernels.h\"" llm.c/train_gpt2.c || sed -i '1i#include \"unittest_kernels.h\"' llm.c/train_gpt2.c
9192
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/train_gpt2.c build/unittest_kernels.o
9293

93-
build/ops.o: ops.cpp ops.hpp kernels.h
94+
build/ops.o: ops.cpp ops.hpp kernels.h llm.c
9495
mkdir -p build && $(CXX) $(CXXFLAGS) -c -o $@ $<
9596

96-
build/gpt2_webgpu: llm.c gpt2_124M.bin
97+
build/gpt2_webgpu: llm.c gpt2_124M.bin llm.c
9798
mkdir -p build
9899
$(CC) $(CXXFLAGS) -Illm.c $(LDFLAGS) -o $@ gpt2_webgpu.cpp ops.cpp
99100

100-
build/gpt2_webgpu.html: check-emsdk gpt2_webgpu.cpp term.html
101+
build/gpt2_webgpu.html: check-emsdk gpt2_webgpu.cpp term.html llm.c
101102
em++ gpt2_webgpu.cpp ops.cpp \
102103
--preload-file gpt2_tokenizer.bin@/gpt2_tokenizer.bin \
103104
--preload-file gpt2_124M.bin@/gpt2_124M.bin \
@@ -134,12 +135,6 @@ dawnlib: $(if $(wildcard $(GPUCPP)/third_party/lib/libdawn.so $(GPUCPP)/third_pa
134135
run_setup: check-python
135136
cd $(GPUCPP) && python3 setup.py
136137

137-
#build/$(TARGET): run.cpp
138-
# mkdir -p build && $(CXX) $(FLAGS) -DNDEBUG -o $@ $<
139-
140-
#debug: run.cpp
141-
# mkdir -p build && $(CXX) $(FLAGS) -g -o ./build/$(TARGET) $<
142-
143138
clean:
144139
read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*
145140

experimental/kernels/dev

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../third_party/llm.c/dev

experimental/kernels/gpt2_webgpu.cpp

+34-60
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ There will be other versions of this code that specialize it and make it fast.
3131
// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
3232
#include "llmc/dataloader.h"
3333

34-
// CPU reference implementations
35-
#include <iostream>
36-
// #include "gpt2_cpu.hpp"
37-
3834
using namespace gpu;
3935

4036
// ----------------------------------------------------------------------------
@@ -70,26 +66,6 @@ typedef struct {
7066
float* lnfb; // (C)
7167
} ParameterTensors;
7268

73-
74-
typedef struct {
75-
Tensor wte; // (V, C)
76-
Tensor wpe; // (maxT, C)
77-
Tensor ln1w; // (L, C)
78-
Tensor ln1b; // (L, C)
79-
Tensor qkvw; // (L, 3*C, C)
80-
Tensor qkvb; // (L, 3*C)
81-
Tensor attprojw; // (L, C, C)
82-
Tensor attprojb; // (L, C)
83-
Tensor ln2w; // (L, C)
84-
Tensor ln2b; // (L, C)
85-
Tensor fcw; // (L, 4*C, C)
86-
Tensor fcb; // (L, 4*C)
87-
Tensor fcprojw; // (L, C, 4*C)
88-
Tensor fcprojb; // (L, C)
89-
Tensor lnfw; // (C)
90-
Tensor lnfb; // (C)
91-
} GPUParameterTensors;
92-
9369
void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
9470
size_t Vp = config.padded_vocab_size;
9571
size_t C = config.channels;
@@ -164,32 +140,6 @@ typedef struct {
164140
} ActivationTensors;
165141

166142

167-
typedef struct {
168-
Tensor encoded; // (B, T, C)
169-
Tensor ln1; // (L, B, T, C)
170-
Tensor ln1_mean; // (L, B, T)
171-
Tensor ln1_rstd; // (L, B, T)
172-
Tensor qkv; // (L, B, T, 3*C)
173-
Tensor atty; // (L, B, T, C)
174-
Tensor preatt; // (L, B, NH, T, T)
175-
Tensor att; // (L, B, NH, T, T)
176-
Tensor attproj; // (L, B, T, C)
177-
Tensor residual2; // (L, B, T, C)
178-
Tensor ln2; // (L, B, T, C)
179-
Tensor ln2_mean; // (L, B, T)
180-
Tensor ln2_rstd; // (L, B, T)
181-
Tensor fch; // (L, B, T, 4*C)
182-
Tensor fch_gelu; // (L, B, T, 4*C)
183-
Tensor fcproj; // (L, B, T, C)
184-
Tensor residual3; // (L, B, T, C)
185-
Tensor lnf; // (B, T, C)
186-
Tensor lnf_mean; // (B, T)
187-
Tensor lnf_rstd; // (B, T)
188-
Tensor logits; // (B, T, V)
189-
Tensor probs; // (B, T, V)
190-
Tensor losses; // (B, T)
191-
} GPUActivationTensors;
192-
193143

194144
void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T) {
195145
size_t C = config.channels;
@@ -241,10 +191,26 @@ float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes)
241191
return acts_memory;
242192
}
243193

194+
struct GPUParameters {
195+
Tensor data[NUM_PARAMETER_TENSORS];
196+
};
197+
198+
struct GPUActivations {
199+
Tensor data[NUM_ACTIVATION_TENSORS];
200+
};
201+
202+
203+
void gpu_alloc(Context& ctx, Tensor* tensors, size_t* sizes, size_t n) {
204+
for (size_t i = 0; i < n; i++) {
205+
tensors[i] = createTensor(ctx, Shape{sizes[i]}, kf32);
206+
}
207+
}
208+
244209
typedef struct {
245210
GPT2Config config;
246211
// the weights (parameters) of the model, and their sizes
247212
ParameterTensors params;
213+
GPUParameters params_; // TODO(avh): eventually this replaces params
248214
size_t param_sizes[NUM_PARAMETER_TENSORS];
249215
float* params_memory;
250216
size_t num_parameters;
@@ -256,6 +222,7 @@ typedef struct {
256222
float* v_memory;
257223
// the activations of the model, and their sizes
258224
ActivationTensors acts;
225+
GPUActivations acts_; // TODO(avh): eventually this replaces acts
259226
size_t act_sizes[NUM_ACTIVATION_TENSORS];
260227
float* acts_memory;
261228
size_t num_activations;
@@ -270,7 +237,7 @@ typedef struct {
270237
float mean_loss; // after a forward pass with targets, will be populated with the mean loss
271238
} GPT2;
272239

273-
void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
240+
void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoint_path) {
274241

275242
// read in model from a checkpoint file
276243
FILE *model_file = fopenCheck(checkpoint_path, "rb");
@@ -330,6 +297,10 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
330297
model->batch_size = 0;
331298
model->seq_len = 0;
332299
model->mean_loss = -1.0f; // -1.0f will designate no loss
300+
301+
// TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
302+
gpu_alloc(ctx, model->params_.data, model->param_sizes, NUM_PARAMETER_TENSORS);
303+
333304
}
334305

335306

@@ -364,6 +335,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B
364335
model->seq_len = T;
365336
// and now allocate the space
366337
fill_in_activation_sizes(model->act_sizes, model->config, B, T);
338+
// TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
339+
gpu_alloc(ctx, model->acts_.data, model->act_sizes, NUM_PARAMETER_TENSORS);
367340
size_t num_activations = 0;
368341
for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
369342
num_activations += model->act_sizes[i];
@@ -678,11 +651,18 @@ int sample_mult(float* probabilities, int n, float coin) {
678651
// main training loop
679652
int main() {
680653

681-
setLogLevel(kError);
654+
setLogLevel(kWarn);
655+
656+
printf("Creating GPU context\n");
657+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
658+
gpu::Context ctx = gpu::createContext({}, {}, {
659+
.requiredLimits = &requiredLimits
660+
});
661+
// gpu::Context ctx = gpu::createContext();
682662

683663
// build the GPT-2 model from a checkpoint
684664
GPT2 model;
685-
gpt2_build_from_checkpoint(&model, "gpt2_124M.bin");
665+
gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin");
686666

687667
// build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories
688668
const char* tiny_stories_train = "dev/data/tinystories/TinyStories_train.bin";
@@ -709,13 +689,7 @@ int main() {
709689
int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int));
710690
const int genT = 64; // number of steps of inference we will do
711691

712-
printf("Creating GPU context\n");
713-
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
714-
gpu::Context ctx = gpu::createContext({}, {}, {
715-
.requiredLimits = &requiredLimits
716-
});
717-
// gpu::Context ctx = gpu::createContext();
718-
692+
719693
// train
720694
struct timespec start, end;
721695
printf("Starting training\n");

0 commit comments

Comments
 (0)