@@ -31,10 +31,6 @@ There will be other versions of this code that specialize it and make it fast.
31
31
// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
32
32
#include " llmc/dataloader.h"
33
33
34
- // CPU reference implementations
35
- #include < iostream>
36
- // #include "gpt2_cpu.hpp"
37
-
38
34
using namespace gpu ;
39
35
40
36
// ----------------------------------------------------------------------------
@@ -70,26 +66,6 @@ typedef struct {
70
66
float * lnfb; // (C)
71
67
} ParameterTensors;
72
68
73
-
74
- typedef struct {
75
- Tensor wte; // (V, C)
76
- Tensor wpe; // (maxT, C)
77
- Tensor ln1w; // (L, C)
78
- Tensor ln1b; // (L, C)
79
- Tensor qkvw; // (L, 3*C, C)
80
- Tensor qkvb; // (L, 3*C)
81
- Tensor attprojw; // (L, C, C)
82
- Tensor attprojb; // (L, C)
83
- Tensor ln2w; // (L, C)
84
- Tensor ln2b; // (L, C)
85
- Tensor fcw; // (L, 4*C, C)
86
- Tensor fcb; // (L, 4*C)
87
- Tensor fcprojw; // (L, C, 4*C)
88
- Tensor fcprojb; // (L, C)
89
- Tensor lnfw; // (C)
90
- Tensor lnfb; // (C)
91
- } GPUParameterTensors;
92
-
93
69
void fill_in_parameter_sizes (size_t * param_sizes, GPT2Config config) {
94
70
size_t Vp = config.padded_vocab_size ;
95
71
size_t C = config.channels ;
@@ -164,32 +140,6 @@ typedef struct {
164
140
} ActivationTensors;
165
141
166
142
167
- typedef struct {
168
- Tensor encoded; // (B, T, C)
169
- Tensor ln1; // (L, B, T, C)
170
- Tensor ln1_mean; // (L, B, T)
171
- Tensor ln1_rstd; // (L, B, T)
172
- Tensor qkv; // (L, B, T, 3*C)
173
- Tensor atty; // (L, B, T, C)
174
- Tensor preatt; // (L, B, NH, T, T)
175
- Tensor att; // (L, B, NH, T, T)
176
- Tensor attproj; // (L, B, T, C)
177
- Tensor residual2; // (L, B, T, C)
178
- Tensor ln2; // (L, B, T, C)
179
- Tensor ln2_mean; // (L, B, T)
180
- Tensor ln2_rstd; // (L, B, T)
181
- Tensor fch; // (L, B, T, 4*C)
182
- Tensor fch_gelu; // (L, B, T, 4*C)
183
- Tensor fcproj; // (L, B, T, C)
184
- Tensor residual3; // (L, B, T, C)
185
- Tensor lnf; // (B, T, C)
186
- Tensor lnf_mean; // (B, T)
187
- Tensor lnf_rstd; // (B, T)
188
- Tensor logits; // (B, T, V)
189
- Tensor probs; // (B, T, V)
190
- Tensor losses; // (B, T)
191
- } GPUActivationTensors;
192
-
193
143
194
144
void fill_in_activation_sizes (size_t * act_sizes, GPT2Config config, int B, int T) {
195
145
size_t C = config.channels ;
@@ -241,10 +191,26 @@ float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes)
241
191
return acts_memory;
242
192
}
243
193
194
+ struct GPUParameters { // Tensor-backed counterpart of ParameterTensors; held as GPT2::params_
195
+ Tensor data[NUM_PARAMETER_TENSORS]; // one Tensor per parameter slot, indexed the same way as param_sizes
196
+ };
197
+
198
+ struct GPUActivations { // Tensor-backed counterpart of ActivationTensors; held as GPT2::acts_
199
+ Tensor data[NUM_ACTIVATION_TENSORS]; // one Tensor per activation slot, indexed the same way as act_sizes
200
+ };
201
+
202
+
203
+ void gpu_alloc (Context& ctx, Tensor* tensors, size_t * sizes, size_t n) { // creates n tensors through ctx; tensors and sizes must each hold at least n entries
204
+ for (size_t i = 0 ; i < n; i++) { // slot i of tensors is sized by slot i of sizes
205
+ tensors[i] = createTensor (ctx, Shape{sizes[i]}, kf32); // rank-1 shape with sizes[i] elements (kf32 is presumably 32-bit float - confirm in gpu.h)
206
+ }
207
+ }
208
+
244
209
typedef struct {
245
210
GPT2Config config;
246
211
// the weights (parameters) of the model, and their sizes
247
212
ParameterTensors params;
213
+ GPUParameters params_; // TODO(avh): eventually this replaces params
248
214
size_t param_sizes[NUM_PARAMETER_TENSORS];
249
215
float * params_memory;
250
216
size_t num_parameters;
@@ -256,6 +222,7 @@ typedef struct {
256
222
float * v_memory;
257
223
// the activations of the model, and their sizes
258
224
ActivationTensors acts;
225
+ GPUActivations acts_; // TODO(avh): eventually this replaces acts
259
226
size_t act_sizes[NUM_ACTIVATION_TENSORS];
260
227
float * acts_memory;
261
228
size_t num_activations;
@@ -270,7 +237,7 @@ typedef struct {
270
237
float mean_loss; // after a forward pass with targets, will be populated with the mean loss
271
238
} GPT2;
272
239
273
- void gpt2_build_from_checkpoint (GPT2 *model, const char * checkpoint_path) {
240
+ void gpt2_build_from_checkpoint (Context& ctx, GPT2 *model, const char * checkpoint_path) {
274
241
275
242
// read in model from a checkpoint file
276
243
FILE *model_file = fopenCheck (checkpoint_path, " rb" );
@@ -330,6 +297,10 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
330
297
model->batch_size = 0 ;
331
298
model->seq_len = 0 ;
332
299
model->mean_loss = -1 .0f ; // -1.0f will designate no loss
300
+
301
+ // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
302
+ gpu_alloc (ctx, model->params_ .data , model->param_sizes , NUM_PARAMETER_TENSORS);
303
+
333
304
}
334
305
335
306
@@ -364,6 +335,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B
364
335
model->seq_len = T;
365
336
// and now allocate the space
366
337
fill_in_activation_sizes (model->act_sizes , model->config , B, T);
338
+ // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
339
+ gpu_alloc (ctx, model->acts_ .data , model->act_sizes , NUM_ACTIVATION_TENSORS); // count must match acts_.data / act_sizes, which hold NUM_ACTIVATION_TENSORS entries
367
340
size_t num_activations = 0 ;
368
341
for (size_t i = 0 ; i < NUM_ACTIVATION_TENSORS; i++) {
369
342
num_activations += model->act_sizes [i];
@@ -678,11 +651,18 @@ int sample_mult(float* probabilities, int n, float coin) {
678
651
// main training loop
679
652
int main () {
680
653
681
- setLogLevel (kError );
654
+ setLogLevel (kWarn );
655
+
656
+ printf (" Creating GPU context\n " );
657
+ WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
658
+ gpu::Context ctx = gpu::createContext ({}, {}, {
659
+ .requiredLimits = &requiredLimits
660
+ });
661
+ // gpu::Context ctx = gpu::createContext();
682
662
683
663
// build the GPT-2 model from a checkpoint
684
664
GPT2 model;
685
- gpt2_build_from_checkpoint (&model, " gpt2_124M.bin" );
665
+ gpt2_build_from_checkpoint (ctx, &model, " gpt2_124M.bin" );
686
666
687
667
// build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories
688
668
const char * tiny_stories_train = " dev/data/tinystories/TinyStories_train.bin" ;
@@ -709,13 +689,7 @@ int main() {
709
689
int * gen_tokens = (int *)mallocCheck (B * T * sizeof (int ));
710
690
const int genT = 64 ; // number of steps of inference we will do
711
691
712
- printf (" Creating GPU context\n " );
713
- WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
714
- gpu::Context ctx = gpu::createContext ({}, {}, {
715
- .requiredLimits = &requiredLimits
716
- });
717
- // gpu::Context ctx = gpu::createContext();
718
-
692
+
719
693
// train
720
694
struct timespec start, end;
721
695
printf (" Starting training\n " );
0 commit comments