@@ -47,7 +47,6 @@ typedef struct {
47
47
48
48
// the parameters of the model
49
49
#define NUM_PARAMETER_TENSORS 16
50
- #define NUM_PARAMETER_LAYERS 12
51
50
typedef struct {
52
51
Tensor wte; // (V, C)
53
52
Tensor wpe; // (maxT, C)
@@ -91,22 +90,36 @@ void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
91
90
}
92
91
93
92
// allocate memory for the parameters and point the individual tensors to the right places
94
- void malloc_and_point_parameters (Context& ctx, ParameterTensors* params, size_t * param_sizes) {
93
+ void malloc_and_point_parameters (Context& ctx, GPT2Config config, ParameterTensors* params, size_t * param_sizes) {
94
+ size_t L = config.num_layers ;
95
95
params->wte = createTensor (ctx, Shape{param_sizes[0 ]}, kf32);
96
96
params->wpe = createTensor (ctx, Shape{param_sizes[1 ]}, kf32);
97
- for (int l = 0 ; l < NUM_PARAMETER_LAYERS; l++) {
98
- params->ln1w .push_back (createTensor (ctx, Shape{param_sizes[2 ]/NUM_PARAMETER_LAYERS}, kf32));
99
- params->ln1b .push_back (createTensor (ctx, Shape{param_sizes[3 ]/NUM_PARAMETER_LAYERS}, kf32));
100
- params->qkvw .push_back (createTensor (ctx, Shape{param_sizes[4 ]/NUM_PARAMETER_LAYERS}, kf32));
101
- params->qkvb .push_back (createTensor (ctx, Shape{param_sizes[5 ]/NUM_PARAMETER_LAYERS}, kf32));
102
- params->attprojw .push_back (createTensor (ctx, Shape{param_sizes[6 ]/NUM_PARAMETER_LAYERS}, kf32));
103
- params->attprojb .push_back (createTensor (ctx, Shape{param_sizes[7 ]/NUM_PARAMETER_LAYERS}, kf32));
104
- params->ln2w .push_back (createTensor (ctx, Shape{param_sizes[8 ]/NUM_PARAMETER_LAYERS}, kf32));
105
- params->ln2b .push_back (createTensor (ctx, Shape{param_sizes[9 ]/NUM_PARAMETER_LAYERS}, kf32));
106
- params->fcw .push_back (createTensor (ctx, Shape{param_sizes[10 ]/NUM_PARAMETER_LAYERS}, kf32));
107
- params->fcb .push_back (createTensor (ctx, Shape{param_sizes[11 ]/NUM_PARAMETER_LAYERS}, kf32));
108
- params->fcprojw .push_back (createTensor (ctx, Shape{param_sizes[12 ]/NUM_PARAMETER_LAYERS}, kf32));
109
- params->fcprojb .push_back (createTensor (ctx, Shape{param_sizes[13 ]/NUM_PARAMETER_LAYERS}, kf32));
97
+
98
+ params->ln1w .resize (L);
99
+ params->ln1b .resize (L);
100
+ params->qkvw .resize (L);
101
+ params->qkvb .resize (L);
102
+ params->attprojw .resize (L);
103
+ params->attprojb .resize (L);
104
+ params->ln2w .resize (L);
105
+ params->ln2b .resize (L);
106
+ params->fcw .resize (L);
107
+ params->fcb .resize (L);
108
+ params->fcprojw .resize (L);
109
+ params->fcprojb .resize (L);
110
+ for (int l = 0 ; l < L ; l++) {
111
+ params->ln1w [l] = createTensor (ctx, Shape{param_sizes[2 ]/config.num_layers }, kf32);
112
+ params->ln1b [l] = createTensor (ctx, Shape{param_sizes[3 ]/config.num_layers }, kf32);
113
+ params->qkvw [l] = createTensor (ctx, Shape{param_sizes[4 ]/config.num_layers }, kf32);
114
+ params->qkvb [l] = createTensor (ctx, Shape{param_sizes[5 ]/config.num_layers }, kf32);
115
+ params->attprojw [l] = createTensor (ctx, Shape{param_sizes[6 ]/config.num_layers }, kf32);
116
+ params->attprojb [l] = createTensor (ctx, Shape{param_sizes[7 ]/config.num_layers }, kf32);
117
+ params->ln2w [l] = createTensor (ctx, Shape{param_sizes[8 ]/config.num_layers }, kf32);
118
+ params->ln2b [l] = createTensor (ctx, Shape{param_sizes[9 ]/config.num_layers }, kf32);
119
+ params->fcw [l] = createTensor (ctx, Shape{param_sizes[10 ]/config.num_layers }, kf32);
120
+ params->fcb [l] = createTensor (ctx, Shape{param_sizes[11 ]/config.num_layers }, kf32);
121
+ params->fcprojw [l] = createTensor (ctx, Shape{param_sizes[12 ]/config.num_layers }, kf32);
122
+ params->fcprojb [l] = createTensor (ctx, Shape{param_sizes[13 ]/config.num_layers }, kf32);
110
123
}
111
124
params->lnfw = createTensor (ctx, Shape{param_sizes[14 ]}, kf32);
112
125
params->lnfb = createTensor (ctx, Shape{param_sizes[15 ]}, kf32);
@@ -201,25 +214,42 @@ void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T
201
214
act_sizes[22 ] = B * T; // losses
202
215
}
203
216
204
- void malloc_and_point_activations (Context& ctx, ActivationTensors* acts, size_t * act_sizes) {
217
+ void malloc_and_point_activations (Context& ctx, GPT2Config config, ActivationTensors* acts, size_t * act_sizes) {
218
+ size_t L = config.num_layers ;
205
219
acts->encoded = createTensor (ctx, Shape{act_sizes[0 ]}, kf32);
206
- for (int l = 0 ; l < NUM_PARAMETER_LAYERS; l++) {
207
- acts->ln1 .push_back (createTensor (ctx, Shape{act_sizes[1 ]/NUM_PARAMETER_LAYERS}, kf32));
208
- acts->ln1_mean .push_back (createTensor (ctx, Shape{act_sizes[2 ]/NUM_PARAMETER_LAYERS}, kf32));
209
- acts->ln1_rstd .push_back (createTensor (ctx, Shape{act_sizes[3 ]/NUM_PARAMETER_LAYERS}, kf32));
210
- acts->qkv .push_back (createTensor (ctx, Shape{act_sizes[4 ]/NUM_PARAMETER_LAYERS}, kf32));
211
- acts->atty .push_back (createTensor (ctx, Shape{act_sizes[5 ]/NUM_PARAMETER_LAYERS}, kf32));
212
- acts->preatt .push_back (createTensor (ctx, Shape{act_sizes[6 ]/NUM_PARAMETER_LAYERS}, kf32));
213
- acts->att .push_back (createTensor (ctx, Shape{act_sizes[7 ]/NUM_PARAMETER_LAYERS}, kf32));
214
- acts->attproj .push_back (createTensor (ctx, Shape{act_sizes[8 ]/NUM_PARAMETER_LAYERS}, kf32));
215
- acts->residual2 .push_back (createTensor (ctx, Shape{act_sizes[9 ]/NUM_PARAMETER_LAYERS}, kf32));
216
- acts->ln2 .push_back (createTensor (ctx, Shape{act_sizes[10 ]/NUM_PARAMETER_LAYERS}, kf32));
217
- acts->ln2_mean .push_back (createTensor (ctx, Shape{act_sizes[11 ]/NUM_PARAMETER_LAYERS}, kf32));
218
- acts->ln2_rstd .push_back (createTensor (ctx, Shape{act_sizes[12 ]/NUM_PARAMETER_LAYERS}, kf32));
219
- acts->fch .push_back (createTensor (ctx, Shape{act_sizes[13 ]/NUM_PARAMETER_LAYERS}, kf32));
220
- acts->fch_gelu .push_back (createTensor (ctx, Shape{act_sizes[14 ]/NUM_PARAMETER_LAYERS}, kf32));
221
- acts->fcproj .push_back (createTensor (ctx, Shape{act_sizes[15 ]/NUM_PARAMETER_LAYERS}, kf32));
222
- acts->residual3 .push_back (createTensor (ctx, Shape{act_sizes[16 ]/NUM_PARAMETER_LAYERS}, kf32));
220
+ acts->ln1 .resize (L);
221
+ acts->ln1_mean .resize (L);
222
+ acts->ln1_rstd .resize (L);
223
+ acts->qkv .resize (L);
224
+ acts->atty .resize (L);
225
+ acts->preatt .resize (L);
226
+ acts->att .resize (L);
227
+ acts->attproj .resize (L);
228
+ acts->residual2 .resize (L);
229
+ acts->ln2 .resize (L);
230
+ acts->ln2_mean .resize (L);
231
+ acts->ln2_rstd .resize (L);
232
+ acts->fch .resize (L);
233
+ acts->fch_gelu .resize (L);
234
+ acts->fcproj .resize (L);
235
+ acts->residual3 .resize (L);
236
+ for (int l = 0 ; l < L; l++) {
237
+ acts->ln1 [l] = createTensor (ctx, Shape{act_sizes[1 ]/config.num_layers }, kf32);
238
+ acts->ln1_mean [l] = createTensor (ctx, Shape{act_sizes[2 ]/config.num_layers }, kf32);
239
+ acts->ln1_rstd [l] = createTensor (ctx, Shape{act_sizes[3 ]/config.num_layers }, kf32);
240
+ acts->qkv [l] = createTensor (ctx, Shape{act_sizes[4 ]/config.num_layers }, kf32);
241
+ acts->atty [l] = createTensor (ctx, Shape{act_sizes[5 ]/config.num_layers }, kf32);
242
+ acts->preatt [l] = createTensor (ctx, Shape{act_sizes[6 ]/config.num_layers }, kf32);
243
+ acts->att [l] = createTensor (ctx, Shape{act_sizes[7 ]/config.num_layers }, kf32);
244
+ acts->attproj [l] = createTensor (ctx, Shape{act_sizes[8 ]/config.num_layers }, kf32);
245
+ acts->residual2 [l] = createTensor (ctx, Shape{act_sizes[9 ]/config.num_layers }, kf32);
246
+ acts->ln2 [l] = createTensor (ctx, Shape{act_sizes[10 ]/config.num_layers }, kf32);
247
+ acts->ln2_mean [l] = createTensor (ctx, Shape{act_sizes[11 ]/config.num_layers }, kf32);
248
+ acts->ln2_rstd [l] = createTensor (ctx, Shape{act_sizes[12 ]/config.num_layers }, kf32);
249
+ acts->fch [l] = createTensor (ctx, Shape{act_sizes[13 ]/config.num_layers }, kf32);
250
+ acts->fch_gelu [l] = createTensor (ctx, Shape{act_sizes[14 ]/config.num_layers }, kf32);
251
+ acts->fcproj [l] = createTensor (ctx, Shape{act_sizes[15 ]/config.num_layers }, kf32);
252
+ acts->residual3 [l] = createTensor (ctx, Shape{act_sizes[16 ]/config.num_layers }, kf32);
223
253
}
224
254
acts->lnf = createTensor (ctx, Shape{act_sizes[17 ]}, kf32);
225
255
acts->lnf_mean = createTensor (ctx, Shape{act_sizes[18 ]}, kf32);
@@ -229,15 +259,6 @@ void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t*
229
259
acts->losses = createTensor (ctx, Shape{act_sizes[22 ]}, kf32);
230
260
}
231
261
232
- struct GPUParameters {
233
- Tensor data[NUM_PARAMETER_TENSORS];
234
- };
235
-
236
- struct GPUActivations {
237
- Tensor data[NUM_ACTIVATION_TENSORS];
238
- };
239
-
240
-
241
262
void gpu_alloc (Context& ctx, Tensor* tensors, size_t * sizes, size_t n) {
242
263
for (size_t i = 0 ; i < n; i++) {
243
264
tensors[i] = createTensor (ctx, Shape{sizes[i]}, kf32);
@@ -325,7 +346,7 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin
325
346
model->num_parameters = num_parameters;
326
347
327
348
// read in all the parameters from file
328
- malloc_and_point_parameters (ctx, &model->params , model->param_sizes );
349
+ malloc_and_point_parameters (ctx, model-> config , &model->params , model->param_sizes );
329
350
model->params_memory = (float *)mallocCheck (num_parameters * sizeof (float ));
330
351
freadCheck (model->params_memory , sizeof (float ), num_parameters, model_file);
331
352
fcloseCheck (model_file);
@@ -428,7 +449,7 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
428
449
printf (" num_activations: %zu\n " , num_activations);
429
450
model->num_activations = num_activations;
430
451
printf (" Allocating %.2f MB for activations\n " , num_activations * sizeof (float ) / (1024 .0f * 1024 .0f ));
431
- malloc_and_point_activations (ctx, &model->acts , model->act_sizes );
452
+ malloc_and_point_activations (ctx, model-> config , &model->acts , model->act_sizes );
432
453
// also create memory for caching inputs and targets
433
454
// model->inputs = (int*)mallocCheck(B * T * sizeof(int));
434
455
// model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small
@@ -664,8 +685,8 @@ void gpt2_backward(Context& ctx, GPT2 *model) {
664
685
// lazily allocate the memory for gradients of the weights and activations, if needed
665
686
if (model->grads_memory == NULL ) {
666
687
printf (" Allocating %.2f MB for gradients\n " , model->num_parameters * sizeof (float ) / (1024 .0f * 1024 .0f ));
667
- malloc_and_point_parameters (ctx, &model->grads , model->param_sizes );
668
- malloc_and_point_activations (ctx, &model->grads_acts , model->act_sizes );
688
+ malloc_and_point_parameters (ctx, model-> config , &model->grads , model->param_sizes );
689
+ malloc_and_point_activations (ctx, model-> config , &model->grads_acts , model->act_sizes );
669
690
gpt2_zero_grad (model);
670
691
}
671
692
0 commit comments