@@ -47,7 +47,6 @@ typedef struct {
 
 // the parameters of the model
 #define NUM_PARAMETER_TENSORS 16
-#define NUM_PARAMETER_LAYERS 12
 typedef struct {
   Tensor wte; // (V, C)
   Tensor wpe; // (maxT, C)
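With the hardcoded layer count gone, the layer dimension has to come from the checkpoint's config instead. For reference, a sketch of the relevant struct, assuming the llm.c field layout this port appears to mirror:

typedef struct {
  int max_seq_len; // maxT: maximum sequence length
  int vocab_size;  // V: vocabulary size
  int num_layers;  // L: now the single runtime source of truth for depth
  int num_heads;   // NH: attention heads
  int channels;    // C: embedding width
} GPT2Config;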
@@ -91,22 +90,36 @@ void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
 }
 
 // allocate memory for the parameters and point the individual tensors to the right places
-void malloc_and_point_parameters(Context& ctx, ParameterTensors* params, size_t* param_sizes) {
+void malloc_and_point_parameters(Context& ctx, GPT2Config config, ParameterTensors* params, size_t* param_sizes) {
+  size_t L = config.num_layers;
   params->wte = createTensor(ctx, Shape{param_sizes[0]}, kf32);
   params->wpe = createTensor(ctx, Shape{param_sizes[1]}, kf32);
-  for (int l = 0; l < NUM_PARAMETER_LAYERS; l++) {
-    params->ln1w.push_back(createTensor(ctx, Shape{param_sizes[2]/NUM_PARAMETER_LAYERS}, kf32));
-    params->ln1b.push_back(createTensor(ctx, Shape{param_sizes[3]/NUM_PARAMETER_LAYERS}, kf32));
-    params->qkvw.push_back(createTensor(ctx, Shape{param_sizes[4]/NUM_PARAMETER_LAYERS}, kf32));
-    params->qkvb.push_back(createTensor(ctx, Shape{param_sizes[5]/NUM_PARAMETER_LAYERS}, kf32));
-    params->attprojw.push_back(createTensor(ctx, Shape{param_sizes[6]/NUM_PARAMETER_LAYERS}, kf32));
-    params->attprojb.push_back(createTensor(ctx, Shape{param_sizes[7]/NUM_PARAMETER_LAYERS}, kf32));
-    params->ln2w.push_back(createTensor(ctx, Shape{param_sizes[8]/NUM_PARAMETER_LAYERS}, kf32));
-    params->ln2b.push_back(createTensor(ctx, Shape{param_sizes[9]/NUM_PARAMETER_LAYERS}, kf32));
-    params->fcw.push_back(createTensor(ctx, Shape{param_sizes[10]/NUM_PARAMETER_LAYERS}, kf32));
-    params->fcb.push_back(createTensor(ctx, Shape{param_sizes[11]/NUM_PARAMETER_LAYERS}, kf32));
-    params->fcprojw.push_back(createTensor(ctx, Shape{param_sizes[12]/NUM_PARAMETER_LAYERS}, kf32));
-    params->fcprojb.push_back(createTensor(ctx, Shape{param_sizes[13]/NUM_PARAMETER_LAYERS}, kf32));
+
+  params->ln1w.resize(L);
+  params->ln1b.resize(L);
+  params->qkvw.resize(L);
+  params->qkvb.resize(L);
+  params->attprojw.resize(L);
+  params->attprojb.resize(L);
+  params->ln2w.resize(L);
+  params->ln2b.resize(L);
+  params->fcw.resize(L);
+  params->fcb.resize(L);
+  params->fcprojw.resize(L);
+  params->fcprojb.resize(L);
+  for (int l = 0; l < L; l++) {
+    params->ln1w[l] = createTensor(ctx, Shape{param_sizes[2]/config.num_layers}, kf32);
+    params->ln1b[l] = createTensor(ctx, Shape{param_sizes[3]/config.num_layers}, kf32);
+    params->qkvw[l] = createTensor(ctx, Shape{param_sizes[4]/config.num_layers}, kf32);
+    params->qkvb[l] = createTensor(ctx, Shape{param_sizes[5]/config.num_layers}, kf32);
+    params->attprojw[l] = createTensor(ctx, Shape{param_sizes[6]/config.num_layers}, kf32);
+    params->attprojb[l] = createTensor(ctx, Shape{param_sizes[7]/config.num_layers}, kf32);
+    params->ln2w[l] = createTensor(ctx, Shape{param_sizes[8]/config.num_layers}, kf32);
+    params->ln2b[l] = createTensor(ctx, Shape{param_sizes[9]/config.num_layers}, kf32);
+    params->fcw[l] = createTensor(ctx, Shape{param_sizes[10]/config.num_layers}, kf32);
+    params->fcb[l] = createTensor(ctx, Shape{param_sizes[11]/config.num_layers}, kf32);
+    params->fcprojw[l] = createTensor(ctx, Shape{param_sizes[12]/config.num_layers}, kf32);
+    params->fcprojb[l] = createTensor(ctx, Shape{param_sizes[13]/config.num_layers}, kf32);
   }
   params->lnfw = createTensor(ctx, Shape{param_sizes[14]}, kf32);
   params->lnfb = createTensor(ctx, Shape{param_sizes[15]}, kf32);
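The division by config.num_layers relies on fill_in_parameter_sizes storing, for each per-layer tensor, the total element count across all L layers. A minimal sketch of that invariant, assuming llm.c-style sizing (C = channels is an assumed field name; the indices match the ones used above):

size_t C = config.channels;   // assumed field name, as in llm.c's GPT2Config
size_t L = config.num_layers;
// param_sizes[2] = L * C;          // ln1w: all L layernorm weight vectors
// param_sizes[4] = L * 3 * C * C;  // qkvw: all L QKV projection matrices
// so the per-layer slice recovered in the loop above is:
size_t ln1w_per_layer = (L * C) / L;         // == C
size_t qkvw_per_layer = (L * 3 * C * C) / L; // == 3*C*C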
@@ -201,25 +214,42 @@ void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T
   act_sizes[22] = B * T; // losses
 }
 
-void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t* act_sizes) {
+void malloc_and_point_activations(Context& ctx, GPT2Config config, ActivationTensors* acts, size_t* act_sizes) {
+  size_t L = config.num_layers;
   acts->encoded = createTensor(ctx, Shape{act_sizes[0]}, kf32);
-  for (int l = 0; l < NUM_PARAMETER_LAYERS; l++) {
-    acts->ln1.push_back(createTensor(ctx, Shape{act_sizes[1]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->ln1_mean.push_back(createTensor(ctx, Shape{act_sizes[2]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->ln1_rstd.push_back(createTensor(ctx, Shape{act_sizes[3]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->qkv.push_back(createTensor(ctx, Shape{act_sizes[4]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->atty.push_back(createTensor(ctx, Shape{act_sizes[5]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->preatt.push_back(createTensor(ctx, Shape{act_sizes[6]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->att.push_back(createTensor(ctx, Shape{act_sizes[7]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->attproj.push_back(createTensor(ctx, Shape{act_sizes[8]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->residual2.push_back(createTensor(ctx, Shape{act_sizes[9]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->ln2.push_back(createTensor(ctx, Shape{act_sizes[10]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->ln2_mean.push_back(createTensor(ctx, Shape{act_sizes[11]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->ln2_rstd.push_back(createTensor(ctx, Shape{act_sizes[12]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->fch.push_back(createTensor(ctx, Shape{act_sizes[13]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->fch_gelu.push_back(createTensor(ctx, Shape{act_sizes[14]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->fcproj.push_back(createTensor(ctx, Shape{act_sizes[15]/NUM_PARAMETER_LAYERS}, kf32));
-    acts->residual3.push_back(createTensor(ctx, Shape{act_sizes[16]/NUM_PARAMETER_LAYERS}, kf32));
+  acts->ln1.resize(L);
+  acts->ln1_mean.resize(L);
+  acts->ln1_rstd.resize(L);
+  acts->qkv.resize(L);
+  acts->atty.resize(L);
+  acts->preatt.resize(L);
+  acts->att.resize(L);
+  acts->attproj.resize(L);
+  acts->residual2.resize(L);
+  acts->ln2.resize(L);
+  acts->ln2_mean.resize(L);
+  acts->ln2_rstd.resize(L);
+  acts->fch.resize(L);
+  acts->fch_gelu.resize(L);
+  acts->fcproj.resize(L);
+  acts->residual3.resize(L);
+  for (int l = 0; l < L; l++) {
+    acts->ln1[l] = createTensor(ctx, Shape{act_sizes[1]/config.num_layers}, kf32);
+    acts->ln1_mean[l] = createTensor(ctx, Shape{act_sizes[2]/config.num_layers}, kf32);
+    acts->ln1_rstd[l] = createTensor(ctx, Shape{act_sizes[3]/config.num_layers}, kf32);
+    acts->qkv[l] = createTensor(ctx, Shape{act_sizes[4]/config.num_layers}, kf32);
+    acts->atty[l] = createTensor(ctx, Shape{act_sizes[5]/config.num_layers}, kf32);
+    acts->preatt[l] = createTensor(ctx, Shape{act_sizes[6]/config.num_layers}, kf32);
+    acts->att[l] = createTensor(ctx, Shape{act_sizes[7]/config.num_layers}, kf32);
+    acts->attproj[l] = createTensor(ctx, Shape{act_sizes[8]/config.num_layers}, kf32);
+    acts->residual2[l] = createTensor(ctx, Shape{act_sizes[9]/config.num_layers}, kf32);
+    acts->ln2[l] = createTensor(ctx, Shape{act_sizes[10]/config.num_layers}, kf32);
+    acts->ln2_mean[l] = createTensor(ctx, Shape{act_sizes[11]/config.num_layers}, kf32);
+    acts->ln2_rstd[l] = createTensor(ctx, Shape{act_sizes[12]/config.num_layers}, kf32);
+    acts->fch[l] = createTensor(ctx, Shape{act_sizes[13]/config.num_layers}, kf32);
+    acts->fch_gelu[l] = createTensor(ctx, Shape{act_sizes[14]/config.num_layers}, kf32);
+    acts->fcproj[l] = createTensor(ctx, Shape{act_sizes[15]/config.num_layers}, kf32);
+    acts->residual3[l] = createTensor(ctx, Shape{act_sizes[16]/config.num_layers}, kf32);
   }
   acts->lnf = createTensor(ctx, Shape{act_sizes[17]}, kf32);
   acts->lnf_mean = createTensor(ctx, Shape{act_sizes[18]}, kf32);
@@ -229,15 +259,6 @@ void malloc_and_point_activations(Context& ctx, ActivationTensors* acts, size_t*
   acts->losses = createTensor(ctx, Shape{act_sizes[22]}, kf32);
 }
 
-struct GPUParameters {
-  Tensor data[NUM_PARAMETER_TENSORS];
-};
-
-struct GPUActivations {
-  Tensor data[NUM_ACTIVATION_TENSORS];
-};
-
-
 void gpu_alloc(Context& ctx, Tensor* tensors, size_t* sizes, size_t n) {
   for (size_t i = 0; i < n; i++) {
     tensors[i] = createTensor(ctx, Shape{sizes[i]}, kf32);
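One side effect of switching from push_back to resize plus indexed assignment (a design observation, not stated in the commit): the allocation functions become idempotent with respect to vector length. A self-contained toy illustrating the difference:

#include <cstddef>
#include <vector>

struct Handle { int id = 0; }; // stand-in for Tensor, just so this compiles

void fill(std::vector<Handle>& v, size_t L) {
  v.resize(L); // always exactly L slots, no matter how often this runs
  for (size_t l = 0; l < L; l++) v[l] = Handle{(int)l};
}
// Calling fill(v, 12) twice leaves v.size() == 12; a push_back loop in its
// place would leave 24 entries after the second call.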
@@ -325,7 +346,7 @@ void gpt2_build_from_checkpoint(Context& ctx, GPT2 *model, const char* checkpoin
   model->num_parameters = num_parameters;
 
   // read in all the parameters from file
-  malloc_and_point_parameters(ctx, &model->params, model->param_sizes);
+  malloc_and_point_parameters(ctx, model->config, &model->params, model->param_sizes);
   model->params_memory = (float*)mallocCheck(num_parameters * sizeof(float));
   freadCheck(model->params_memory, sizeof(float), num_parameters, model_file);
   fcloseCheck(model_file);
@@ -428,7 +449,7 @@ void gpt2_forward(Context& ctx, GPT2 *model, Tensor& inputs, Tensor& targets, si
   printf("num_activations: %zu\n", num_activations);
   model->num_activations = num_activations;
   printf("Allocating %.2f MB for activations\n", num_activations * sizeof(float) / (1024.0f * 1024.0f));
-  malloc_and_point_activations(ctx, &model->acts, model->act_sizes);
+  malloc_and_point_activations(ctx, model->config, &model->acts, model->act_sizes);
   // also create memory for caching inputs and targets
   // model->inputs = (int*)mallocCheck(B * T * sizeof(int));
   // model->targets = (int*)mallocCheck(B * T * sizeof(int)); // might be unused if we never have targets but it's small
@@ -664,8 +685,8 @@ void gpt2_backward(Context& ctx, GPT2 *model) {
   // lazily allocate the memory for gradients of the weights and activations, if needed
   if (model->grads_memory == NULL) {
     printf("Allocating %.2f MB for gradients\n", model->num_parameters * sizeof(float) / (1024.0f * 1024.0f));
-    malloc_and_point_parameters(ctx, &model->grads, model->param_sizes);
-    malloc_and_point_activations(ctx, &model->grads_acts, model->act_sizes);
+    malloc_and_point_parameters(ctx, model->config, &model->grads, model->param_sizes);
+    malloc_and_point_activations(ctx, model->config, &model->grads_acts, model->act_sizes);
     gpt2_zero_grad(model);
   }
 
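Taken together, the config read from the checkpoint header now reaches every allocation site. A hedged sketch of the resulting call flow (the checkpoint path and the B/T values are hypothetical; function names and argument order are the ones visible in the hunks above):

// GPT2 model;
// gpt2_build_from_checkpoint(ctx, &model, "gpt2_124M.bin"); // params sized via model.config
// gpt2_forward(ctx, &model, inputs, targets, B, T);         // activations sized via model.config
// gpt2_backward(ctx, &model);                               // grads/grad-acts lazily sized via model.config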