refactor: Small cleanups 🧹
felix-andreas committed May 17, 2024
1 parent 709d265 commit 59d98fd
Showing 4 changed files with 90 additions and 112 deletions.
93 changes: 45 additions & 48 deletions README.md
@@ -8,7 +8,17 @@

This project aims to be a clean and concise re-implementation of [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The model implementation, contained in [`src/model.rs`](src/model.rs), is under 300 lines of code. While this was a fun exercise mostly for (my own) educational purposes, it demonstrates the utility of Rust and Burn in the machine learning domain: The entire project compiles into a single binary, making deployment relatively straightforward.
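
Since everything lives in one Cargo project, one way to produce that single binary is a plain release build (assuming the package is named `gpt-burn`, as the CLI usage below suggests):

```
cargo build --release
./target/release/gpt-burn --help
```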

The project also includes a simple CLI for training and inference. At the moment, only a character-level tokenizer is supported, so official weights requiring a BPE tokenizer cannot be used yet. However, for fun, you can try out the small toy model I trained ([see inference](#inference)).
At the moment, only a character-level tokenizer is supported, so official weights requiring a BPE tokenizer cannot be used yet. However, for fun, you can try out the small toy model I trained ([see inference](#inference)).

The project also includes a simple CLI for training and inference.

```
Usage: gpt-burn <COMMAND>
Commands:
run Generate text using a pre-trained model
train Train a new model
```

## Installation

@@ -69,6 +79,20 @@ Sie war trotz weniger als 10.000 ausgedehnter Größen wahrscheinlich auf folgen
2016 wurden rund 145 Händen nach Deutschland geladen.
```

Further command-line options are:

```
Usage: gpt-burn run [OPTIONS] <MODEL_PATH>
Arguments:
<MODEL_PATH>
Options:
-p, --prompt <PROMPT>
-n, --n-new-tokens <N_NEW_TOKENS> [default: 1000]
-s, --seed <SEED> [default: 0]
```
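
For example, assuming the toy model was unpacked to `path/to/model` (a placeholder path), a shorter sampling run with a custom prompt might look like this:

```
gpt-burn run path/to/model --prompt "Albert Einstein war" --n-new-tokens 200 --seed 42
```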

## Training

To train your own model, run:
@@ -80,6 +104,26 @@ gpt-burn train --context-length 128 --n-layers 12 --n-heads 12 --d-model 768 --b
> [!IMPORTANT]
> Make sure `corpus.txt` is a UTF-8 encoded text file!

You can pass most hyperparameters as command-line options:

```
Usage: gpt-burn train [OPTIONS]
Options:
-o, --output-path <PATH>
-c, --context-length <CONTEXT_LENGTH> [default: 64]
-d, --d-model <D_MODEL> [default: 64]
-l, --n-layers <N_LAYERS> [default: 2]
-h, --n-heads <N_HEADS> [default: 2]
-n, --n-steps <N_STEPS> [default: 50]
-b, --batch-size <BATCH_SIZE> [default: 32]
-r, --learning-rate <LEARNING_RATE> [default: 0.003]
-s, --seed <SEED> [default: 0]
-t, --text-corpus <TEXT_CORPUS> [default: .data/corpus.txt]
-m, --n-mega-bytes <N_MEGA_BYTES> Only use first <n> megabytes of dataset for training
-x, --no-save Don't save trained model (useful for debugging)
```
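
For instance, a quick debugging run that only uses the first few megabytes of the corpus and skips saving the model could combine a handful of these flags (the values below are arbitrary examples, not recommendations):

```
gpt-burn train --text-corpus .data/corpus.txt --n-mega-bytes 10 --n-steps 100 --batch-size 16 --no-save
```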

## Tokenizer

The model can be used with different tokenizers via the `Tokenizer` trait. Below you see how the following sentence
@@ -108,53 +152,6 @@ Tokens: ["Albert", " ", "Einst", "ein", " ", "war", " ", "ein", " ", "schw", "ei
Values: [2, 0, 3, 9, 0, 19, 0, 9, 0, 16, 10, 15, 1, 6, 1, 7, 13, 15, 11, 0, 17, 12, 11, 0, 5, 14, 0, 8, 11, 0, 4, 18]
```
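
As a rough sketch of what plugging in a different tokenizer involves — the real `Tokenizer` trait in this repository may use different method names and signatures, so treat this as an assumption rather than the actual API — an implementation could look something like this:

```rust
use std::collections::HashMap;

/// Hypothetical shape of the tokenizer abstraction; the actual trait in
/// `src/tokenizer.rs` may differ.
trait Tokenizer {
    fn encode(&self, text: &str) -> Vec<usize>;
    fn decode(&self, ids: &[usize]) -> String;
    fn vocab_size(&self) -> usize;
}

/// Toy whitespace tokenizer used only to illustrate the trait.
struct WhitespaceTokenizer {
    ttoi: HashMap<String, usize>, // token -> id
    itot: Vec<String>,            // id -> token
}

impl WhitespaceTokenizer {
    fn new(corpus: &str) -> Self {
        let mut ttoi = HashMap::new();
        let mut itot = Vec::new();
        for token in corpus.split_whitespace() {
            if !ttoi.contains_key(token) {
                ttoi.insert(token.to_string(), itot.len());
                itot.push(token.to_string());
            }
        }
        Self { ttoi, itot }
    }
}

impl Tokenizer for WhitespaceTokenizer {
    fn encode(&self, text: &str) -> Vec<usize> {
        // Unknown tokens are silently dropped, mirroring how the CLI filters
        // out characters that are not in the `CharTokenizer` vocabulary.
        text.split_whitespace()
            .filter_map(|token| self.ttoi.get(token).copied())
            .collect()
    }

    fn decode(&self, ids: &[usize]) -> String {
        ids.iter()
            .map(|&id| self.itot[id].as_str())
            .collect::<Vec<_>>()
            .join(" ")
    }

    fn vocab_size(&self) -> usize {
        self.itot.len()
    }
}

fn main() {
    let tokenizer = WhitespaceTokenizer::new(
        "Albert Einstein war ein schweizerisch-US-amerikanischer theoretischer Physiker",
    );
    let ids = tokenizer.encode("Albert Einstein war ein Physiker");
    println!("{ids:?} -> {}", tokenizer.decode(&ids));
}
```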

## CLI options

The `gpt-burn` command has multiple subcommands:

```
Usage: gpt-burn <COMMAND>
Commands:
train Train a new model
run Generate text using a pre-trained model
help Print this message or the help of the given subcommand(s)
```

For inference, you can pass a model path and the number of new tokens that should be generated:

```
Usage: gpt-burn run [OPTIONS] <MODEL_PATH>
Arguments:
<MODEL_PATH>
Options:
-p, --prompt <PROMPT>
-n, --n-new-tokens <N_NEW_TOKENS> [default: 1000]
-s, --seed <SEED> [default: 0]
```

For training, you can pass most hyperparameters as a command-line option:

```
Usage: gpt-burn train [OPTIONS]
Options:
-o, --output-path <PATH>
-c, --context-length <CONTEXT_LENGTH> [default: 64]
-d, --d-model <D_MODEL> [default: 64]
-l, --n-layers <N_LAYERS> [default: 2]
-h, --n-heads <N_HEADS> [default: 2]
-n, --n-steps <N_STEPS> [default: 50]
-b, --batch-size <BATCH_SIZE> [default: 32]
-r, --learning-rate <LEARNING_RATE> [default: 0.003]
-s, --seed <SEED> [default: 0]
-t, --text-corpus <TEXT_CORPUS> [default: .data/corpus.txt]
-m, --n-mega-bytes <N_MEGA_BYTES> Only use first <n> megabytes of dataset for training
-x, --no-save Don't save trained model (useful for debugging)
```

## References

* [GPT-2 Paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
101 changes: 42 additions & 59 deletions src/main.rs
@@ -22,12 +22,36 @@ use {
};

fn main() {
/* Alternatively use CPU backend */
// type B = burn::backend::ndarray::NdArray;
type B = Wgpu;
type B = Wgpu; // alternative backend: burn::backend::ndarray::NdArray
type AutoB = Autodiff<B>;

match Cli::parse().command {
Commands::Run {
model_path: path,
prompt,
n_new_tokens,
seed,
..
} => {
let device = <B as Backend>::Device::default();

let tokenizer = CharTokenizer::new();
let config = TrainingConfig::load(path.join("config.json")).unwrap();
let model: Model<B> = config.model.init(&device).load_record(
CompactRecorder::new()
.load(path.join("model"), &device)
.unwrap(),
);

gpt_burn::run(
&model,
&tokenizer,
&prompt.unwrap_or("\n".into()),
n_new_tokens,
config.model.context_length,
seed,
);
}
Commands::Train {
text_corpus,
output_path,
@@ -43,10 +67,7 @@ fn main() {
seed,
..
} => {
// cli option defaults
let save = !no_save;

// load text corpus and tokenizer
// load text corpus and instantiate tokenizer
println!(
"{BOLD}load {} file {text_corpus:?}{RESET} as dataset",
n_mega_bytes.map_or_else(
@@ -69,12 +90,6 @@
.filter(|char| tokenizer.ttoi.contains_key(char))
.collect::<String>();

/* Uncomment to use `SimpleVowelTokenizer` */
// let tokenizer = {
// let tokens = SimpleVowelTokenizer::tokenize(&text).collect::<Vec<_>>();
// SimpleVowelTokenizer::new(&tokens, vocab_size)
// };

let mut train = tokenizer.encode(&text);
let test = train.split_off((0.9 * train.len() as f32) as usize);

@@ -119,10 +134,10 @@
},
optimizer: AdamWConfig::new(),
};
let model = gpt_burn::train(&config, data_train, data_test, save);
let model = gpt_burn::train(&config, data_train, data_test, !no_save);

// save trained model
if save {
if !no_save {
let output_path = output_path.unwrap_or_else(|| {
format!(
".data/gpt_{}k_{}context_{}",
@@ -137,9 +152,6 @@
fs::remove_dir_all(&output_path).ok();
fs::create_dir_all(&output_path).ok();

/* Uncomment to use `SimpleVowelTokenizer` */
// tokenizer.save(&format!("{model_path}/tokenizer.bin"));

config.save(output_path.join("config.json")).unwrap();
model
.clone()
@@ -158,35 +170,6 @@
seed,
);
}
Commands::Run {
model_path: path,
prompt,
n_new_tokens,
seed,
..
} => {
let device = <B as Backend>::Device::default();

let tokenizer = CharTokenizer::new();

/* Alternatively use `SimpleVowelTokenizer` */
// let tokenizer = SimpleVowelTokenizer::load(&format!("{path}/tokenizer.bin"));

let config = TrainingConfig::load(format!("{path}/config.json")).unwrap();
let record = CompactRecorder::new()
.load(format!("{path}/model").into(), &device)
.unwrap();
let model: Model<B> = config.model.init(&device).load_record(record);

gpt_burn::run(
&model,
&tokenizer,
&prompt.unwrap_or("\n".into()),
n_new_tokens,
config.model.context_length,
seed,
);
}
}
}

@@ -201,6 +184,18 @@ struct Cli {

#[derive(Subcommand)]
enum Commands {
/// Generate text using a pre-trained model
Run {
model_path: PathBuf,
#[arg(short, long)]
prompt: Option<String>,
#[arg(short, long, default_value_t = 1000)]
n_new_tokens: usize,
#[arg(short, long, default_value_t = 0)]
seed: u64,
#[arg(long , action = clap::ArgAction::HelpLong)]
help: Option<bool>,
},
/// Train a new model
Train {
#[arg(short = 'o', long, value_name = "PATH")]
@@ -232,16 +227,4 @@ enum Commands {
#[arg(long , action = clap::ArgAction::HelpLong)]
help: Option<bool>,
},
/// Generate text using a pre-trained model
Run {
model_path: String,
#[arg(short, long)]
prompt: Option<String>,
#[arg(short, long, default_value_t = 1000)]
n_new_tokens: usize,
#[arg(short, long, default_value_t = 0)]
seed: u64,
#[arg(long , action = clap::ArgAction::HelpLong)]
help: Option<bool>,
},
}
5 changes: 3 additions & 2 deletions src/model.rs
@@ -135,7 +135,6 @@ struct Block<B: Backend> {
impl<B: Backend> Block<B> {
fn forward(&self, input: Tensor<B, 3>) -> Tensor<B, 3> {
let x = input.clone();

let x = x.clone() + self.multi_head.forward(self.norm_1.forward(x));
let x = x.clone() + self.pwff.forward(self.norm_2.forward(x));

@@ -230,8 +229,9 @@ impl<B: Backend> MultiHeadAttention<B> {
let x = x.matmul(v);
let x = x.swap_dims(1, 2).reshape([b, t, self.n_heads * self.d_k]);
let x = self.resid_dropout.forward(x);
let x = self.out.forward(x);

self.out.forward(x)
x
}
}

@@ -282,6 +282,7 @@ impl<B: Backend> PositionWiseFeedForward<B> {
let x = self.gelu.forward(x);
let x = self.linear_2.forward(x);
let x = self.dropout.forward(x);

x
}
}
3 changes: 0 additions & 3 deletions src/train.rs
@@ -111,9 +111,6 @@ pub fn train<B: AutodiffBackend>(
println!("{BOLD}store checkpoint model to: {model_path}{RESET}");
fs::create_dir_all(&model_path).ok();

/* Uncomment to use `SimpleVowelTokenizer` */
// tokenizer.save(&format!("{model_path}/tokenizer.bin"));

config.save(format!("{model_path}/config.json")).unwrap();
model
.clone()
