From 59d98fd83cedefb07da861cda5260e216edf4697 Mon Sep 17 00:00:00 2001 From: Felix Andreas Date: Fri, 17 May 2024 18:36:38 +0200 Subject: [PATCH] =?UTF-8?q?refactor:=20Small=20cleanups=20=F0=9F=A7=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 93 +++++++++++++++++++++++------------------------ src/main.rs | 101 +++++++++++++++++++++------------------------------ src/model.rs | 5 ++- src/train.rs | 3 -- 4 files changed, 90 insertions(+), 112 deletions(-) diff --git a/README.md b/README.md index f4c7afd..7967f9b 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,17 @@ This project aims to be a clean and concise re-implementation of [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). The model implementation, contained in [`src/model.rs`](src/model.rs), is under 300 lines of code. While this was a fun exercise mostly for (my own) educational purposes, it demonstrates the utility of Rust and Burn in the machine learning domain: The entire project compiles into a single binary, making deployment relatively straightforward. -The project also includes a simple CLI for training and inference. At the moment, only a character-level tokenizer is supported, so official weights requiring a BPE tokenizer cannot be used yet. However, for fun, you can try out the small toy model I trained ([see inference](#inference)). +At the moment, only a character-level tokenizer is supported, so official weights requiring a BPE tokenizer cannot be used yet. However, for fun, you can try out the small toy model I trained ([see inference](#inference)). + +The project also includes a simple CLI for training and inference. + +``` +Usage: gpt-burn + +Commands: + run Generate text using a pre-trained model + train Train a new model +``` ## Installation @@ -69,6 +79,20 @@ Sie war trotz weniger als 10.000 ausgedehnter Größen wahrscheinlich auf folgen 2016 wurden rund 145 Händen nach Deutschland geladen. ``` +Further command line options are: + +``` +Usage: gpt-burn run [OPTIONS] + +Arguments: + + +Options: + -p, --prompt + -n, --n-new-tokens [default: 1000] + -s, --seed [default: 0] +``` + ## Training To train your own model, run: @@ -80,6 +104,26 @@ gpt-burn train --context-length 128 --n-layers 12 --n-heads 12 --d-model 768 --b > [!IMPORTANT] > Make sure `corpus.txt` is a utf-8 encoded text file! +You can pass most hyperparameters as a command-line option: + +``` +Usage: gpt-burn train [OPTIONS] + +Options: + -o, --output-path + -c, --context-length [default: 64] + -d, --d-model [default: 64] + -l, --n-layers [default: 2] + -h, --n-heads [default: 2] + -n, --n-steps [default: 50] + -b, --batch-size [default: 32] + -r, --learning-rate [default: 0.003] + -s, --seed [default: 0] + -t, --text-corpus [default: .data/corpus.txt] + -m, --n-mega-bytes Only use first megabytes of dataset for training + -x, --no-save Don't save trained model (useful for debugging) +``` + ## Tokenizer The model can be used with different tokenizers via the `Tokenizer` trait. 
Below you see how the following sentence @@ -108,53 +152,6 @@ Tokens: ["Albert", " ", "Einst", "ein", " ", "war", " ", "ein", " ", "schw", "ei Values: [2, 0, 3, 9, 0, 19, 0, 9, 0, 16, 10, 15, 1, 6, 1, 7, 13, 15, 11, 0, 17, 12, 11, 0, 5, 14, 0, 8, 11, 0, 4, 18] ``` -## CLI options - -The `gpt-burn` command has multiple subcommands: - -``` -Usage: gpt-burn - -Commands: - train Train a new model - run Generate text using a pre-trained model - help Print this message or the help of the given subcommand(s) -``` - -For inference, you can pass a model path and the number of new tokens that should be generated: - -``` -Usage: gpt-burn run [OPTIONS] - -Arguments: - - -Options: - -p, --prompt - -n, --n-new-tokens [default: 1000] - -s, --seed [default: 0] -``` - -For training, you can pass most hyperparameters as a command-line option: - -``` -Usage: gpt-burn train [OPTIONS] - -Options: - -o, --output-path - -c, --context-length [default: 64] - -d, --d-model [default: 64] - -l, --n-layers [default: 2] - -h, --n-heads [default: 2] - -n, --n-steps [default: 50] - -b, --batch-size [default: 32] - -r, --learning-rate [default: 0.003] - -s, --seed [default: 0] - -t, --text-corpus [default: .data/corpus.txt] - -m, --n-mega-bytes Only use first megabytes of dataset for training - -x, --no-save Don't save trained model (useful for debugging) -``` - ## References * [GPT-2 Paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) diff --git a/src/main.rs b/src/main.rs index d02ba37..4527df7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,12 +22,36 @@ use { }; fn main() { - /* Alternatively use CPU backend */ - // type B = burn::backend::ndarray::NdArray; - type B = Wgpu; + type B = Wgpu; // alternative backend: burn::backend::ndarray::NdArray type AutoB = Autodiff; match Cli::parse().command { + Commands::Run { + model_path: path, + prompt, + n_new_tokens, + seed, + .. + } => { + let device = ::Device::default(); + + let tokenizer = CharTokenizer::new(); + let config = TrainingConfig::load(path.join("config.json")).unwrap(); + let model: Model = config.model.init(&device).load_record( + CompactRecorder::new() + .load(path.join("model"), &device) + .unwrap(), + ); + + gpt_burn::run( + &model, + &tokenizer, + &prompt.unwrap_or("\n".into()), + n_new_tokens, + config.model.context_length, + seed, + ); + } Commands::Train { text_corpus, output_path, @@ -43,10 +67,7 @@ fn main() { seed, .. 
} => { - // cli option defaults - let save = !no_save; - - // load text corpus and tokenizer + // load text corpus and instantiate tokenizer println!( "{BOLD}load {} file {text_corpus:?}{RESET} as dataset", n_mega_bytes.map_or_else( @@ -69,12 +90,6 @@ fn main() { .filter(|char| tokenizer.ttoi.contains_key(char)) .collect::(); - /* Uncomment to use `SimpleVowelTokenizer` */ - // let tokenizer = { - // let tokens = SimpleVowelTokenizer::tokenize(&text).collect::>(); - // SimpleVowelTokenizer::new(&tokens, vocab_size) - // }; - let mut train = tokenizer.encode(&text); let test = train.split_off((0.9 * train.len() as f32) as usize); @@ -119,10 +134,10 @@ fn main() { }, optimizer: AdamWConfig::new(), }; - let model = gpt_burn::train(&config, data_train, data_test, save); + let model = gpt_burn::train(&config, data_train, data_test, !no_save); // save trained model - if save { + if !no_save { let output_path = output_path.unwrap_or_else(|| { format!( ".data/gpt_{}k_{}context_{}", @@ -137,9 +152,6 @@ fn main() { fs::remove_dir_all(&output_path).ok(); fs::create_dir_all(&output_path).ok(); - /* Uncomment to use `SimpleVowelTokenizer` */ - // tokenizer.save(&format!("{model_path}/tokenizer.bin")); - config.save(output_path.join("config.json")).unwrap(); model .clone() @@ -158,35 +170,6 @@ fn main() { seed, ); } - Commands::Run { - model_path: path, - prompt, - n_new_tokens, - seed, - .. - } => { - let device = ::Device::default(); - - let tokenizer = CharTokenizer::new(); - - /* Alternatively use `SimpleVowelTokenizer` */ - // let tokenizer = SimpleVowelTokenizer::load(&format!("{path}/tokenizer.bin")); - - let config = TrainingConfig::load(format!("{path}/config.json")).unwrap(); - let record = CompactRecorder::new() - .load(format!("{path}/model").into(), &device) - .unwrap(); - let model: Model = config.model.init(&device).load_record(record); - - gpt_burn::run( - &model, - &tokenizer, - &prompt.unwrap_or("\n".into()), - n_new_tokens, - config.model.context_length, - seed, - ); - } } } @@ -201,6 +184,18 @@ struct Cli { #[derive(Subcommand)] enum Commands { + /// Generate text using a pre-trained model + Run { + model_path: PathBuf, + #[arg(short, long)] + prompt: Option, + #[arg(short, long, default_value_t = 1000)] + n_new_tokens: usize, + #[arg(short, long, default_value_t = 0)] + seed: u64, + #[arg(long , action = clap::ArgAction::HelpLong)] + help: Option, + }, /// Train a new model Train { #[arg(short = 'o', long, value_name = "PATH")] @@ -232,16 +227,4 @@ enum Commands { #[arg(long , action = clap::ArgAction::HelpLong)] help: Option, }, - /// Generate text using a pre-trained model - Run { - model_path: String, - #[arg(short, long)] - prompt: Option, - #[arg(short, long, default_value_t = 1000)] - n_new_tokens: usize, - #[arg(short, long, default_value_t = 0)] - seed: u64, - #[arg(long , action = clap::ArgAction::HelpLong)] - help: Option, - }, } diff --git a/src/model.rs b/src/model.rs index e350098..bdcd30d 100644 --- a/src/model.rs +++ b/src/model.rs @@ -135,7 +135,6 @@ struct Block { impl Block { fn forward(&self, input: Tensor) -> Tensor { let x = input.clone(); - let x = x.clone() + self.multi_head.forward(self.norm_1.forward(x)); let x = x.clone() + self.pwff.forward(self.norm_2.forward(x)); @@ -230,8 +229,9 @@ impl MultiHeadAttention { let x = x.matmul(v); let x = x.swap_dims(1, 2).reshape([b, t, self.n_heads * self.d_k]); let x = self.resid_dropout.forward(x); + let x = self.out.forward(x); - self.out.forward(x) + x } } @@ -282,6 +282,7 @@ impl PositionWiseFeedForward { let x = 
self.gelu.forward(x); let x = self.linear_2.forward(x); let x = self.dropout.forward(x); + x } } diff --git a/src/train.rs b/src/train.rs index b9058d9..8eb5076 100644 --- a/src/train.rs +++ b/src/train.rs @@ -111,9 +111,6 @@ pub fn train( println!("{BOLD}store checkpoint model to: {model_path}{RESET}"); fs::create_dir_all(&model_path).ok(); - /* Uncomment to use `SimpleVowelTokenizer` */ - // tokenizer.save(&format!("{model_path}/tokenizer.bin")); - config.save(format!("{model_path}/config.json")).unwrap(); model .clone()
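
The README hunk above introduces the `Tokenizer` trait as the seam for swapping tokenizers, and `main.rs` relies on `tokenizer.encode(...)` together with the `ttoi` map when preparing the corpus. Below is a minimal, self-contained sketch of what a character-level tokenizer behind such a trait can look like. The real trait definition is not part of this patch, so the method names and signatures used here (`encode`, `decode`, `vocab_size`) are assumptions for illustration only, not the crate's actual API.

```rust
use std::collections::HashMap;

/// Hypothetical trait, modeled loosely on how `main.rs` uses the tokenizer;
/// the actual `Tokenizer` trait in this repository may differ.
trait Tokenizer {
    fn encode(&self, text: &str) -> Vec<usize>;
    fn decode(&self, ids: &[usize]) -> String;
    fn vocab_size(&self) -> usize;
}

/// Toy character-level tokenizer: every distinct character gets one id.
struct ToyCharTokenizer {
    ttoi: HashMap<char, usize>, // token -> id
    itot: Vec<char>,            // id -> token
}

impl ToyCharTokenizer {
    /// Build the vocabulary from the distinct characters of a corpus.
    fn new(corpus: &str) -> Self {
        let mut itot: Vec<char> = corpus.chars().collect();
        itot.sort_unstable();
        itot.dedup();
        let ttoi = itot.iter().enumerate().map(|(i, &c)| (c, i)).collect();
        Self { ttoi, itot }
    }
}

impl Tokenizer for ToyCharTokenizer {
    fn encode(&self, text: &str) -> Vec<usize> {
        // Characters outside the vocabulary are skipped, mirroring the
        // `filter(|char| tokenizer.ttoi.contains_key(char))` step in `main.rs`.
        text.chars().filter_map(|c| self.ttoi.get(&c).copied()).collect()
    }

    fn decode(&self, ids: &[usize]) -> String {
        ids.iter().filter_map(|&i| self.itot.get(i).copied()).collect()
    }

    fn vocab_size(&self) -> usize {
        self.itot.len()
    }
}

fn main() {
    let tokenizer = ToyCharTokenizer::new("Albert Einstein war ein theoretischer Physiker");
    let ids = tokenizer.encode("Einstein war ein Physiker");
    println!("ids:        {ids:?}");
    println!("round-trip: {}", tokenizer.decode(&ids));
    println!("vocab size: {}", tokenizer.vocab_size());
}
```

A trait object or generic bound over such a trait is what lets `gpt_burn::run` and the training loop stay agnostic about whether a character-level or vowel-based tokenizer produced the token ids.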