37 changes: 37 additions & 0 deletions README.md
@@ -10,6 +10,32 @@
</p>

<br>
<h1>GPT-OSS-20B optimized to run on 8GB VRAM</h1>
<ul>
<li>Lazy loading of Transformer layers (about 2.5x slower than without lazy loading, but allows running on an 8GB 3070 Ti Laptop GPU with 32GB of system RAM); see the sketch after this list</li>
<li>Added a KV cache to speed up inference (torch backend)</li>
<li>Optimized weight loading speed</li>
<li>Optimized forward pass and attention</li>
</ul>
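
Lazy loading here means each transformer block's weights stay in system RAM and are copied to the GPU only for the duration of that block's forward pass. A minimal PyTorch sketch of the pattern (the `LazyBlock` wrapper below is an illustration, not the actual code in this repo):

```python
import torch

class LazyBlock(torch.nn.Module):
    """Keeps a transformer block's weights on the CPU except during its forward pass."""

    def __init__(self, block: torch.nn.Module, device: str = "cuda"):
        super().__init__()
        self.block = block.to("cpu")   # weights live in system RAM
        self.device = device

    def forward(self, *args, **kwargs):
        self.block.to(self.device)     # stream weights into VRAM just in time
        try:
            return self.block(*args, **kwargs)
        finally:
            self.block.to("cpu")       # free VRAM for the next block
```

The trade-off is a host-to-device copy per layer on every pass, which is where the roughly 2.5x slowdown quoted above comes from.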

## 🎥 Demo Video

Watch GPT-OSS 20B running on just 8GB of VRAM:

[![GPT-OSS 20B Demo](https://englyk.com/gpt-oss-20b-8gb-vram.jpg)](https://englyk.com/gpt_oss_20b_8gb_vram.mp4)

*Click the image to watch the full demonstration*, or [watch on YouTube](https://youtu.be/0g7MBALZM8c).

<h2>UPDATE: 08/31/2025 - Added support for 6 GB VRAM with gpt-oss-20b!</h2>

- Optimized MLPBlock
- gpt_oss.generate requires a minimum of 6 GB VRAM
- gpt_oss.chat requires a minimum of 8 GB VRAM
- gpt_oss.chat Windows support via the pyreadline3 module
- Options are auto-tuned for the available VRAM (see the sketch below)
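
A minimal sketch of how such VRAM-based auto-tuning can work in PyTorch; the option names (`lazy_load`, `pin_memory`) and thresholds below are illustrative assumptions, not the exact values used here:

```python
import torch

def pick_options() -> dict:
    """Choose loading options from the free VRAM reported by CUDA (illustrative thresholds)."""
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    free_gb = free_bytes / 1024**3
    if free_gb >= 12:
        return {"lazy_load": False, "pin_memory": True}   # enough VRAM to keep weights resident
    if free_gb >= 8:
        return {"lazy_load": True, "pin_memory": True}    # 8 GB class GPUs (gpt_oss.chat)
    return {"lazy_load": True, "pin_memory": False}       # 6 GB class GPUs (gpt_oss.generate)
```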

__________________________________________

Welcome to the gpt-oss series, [OpenAI's open-weight models](https://openai.com/open-models/) designed for powerful reasoning, agentic tasks, and versatile developer use cases.

@@ -199,6 +225,17 @@ And then run:
torchrun --nproc-per-node=4 -m gpt_oss.generate gpt-oss-120b/original/
```

## Windows run example

```shell
python -m gpt_oss.generate --backend torch gpt-oss-20b/original/ -p "Hi" -l 10
```

With the line profiler:

```shell
kernprof -l -v -m gpt_oss.generate --backend torch gpt-oss-20b/original/ -p "Hi" -l 10
```


## Reference Triton implementation (single GPU)

We also include an optimized reference implementation that uses [an optimized triton MoE kernel](https://github.com/triton-lang/triton/tree/main/python/triton_kernels/triton_kernels) that supports MXFP4. It also includes some optimizations in the attention code to reduce memory cost. To run this implementation, the nightly versions of triton and torch need to be installed. This version can run `gpt-oss-120b` on a single 80GB GPU.
11 changes: 8 additions & 3 deletions gpt_oss/chat.py
@@ -12,7 +12,10 @@
try:
import gnureadline as readline
except ImportError:
import readline
try:
import readline
except ImportError:
import pyreadline3 as readline

import torch
import termcolor
@@ -69,7 +72,7 @@ def main(args):
from gpt_oss.torch.model import TokenGenerator as TorchGenerator
from gpt_oss.torch.utils import init_distributed
device = init_distributed()
generator = TorchGenerator(args.checkpoint, device)
generator = TorchGenerator(args.checkpoint, device, pin_memory=False)
case "vllm":
from gpt_oss.vllm.token_generator import TokenGenerator as VLLMGenerator
generator = VLLMGenerator(args.checkpoint, tensor_parallel_size=2)
@@ -245,7 +248,9 @@ async def run_tool():
field_created = False
current_output_text = ""
output_text_delta_buffer = ""
for predicted_token in generator.generate(tokens, encoding.stop_tokens_for_assistant_actions()):
for predicted_token in generator.generate(tokens, encoding.stop_tokens_for_assistant_actions(),
#temperature=0, max_tokens=10
):
parser.process(predicted_token)
if args.raw:
print(encoding.decode([predicted_token]), end="", flush=True)
12 changes: 11 additions & 1 deletion gpt_oss/generate.py
@@ -7,14 +7,24 @@

from gpt_oss.tokenizer import get_tokenizer

try:
    from line_profiler import profile  # real decorator when line_profiler/kernprof is installed
except ImportError:
    # Fall back to a no-op decorator so plain `python -m gpt_oss.generate` still works.
    profile = lambda f: f

import os
# Configure the CUDA caching allocator; must be set before torch initializes CUDA
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False"

@profile
def main(args):
match args.backend:
case "torch":
from gpt_oss.torch.utils import init_distributed
from gpt_oss.torch.model import TokenGenerator as TorchGenerator
device = init_distributed()
generator = TorchGenerator(args.checkpoint, device=device)
generator = TorchGenerator(args.checkpoint, device=device, pin_memory=True)
case "triton":
from gpt_oss.torch.utils import init_distributed
from gpt_oss.triton.model import TokenGenerator as TritonGenerator