125 changes: 73 additions & 52 deletions units/en/unit1/4.md
@@ -170,48 +170,52 @@ Now let's explore the chat template formatting. We will create different types o
# Create different types of conversations to test
conversations = {
"simple_qa": [
{"role": "system", "content": "/no_think"},
{"role": "user", "content": "What is machine learning?"},
],

"with_system": [
{"role": "system", "content": "You are a helpful AI assistant specialized in explaining technical concepts clearly."},
{
"role": "system",
"content": "You are a helpful AI assistant specialized in explaining technical concepts clearly. /no_think",
},
{"role": "user", "content": "What is machine learning?"},
],

"multi_turn": [
{"role": "system", "content": "You are a math tutor."},
{"role": "system", "content": "You are a math tutor. /no_think"},
{"role": "user", "content": "What is calculus?"},
{"role": "assistant", "content": "Calculus is a branch of mathematics that deals with rates of change and accumulation of quantities."},
{
"role": "assistant",
"content": "Calculus is a branch of mathematics that deals with rates of change and accumulation of quantities.",
},
{"role": "user", "content": "Can you give me a simple example?"},
],

"reasoning_task": [
{"role": "user", "content": "Solve step by step: If a train travels 120 miles in 2 hours, what is its average speed?"},
]
{"role": "system", "content": "/think"},
{
"role": "user",
"content": "Solve step by step: If a train travels 120 miles in 2 hours, what is its average speed?",
},
],
}

for conv_type, messages in conversations.items():
print(f"--- {conv_type.upper()} ---")

# Format without generation prompt (for completed conversations)
formatted_complete = instruct_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False
messages, tokenize=False, add_generation_prompt=False
)

# Format with generation prompt (for inference)
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
messages, tokenize=False, add_generation_prompt=True
)

print("Complete conversation format:")
print(formatted_complete)
print("\nWith generation prompt:")
print(formatted_prompt)
print("\n" + "="*50 + "\n")
print("\n" + "=" * 50 + "\n")
```

<details>
@@ -400,11 +404,12 @@ test_prompt = "Explain quantum computing in simple terms."
base_inputs = base_tokenizer(test_prompt, return_tensors="pt").to(device)

# Prepare the prompt for instruct model (with chat template)
instruct_messages = [{"role": "user", "content": test_prompt}]
instruct_messages = [
{"role": "system", "content": "/no_think"},
{"role": "user", "content": test_prompt}
]
instruct_formatted = instruct_tokenizer.apply_chat_template(
instruct_messages,
tokenize=False,
add_generation_prompt=True
instruct_messages, tokenize=False, add_generation_prompt=True
)
instruct_inputs = instruct_tokenizer(instruct_formatted, return_tensors="pt").to(device)

@@ -418,25 +423,29 @@ with torch.no_grad():
max_new_tokens=150,
temperature=0.7,
do_sample=True,
pad_token_id=base_tokenizer.eos_token_id
pad_token_id=base_tokenizer.eos_token_id,
)
base_response = base_tokenizer.decode(base_outputs[0], skip_special_tokens=True)
print(base_response[len(test_prompt):]) # Show only the generated part
print(base_response[len(test_prompt) :]) # Show only the generated part

print("\n" + "="*50)
print("\n" + "=" * 50)
print("Instruct model response:")
with torch.no_grad():
instruct_outputs = instruct_model.generate(
**instruct_inputs,
max_new_tokens=150,
temperature=0.7,
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id
pad_token_id=instruct_tokenizer.eos_token_id,
)
instruct_response = instruct_tokenizer.decode(
instruct_outputs[0], skip_special_tokens=False
)
instruct_response = instruct_tokenizer.decode(instruct_outputs[0], skip_special_tokens=True)
# Extract only the assistant's response
assistant_start = instruct_response.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
assistant_response = instruct_response[assistant_start:].split("<|im_end|>")[0]
assistant_start = instruct_response.find("<|im_start|>assistant\n") + len(
"<|im_start|>assistant\n"
)
assistant_response = instruct_response[assistant_start:]
print(assistant_response)
```

@@ -488,34 +497,46 @@ Here we probe SmolLM3's reasoning mode with math and proportionality problems, k
reasoning_prompts = [
"What is 15 × 24? Show your work.",
"A recipe calls for 2 cups of flour for 12 cookies. How much flour is needed for 30 cookies?",
"If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?"
"If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?",
]

thinking_prompts = [
"/no_think",
"/think"
]

print("=== TESTING REASONING CAPABILITIES ===\n")

for i, prompt in enumerate(reasoning_prompts, 1):
print(f"Problem {i}: {prompt}")

messages = [{"role": "user", "content": prompt}]
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = instruct_tokenizer(formatted_prompt, return_tensors="pt").to(device)

with torch.no_grad():
outputs = instruct_model.generate(
**inputs,
max_new_tokens=200,
temperature=0.3, # Lower temperature for more consistent reasoning
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id
for thinking_prompt in thinking_prompts:
print(f"Thinking prompt: {thinking_prompt}")
for i, prompt in enumerate(reasoning_prompts, 1):
print(f"Problem {i}: {prompt}")

messages = [
{"role":"system", "content": thinking_prompt},
{"role": "user", "content": prompt}
]
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
response = instruct_tokenizer.decode(outputs[0], skip_special_tokens=True)
assistant_start = response.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
assistant_response = response[assistant_start:].split("<|im_end|>")[0]
print(f"Answer: {assistant_response}")

print("\n" + "-"*50 + "\n")
inputs = instruct_tokenizer(formatted_prompt, return_tensors="pt").to(device)

with torch.no_grad():
outputs = instruct_model.generate(
**inputs,
max_new_tokens=200,
temperature=0.3, # Lower temperature for more consistent reasoning
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id,
)
response = instruct_tokenizer.decode(outputs[0], skip_special_tokens=False)
assistant_start = response.find("<|im_start|>assistant\n") + len(
"<|im_start|>assistant\n"
)
assistant_response = response[assistant_start:].split("<|im_end|>")[0]
print(f"Answer: {assistant_response}")

print("\n" + "-" * 50 + "\n")
```

If we dive into the output below, we can see the instruct model's hybrid reasoning in action across the `/think` and `/no_think` modes. When `/think` is active, the model encloses its reasoning process in `<think>` tags, using those tokens to explore possible solutions before answering. After the thinking process, the model provides the final answer, which we can extract with the chat template or, as here, with simple string manipulation.
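
To make that extraction step explicit, here is a minimal sketch of the string manipulation. The `split_thinking` helper and the hard-coded sample string are illustrative assumptions rather than part of the lesson code; the only assumption about the model output is that reasoning, when present, is wrapped in `<think>` tags and the turn ends with `<|im_end|>`.

```python
import re


def split_thinking(decoded_text: str) -> tuple[str, str]:
    """Separate the <think> reasoning block from the final answer (illustrative helper)."""
    match = re.search(r"<think>(.*?)</think>", decoded_text, flags=re.DOTALL)
    thinking = match.group(1).strip() if match else ""
    # Drop the <think> block and any end-of-turn token so only the answer remains
    answer = re.sub(r"<think>.*?</think>", "", decoded_text, flags=re.DOTALL)
    answer = answer.split("<|im_end|>")[0].strip()
    return thinking, answer


# Illustrative response string, not real model output
sample = "<think>120 miles / 2 hours = 60 mph</think>\nThe average speed is 60 mph.<|im_end|>"
thinking, answer = split_thinking(sample)
print("Reasoning:", thinking)
print("Answer:", answer)
```

In practice you would pass the decoded assistant text from the generation loop above instead of the hard-coded sample.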