125 changes: 73 additions & 52 deletions units/en/unit1/4.md
@@ -170,48 +170,52 @@ Now let's explore the chat template formatting. We will create different types o
# Create different types of conversations to test
conversations = {
"simple_qa": [
{"role": "system", "content": "/no_think"},
{"role": "user", "content": "What is machine learning?"},
],

"with_system": [
{"role": "system", "content": "You are a helpful AI assistant specialized in explaining technical concepts clearly."},
{
"role": "system",
"content": "You are a helpful AI assistant specialized in explaining technical concepts clearly. /no_think",
},
{"role": "user", "content": "What is machine learning?"},
],

"multi_turn": [
{"role": "system", "content": "You are a math tutor."},
{"role": "system", "content": "You are a math tutor. /no_think"},
{"role": "user", "content": "What is calculus?"},
{"role": "assistant", "content": "Calculus is a branch of mathematics that deals with rates of change and accumulation of quantities."},
{
"role": "assistant",
"content": "Calculus is a branch of mathematics that deals with rates of change and accumulation of quantities.",
},
{"role": "user", "content": "Can you give me a simple example?"},
],

"reasoning_task": [
{"role": "user", "content": "Solve step by step: If a train travels 120 miles in 2 hours, what is its average speed?"},
]
{"role": "system", "content": "/think"},
{
"role": "user",
"content": "Solve step by step: If a train travels 120 miles in 2 hours, what is its average speed?",
},
],
}

for conv_type, messages in conversations.items():
print(f"--- {conv_type.upper()} ---")

# Format without generation prompt (for completed conversations)
formatted_complete = instruct_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False
messages, tokenize=False, add_generation_prompt=False
)

# Format with generation prompt (for inference)
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
messages, tokenize=False, add_generation_prompt=True
)

print("Complete conversation format:")
print(formatted_complete)
print("\nWith generation prompt:")
print(formatted_prompt)
print("\n" + "="*50 + "\n")
print("\n" + "=" * 50 + "\n")
```

<details>
@@ -400,11 +404,12 @@ test_prompt = "Explain quantum computing in simple terms."
base_inputs = base_tokenizer(test_prompt, return_tensors="pt").to(device)

# Prepare the prompt for instruct model (with chat template)
instruct_messages = [{"role": "user", "content": test_prompt}]
instruct_messages = [
{"role": "system", "content": "/no_think"},
{"role": "user", "content": test_prompt}
]
instruct_formatted = instruct_tokenizer.apply_chat_template(
instruct_messages,
tokenize=False,
add_generation_prompt=True
instruct_messages, tokenize=False, add_generation_prompt=True
)
instruct_inputs = instruct_tokenizer(instruct_formatted, return_tensors="pt").to(device)

@@ -418,25 +423,29 @@ with torch.no_grad():
max_new_tokens=150,
temperature=0.7,
do_sample=True,
pad_token_id=base_tokenizer.eos_token_id
pad_token_id=base_tokenizer.eos_token_id,
)
base_response = base_tokenizer.decode(base_outputs[0], skip_special_tokens=True)
print(base_response[len(test_prompt):]) # Show only the generated part
print(base_response[len(test_prompt) :]) # Show only the generated part

print("\n" + "="*50)
print("\n" + "=" * 50)
print("Instruct model response:")
with torch.no_grad():
instruct_outputs = instruct_model.generate(
**instruct_inputs,
max_new_tokens=150,
temperature=0.7,
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id
pad_token_id=instruct_tokenizer.eos_token_id,
)
instruct_response = instruct_tokenizer.decode(
instruct_outputs[0], skip_special_tokens=False
)
instruct_response = instruct_tokenizer.decode(instruct_outputs[0], skip_special_tokens=True)
# Extract only the assistant's response
assistant_start = instruct_response.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
assistant_response = instruct_response[assistant_start:].split("<|im_end|>")[0]
assistant_start = instruct_response.find("<|im_start|>assistant\n") + len(
"<|im_start|>assistant\n"
)
assistant_response = instruct_response[assistant_start:]
print(assistant_response)
```

@@ -488,34 +497,46 @@ Here we probe SmolLM3's reasoning mode with math and proportionality problems, k
reasoning_prompts = [
"What is 15 × 24? Show your work.",
"A recipe calls for 2 cups of flour for 12 cookies. How much flour is needed for 30 cookies?",
"If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?"
"If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?",
]

thinking_prompts = [
"/no_think",
"/think"
]

print("=== TESTING REASONING CAPABILITIES ===\n")

for i, prompt in enumerate(reasoning_prompts, 1):
print(f"Problem {i}: {prompt}")

messages = [{"role": "user", "content": prompt}]
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = instruct_tokenizer(formatted_prompt, return_tensors="pt").to(device)

with torch.no_grad():
outputs = instruct_model.generate(
**inputs,
max_new_tokens=200,
temperature=0.3, # Lower temperature for more consistent reasoning
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id
for thinking_prompt in thinking_prompts:
print(f"Thinking prompt: {thinking_prompt}")
for i, prompt in enumerate(reasoning_prompts, 1):
print(f"Problem {i}: {prompt}")

messages = [
{"role":"system", "content": thinking_prompt},
{"role": "user", "content": prompt}
]
formatted_prompt = instruct_tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
response = instruct_tokenizer.decode(outputs[0], skip_special_tokens=True)
assistant_start = response.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
assistant_response = response[assistant_start:].split("<|im_end|>")[0]
print(f"Answer: {assistant_response}")

print("\n" + "-"*50 + "\n")
inputs = instruct_tokenizer(formatted_prompt, return_tensors="pt").to(device)

with torch.no_grad():
outputs = instruct_model.generate(
**inputs,
max_new_tokens=200,
temperature=0.3, # Lower temperature for more consistent reasoning
do_sample=True,
pad_token_id=instruct_tokenizer.eos_token_id,
)
response = instruct_tokenizer.decode(outputs[0], skip_special_tokens=False)
assistant_start = response.find("<|im_start|>assistant\n") + len(
"<|im_start|>assistant\n"
)
assistant_response = response[assistant_start:].split("<|im_end|>")[0]
print(f"Answer: {assistant_response}")

print("\n" + "-" * 50 + "\n")
```

If we dive into the output below, we can see the instruct model's hybrid reasoning in action across the `/think` and `/no_think` modes. When `/think` is active, the model encloses its reasoning process in `<think>` tags, using those tokens to explore possible solutions before answering. After the thinking process, the model provides the final answer, which we can extract with the chat template or, as here, with simple string manipulation.
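
To make that extraction step explicit, here is a minimal sketch of the string manipulation. The `split_thinking` helper and the hard-coded sample string are illustrative assumptions rather than part of the lesson code; the only assumption about the model output is that reasoning, when present, is wrapped in `<think>` tags and the turn ends with `<|im_end|>`.

```python
import re


def split_thinking(decoded_text: str) -> tuple[str, str]:
    """Separate the <think> reasoning block from the final answer (illustrative helper)."""
    match = re.search(r"<think>(.*?)</think>", decoded_text, flags=re.DOTALL)
    thinking = match.group(1).strip() if match else ""
    # Drop the <think> block and any end-of-turn token so only the answer remains
    answer = re.sub(r"<think>.*?</think>", "", decoded_text, flags=re.DOTALL)
    answer = answer.split("<|im_end|>")[0].strip()
    return thinking, answer


# Illustrative response string, not real model output
sample = "<think>120 miles / 2 hours = 60 mph</think>\nThe average speed is 60 mph.<|im_end|>"
thinking, answer = split_thinking(sample)
print("Reasoning:", thinking)
print("Answer:", answer)
```

In practice you would pass the decoded assistant text from the generation loop above instead of the hard-coded sample.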