
Commit 63bdf94

Fix examples and add them to CI test (#28)

1 parent: 6ee7cdf

3 files changed: +8 -9 lines

.github/workflows/test_cuda.yml

Lines changed: 2 additions & 0 deletions
@@ -37,3 +37,5 @@ jobs:
         pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
         pip install --quiet .
         pytest tests
+        python examples/example_autoparallel.py
+        python examples/example_llama3.py
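
The workflow now runs each example script directly after the unit tests, so a broken example fails CI. The same check could instead live in the test suite itself; the snippet below is a minimal sketch (not part of this commit) of a hypothetical tests/test_examples.py that invokes the example scripts as subprocesses, assuming tests/ sits one level below the repository root.

# Hypothetical tests/test_examples.py -- not part of this commit.
# Runs each example script in a subprocess and fails if it exits non-zero,
# mirroring the two `python examples/...` steps added to the CUDA workflow.
import subprocess
import sys
from pathlib import Path

import pytest

EXAMPLES = [
    "examples/example_autoparallel.py",
    "examples/example_llama3.py",
]


@pytest.mark.parametrize("script", EXAMPLES)
def test_example_runs(script):
    # Assumes this file lives in tests/ directly under the repo root.
    repo_root = Path(__file__).resolve().parents[1]
    result = subprocess.run(
        [sys.executable, str(repo_root / script)],
        capture_output=True,
        text=True,
    )
    assert result.returncode == 0, result.stderr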

examples/example_autoparallel.py

Lines changed: 4 additions & 8 deletions
@@ -77,17 +77,13 @@ def forward(self, x):
 dim2 = dim1 * 4


-def model_fn():
-    return Block(nheads, dim1, dim2)
-
-
 def input_fn():
     return torch.rand(bs, seq_len, dim1, device="cuda")


 # parallelize the model
 with torch.device("meta"):
-    model = model_fn()
+    model = Block(nheads, dim1, dim2)
 autop = AutoParallel(model, input_fn, mesh)
 autop.add_parameter_memory_constraint(low=None, high=None)

@@ -96,11 +92,11 @@ def input_fn():
 autop.add_input_constraints([x_sharding])
 autop.add_output_constraints([x_sharding])

-
 sharding_placement = autop.optimize_placement()
-parallel_mod = autop.apply_placement(sharding_placement)

-# run weight init on our sharded DTensor params
+# AutoParallel produces a module with meta-DTensor parameters that need to be initialized
+parallel_mod = autop.apply_placement(sharding_placement)
+parallel_mod.to_empty(device="cuda")
 parallel_mod.init_weights()

 # now let's run it
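
The key fix here is the initialization order: the model is built on the meta device, so the parameters that come out of apply_placement are still meta tensors and must be materialized with to_empty before init_weights can write real values into them. The snippet below is a standalone sketch of that same meta-device pattern using a plain nn.Linear (no AutoParallel involved), just to show why to_empty has to precede any weight initialization; the init calls are illustrative, not what the example's init_weights does.

# Standalone illustration of the meta-device init pattern used in the example
# (plain PyTorch, no AutoParallel): build on "meta", materialize, then init.
import torch
import torch.nn as nn

with torch.device("meta"):
    # Parameters are allocated as meta tensors: shapes only, no storage.
    mod = nn.Linear(16, 16)

assert mod.weight.is_meta

# Materialize the parameters on the target device. Their values are
# uninitialized at this point, which is why an explicit init pass must
# follow (the example calls parallel_mod.init_weights() for this).
mod.to_empty(device="cuda" if torch.cuda.is_available() else "cpu")

# Re-run the usual initialization now that real storage exists.
with torch.no_grad():
    nn.init.kaiming_uniform_(mod.weight)
    nn.init.zeros_(mod.bias)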

examples/example_llama3.py

Lines changed: 2 additions & 1 deletion
@@ -586,7 +586,7 @@ def forward(self, tokens: torch.Tensor, input_batch: torch.Tensor | None = None)

 def model_fn():
     model_args = TransformerModelArgs(
-        n_layers=32, vocab_size=vocab_size, max_seq_len=seqlen
+        n_layers=8, vocab_size=vocab_size, max_seq_len=seqlen
     )
     m = Transformer(model_args)
     return m

@@ -628,6 +628,7 @@ def input_fn():
 parallel_mod = autop.apply_placement(sharding_placement)

 # run weight init on our sharded DTensor params
+parallel_mod.to_empty(device="cuda")
 parallel_mod.init_weights()

 # now let's run it
