@@ -2859,6 +2859,38 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
+        [(8, 1, 8, True, True, True, "DEEPGEMM"),
+         (8, 1, 8, False, True, True, "DEEPGEMM"),
+         (8, 1, 8, True, True, True, "TRTLLM"),
+         (8, 1, 8, False, True, True, "TRTLLM")],
+        ids=[
+            "latency_deepgemm", "throughput_latency_deepgemm", "latency_trtllm",
+            "throughput_latency_trtllm"
+        ])
+    def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
+                              cuda_graph, overlap_scheduler, moe_backend):
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-235B-A22B-FP8",
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 kv_cache_config=kv_cache_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(