
Commit 803677e

Merge branch 'master' into master
2 parents: 7d9f603 + 18d4127

374 files changed, +51749 -396 lines changed


.gitattributes

+2
@@ -0,0 +1,2 @@
+src/flag_gems/runtime/backend/_iluvatar/**/* diff=nodiff
+src/flag_gems/runtime/backend/_metax/**/* diff=nodiff

benchmark/attri_util.py

+2
@@ -72,6 +72,8 @@ class BenchmarkMetrics:
     tflops: Optional[float] = None
     # Utilization (not implemented yet)
     utilization: Optional[float] = None
+    # Speedup compared to base data
+    compared_speedup: Optional[float] = None
     # Error message
     error_msg: Optional[str] = None
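A note on how the new compared_speedup field is meant to be filled: it holds the ratio of a baseline run's latency (read from a second log file) to this run's latency, computed later in benchmark/summary_for_plot.py's compare_main. A minimal sketch, with invented numbers:

# Illustrative only: the latencies below are made up; real values come from parsed logs.
from attri_util import BenchmarkMetrics  # assumes the benchmark/ directory is on sys.path

metric = BenchmarkMetrics(latency=0.8)  # latency measured in this run, in ms
baseline_latency = 1.2                  # latency of the same op/dtype/shape in the baseline log
metric.compared_speedup = baseline_latency / metric.latency  # -> 1.5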

benchmark/conftest.py

+9 -1
@@ -21,6 +21,7 @@
 )
 
 device = flag_gems.device
+vendor_name = flag_gems.vendor_name
 
 
 class BenchConfig:
@@ -29,6 +30,11 @@ def __init__(self):
         self.bench_level = BenchLevel.COMPREHENSIVE
         self.warm_up = DEFAULT_WARMUP_COUNT
         self.repetition = DEFAULT_ITER_COUNT
+        if (
+            vendor_name == "kunlunxin"
+        ):  # Speed Up Benchmark Test, Big Shape Will Cause Timeout
+            self.warm_up = 1
+            self.repetition = 1
         self.record_log = False
         self.user_desired_dtypes = None
         self.user_desired_metrics = None
@@ -41,7 +47,9 @@ def __init__(self):
 
 def pytest_addoption(parser):
     parser.addoption(
-        "--mode",
+        "--mode"
+        if vendor_name != "kunlunxin"
+        else "--fg_mode",  # TODO: fix pytest-* common --mode args
         action="store",
         default=device,
         required=False,
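In practice the rename means the benchmark suite's own option is exposed as --fg_mode on kunlunxin and as --mode everywhere else. A hedged sketch of how a fixture could read it back; the fixture name and scope are assumptions, not part of this commit:

import flag_gems
import pytest

vendor_name = flag_gems.vendor_name


@pytest.fixture(scope="session")
def bench_mode(request):
    # pytest's getoption accepts the registered flag name, dashes included.
    flag = "--mode" if vendor_name != "kunlunxin" else "--fg_mode"
    return request.config.getoption(flag)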

benchmark/core_shapes.yaml

+27
@@ -192,3 +192,30 @@ AttentionBenchmark:
     - [4, 8, 2048, 128]
     - [4, 8, 3072, 128]
     - [4, 8, 4096, 128]
+
+KronBenchmark:
+  shapes:
+    - [16,16]
+    - [64,64]
+    - [128,128]
+    - [256,256]
+    - [4, 8, 16, 32]
+    - [4, 8, 32, 32]
+    - [4, 8, 64, 32]
+    - [4, 8, 128, 32]
+
+IndexPutAccFalseBenchmark:
+  shapes:
+    - [[268435456,], [[65536,],], [65536,]]
+    - [[32, 32], [[8,], [2, 8]], [8,]]
+    - [[1024, 1024], [[4, 64],], [1024,]]
+    - [[512, 512, 512], [[2, 128], [128,], [128,]], [128,]]
+    - [[512, 512, 512], [[2, 128],], [512,]]
+
+IndexPutAccTrueBenchmark:
+  shapes:
+    - [[268435456,], [[65536,],], [65536,]]
+    - [[32, 32], [[8,], [8,]], [8,]]
+    - [[1024, 1024], [[64,], [64,]], [64,]]
+    - [[512, 512, 512], [[128,], [128,], [128,]], [128,]]
+    - [[512, 512, 512], [[2, 128], [2, 128], [2, 128]], [2, 128]]
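A hedged reading of the IndexPut entries: each item looks like (input shape, list of index shapes, values shape), mirroring torch.Tensor.index_put_'s arguments, with the AccTrue/AccFalse variants mapping to accumulate=True/False. That interpretation is inferred from the benchmark names, not stated in the YAML. A sketch of materializing one entry under that assumption:

import torch

# One IndexPutAccFalseBenchmark entry: [[32, 32], [[8,], [2, 8]], [8,]]
input_shape, index_shapes, values_shape = [32, 32], [[8], [2, 8]], [8]

inp = torch.randn(input_shape)
indices = [
    torch.randint(0, dim, tuple(idx_shape))
    for dim, idx_shape in zip(input_shape, index_shapes)
]
values = torch.randn(values_shape)
inp.index_put_(indices, values, accumulate=False)  # the AccTrue variant would pass accumulate=True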

benchmark/performance_utils.py

+20 -5
@@ -1,6 +1,7 @@
 import gc
 import importlib
 import logging
+import os
 import time
 from typing import Any, Generator, List, Optional, Tuple
 
@@ -28,7 +29,11 @@
 torch_backend_device = flag_gems.runtime.torch_backend_device
 torch_device_fn = flag_gems.runtime.torch_device_fn
 device = flag_gems.device
-torch_backend_device.matmul.allow_tf32 = False
+vendor_name = flag_gems.vendor_name
+if device == "musa":
+    torch.backends.mudnn.allow_tf32 = False
+else:
+    torch_backend_device.matmul.allow_tf32 = False
 
 
 def SkipVersion(module_name, skip_pattern):
@@ -225,6 +230,11 @@ def init_user_config(self):
         self.cpu_mode = Config.cpu_mode
         self.set_dtypes(Config.user_desired_dtypes)
         self.set_metrics(Config.user_desired_metrics)
+        if vendor_name == "kunlunxin":
+            Config.shape_file = os.path.join(
+                os.path.dirname(__file__),
+                "../src/flag_gems/runtime/backend/_kunlunxin/core_shapes.yaml",
+            )  # Speed Up Benchmark Test, Big Shape Will Cause Timeout
         self.set_shapes(Config.shape_file)
 
     def set_gems(self, gems_op):
@@ -247,7 +257,12 @@ def get_latency(self, op, *args, **kwargs):
             end = time.time()
             latency = (end - start) / Config.repetition * 1000
         else:
-            latency = triton.testing.do_bench(
+            do_bench = (
+                triton.musa_testing.do_bench
+                if device == "musa"
+                else triton.testing.do_bench
+            )
+            latency = do_bench(
                 fn,
                 warmup=Config.warm_up,
                 rep=Config.repetition,
@@ -457,10 +472,10 @@ def generate_tensor_input(shape, dtype, device):
             torch.iinfo(dtype).max,
             shape,
             dtype=dtype,
-            device=device,
-        )
+            device="cpu",
+        ).to(device)
     elif dtype in BOOL_DTYPES:
-        return torch.randint(0, 2, size=shape, dtype=dtype, device=device)
+        return torch.randint(0, 2, size=shape, dtype=dtype, device="cpu").to(device)
 
 
 def binary_input_fn(shape, cur_dtype, device):
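The last hunk switches integer and bool input generation to draw random data on the CPU and then copy it to the target device. The commit does not state the motivation; a plausible reading is that device-side randint is unreliable or unsupported for these dtypes on some vendor backends. A minimal sketch of the resulting pattern, with example shape/dtype/device values:

import torch

def int_input(shape, dtype, device):
    # Draw on the host RNG, then move the tensor to the benchmark device.
    return torch.randint(
        torch.iinfo(dtype).min,
        torch.iinfo(dtype).max,
        shape,
        dtype=dtype,
        device="cpu",
    ).to(device)

x = int_input((1024,), torch.int32, "cpu")  # "cpu" stands in for the real benchmark device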

benchmark/summary_for_plot.py

+164 -20
@@ -32,10 +32,13 @@
 import json
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import List
+from typing import Any, Dict, List
 
 from attri_util import BenchmarkMetrics, BenchmarkResult
 
+# to enable log files crossing speedup calculation
+ENABLE_COMPARE = False
+
 
 @dataclass
 class SummaryResultOverDtype:
@@ -47,20 +50,50 @@ class SummaryResultOverDtype:
     int32_speedup: float = 0.0
     bool_speedup: float = 0.0
     cfloat_speedup: float = 0.0
+
+    # to calculate the speedup across log files.
+    compared_float16_speedup: float = 0.0
+    compared_float32_speedup: float = 0.0
+    compared_bfloat16_speedup: float = 0.0
+    compared_int16_speedup: float = 0.0
+    compared_int32_speedup: float = 0.0
+    compared_bool_speedup: float = 0.0
+    compared_cfloat_speedup: float = 0.0
     all_tests_passed: bool = False
 
     def __str__(self) -> str:
         all_shapes_status = "yes" if self.all_tests_passed else "no"
         return (
-            f"{self.op_name:<30} "
-            f"{self.float16_speedup:<20.6f} "
-            f"{self.float32_speedup:<20.6f} "
-            f"{self.bfloat16_speedup:<20.6f} "
-            f"{self.int16_speedup:<20.6f} "
-            f"{self.int32_speedup:<20.6f} "
-            f"{self.bool_speedup:<20.6f} "
-            f"{self.cfloat_speedup:<20.6f}"
-            f"{all_shapes_status:<20}"
+            (
+                f"{self.op_name:<30} "
+                f"{self.float16_speedup:<20.6f} "
+                f"{self.float32_speedup:<20.6f} "
+                f"{self.bfloat16_speedup:<20.6f} "
+                f"{self.int16_speedup:<20.6f} "
+                f"{self.int32_speedup:<20.6f} "
+                f"{self.bool_speedup:<20.6f} "
+                f"{self.cfloat_speedup:<20.6f}"
+                f"{self.compared_float16_speedup:<20.6f}"
+                f"{self.compared_float32_speedup:<20.6f}"
+                f"{self.compared_bfloat16_speedup:<20.6f}"
+                f"{self.compared_int16_speedup:<20.6f}"
+                f"{self.compared_int32_speedup:<20.6f}"
+                f"{self.compared_bool_speedup:<20.6f}"
+                f"{self.compared_cfloat_speedup:<20.6f}"
+                f"{all_shapes_status:<20}"
+            )
+            if ENABLE_COMPARE
+            else (
+                f"{self.op_name:<30} "
+                f"{self.float16_speedup:<20.6f} "
+                f"{self.float32_speedup:<20.6f} "
+                f"{self.bfloat16_speedup:<20.6f} "
+                f"{self.int16_speedup:<20.6f} "
+                f"{self.int32_speedup:<20.6f} "
+                f"{self.bool_speedup:<20.6f} "
+                f"{self.cfloat_speedup:<20.6f}"
+                f"{all_shapes_status:<20}"
+            )
         )
 
 
@@ -103,6 +136,56 @@ def parse_log(log_file_path: str) -> List[BenchmarkResult]:
     return benchmark_results
 
 
+def get_key_by_op_dtype_shape(op_name, dtype, shape):
+    return hex(hash((hash(op_name), hash(dtype), hash(shape))))
+
+
+def parse_log_to_dict(log_file_path: str) -> Dict[int, Any]:
+    with open(log_file_path, "r") as file:
+        log_lines = [
+            line
+            for line in file.read().strip().split("\n")
+            if line.startswith("[INFO]")
+        ]
+
+    # dict(op_name, dict(dtype, dict(shape, latency))
+    benchmark_results = dict()
+    for line in log_lines:
+        if line.startswith("[INFO]"):
+            json_str = line[len("[INFO] ") :]
+            data = json.loads(json_str)
+            op_name = (data["op_name"],)
+            dtype = (data["dtype"],)
+            mode = (data["mode"],)
+            level = (data["level"],)
+            benchmark_result = BenchmarkResult(
+                op_name,
+                dtype,
+                mode,
+                level,
+                result=[
+                    BenchmarkMetrics(
+                        legacy_shape=metric.get("legacy_shape"),
+                        shape_detail=metric.get("shape_detail", []),
+                        latency_base=metric.get("latency_base"),
+                        latency=metric.get("latency"),
+                        speedup=metric.get("speedup"),
+                        accuracy=metric.get("accuracy"),
+                        tflops=metric.get("tflops"),
+                        utilization=metric.get("utilization"),
+                        error_msg=metric.get("error_msg"),
+                    )
+                    for metric in data["result"]
+                ],
+            )
+            for result in benchmark_result.result:
+                key = get_key_by_op_dtype_shape(
+                    op_name[0], dtype[0], str(result.shape_detail)
+                )
+                benchmark_results[key] = result.latency
+    return benchmark_results
+
+
 def calculate_avg_speedup_over_dtype(metrics):
     speedups = [
         metric.speedup
@@ -112,6 +195,15 @@ def calculate_avg_speedup_over_dtype(metrics):
     return sum(speedups) / len(speedups) if speedups else 0.0
 
 
+def calculate_avg_compared_speedup_over_dtype(metrics):
+    compared_speedups = [
+        metric.compared_speedup
+        for metric in metrics
+        if metric.compared_speedup is not None and metric.error_msg is None
+    ]
+    return sum(compared_speedups) / len(compared_speedups) if compared_speedups else 0.0
+
+
 def all_benchshape_passed(metrics):
     return all(metric.error_msg is None for metric in metrics)
 
@@ -132,6 +224,7 @@ def summary_for_plot(benchmark_results):
     for item in benchmark_results:
         op_name = item.op_name
         avg_speedup = calculate_avg_speedup_over_dtype(item.result)
+        avg_compared_speedup = calculate_avg_compared_speedup_over_dtype(item.result)
         cur_op_summary = summary[op_name]
         cur_op_summary.op_name = op_name
         cur_op_summary.all_tests_passed = all_benchshape_passed(item.result)
@@ -140,20 +233,47 @@ def summary_for_plot(benchmark_results):
             dtype_mapping.get(item.dtype, "float16_speedup"),
             avg_speedup,
         )
+        if ENABLE_COMPARE:
+            setattr(
+                summary[op_name],
+                "compared_" + dtype_mapping.get(item.dtype, "float16_speedup"),
+                avg_compared_speedup,
+            )
 
     # sort the keys based on `op_name`
     sorted_summary = sorted(summary.values(), key=lambda x: x.op_name)
 
     header = (
-        f"{'op_name':<30} "
-        f"{'float16_speedup':<20} "
-        f"{'float32_speedup':<20} "
-        f"{'bfloat16_speedup':<20} "
-        f"{'int16_speedup':<20} "
-        f"{'int32_speedup':<20} "
-        f"{'bool_speedup':<20} "
-        f"{'cfloat_speedup':<20}"
-        f"{'all_tests_passed':<20}"
+        (
+            f"{'op_name':<30} "
+            f"{'float16_speedup':<20} "
+            f"{'float32_speedup':<20} "
+            f"{'bfloat16_speedup':<20} "
+            f"{'int16_speedup':<20} "
+            f"{'int32_speedup':<20} "
+            f"{'bool_speedup':<20} "
+            f"{'cfloat_speedup':<20}"
+            f"{'comp_fp16_speedup':<20}"
+            f"{'comp_fp32_speedup':<20}"
+            f"{'comp_bf16_speedup':<20}"
+            f"{'comp_int16_speedup':<20}"
+            f"{'comp_int32_speedup':<20}"
+            f"{'comp_bool_speedup':<20}"
+            f"{'comp_cfloat_speedup':<20}"
+            f"{'all_tests_passed':<20}"
+        )
+        if ENABLE_COMPARE
+        else (
+            f"{'op_name':<30} "
+            f"{'float16_speedup':<20} "
+            f"{'float32_speedup':<20} "
+            f"{'bfloat16_speedup':<20} "
+            f"{'int16_speedup':<20} "
+            f"{'int32_speedup':<20} "
+            f"{'bool_speedup':<20} "
+            f"{'cfloat_speedup':<20}"
+            f"{'all_tests_passed':<20}"
+        )
     )
 
     print(header)
@@ -163,6 +283,19 @@ def summary_for_plot(benchmark_results):
     return summary
 
 
+def compare_main(log_file_a, log_file_b):
+    result_a = parse_log(log_file_a)
+    result_b = parse_log_to_dict(log_file_b)
+    for result in result_a:
+        for sub_result in result.result:
+            key = get_key_by_op_dtype_shape(
+                result.op_name, result.dtype, str(sub_result.shape_detail)
+            )
+            sub_result.compared_speedup = result_b.get(key, 0) / sub_result.latency
+
+    summary_for_plot(result_a)
+
+
 def main(log_file_path):
     result = parse_log(log_file_path)
     summary_for_plot(result)
@@ -171,6 +304,17 @@ def main(log_file_path):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Parse benchmark log file.")
     parser.add_argument("log_file_path", type=str, help="Path to the log file.")
+    parser.add_argument(
+        "--compare",
+        "-c",
+        type=str,
+        default="",
+        help="Path to a log file with baseline data to get speedup statistics across 2 log files",
+    )
     args = parser.parse_args()
 
-    main(args.log_file_path)
+    if not args.compare == "":
+        ENABLE_COMPARE = True
+        compare_main(args.log_file_path, args.compare)
+    else:
+        main(args.log_file_path)
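Taken together, the summary script now supports a baseline comparison mode, i.e. python summary_for_plot.py current.log --compare baseline.log: it parses both logs, matches rows by (op_name, dtype, shape), fills compared_speedup as baseline latency divided by current latency, and prints the extra comp_* columns. A hedged sketch of driving the same path from Python; the log file names are placeholders:

import summary_for_plot as sfp  # assumes the benchmark/ directory is importable

sfp.ENABLE_COMPARE = True  # module-level switch that enables the comp_* columns
sfp.compare_main("current.log", "baseline.log")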
