
Commit a298159

Fix BigQuery feature
1 parent f817224 commit a298159

7 files changed (+28 −7 lines)


benchmarks/benchmark_db_utils.py

Lines changed: 5 additions & 4 deletions
@@ -141,7 +141,7 @@ def write_run(
 
   from benchmark_db_writer import bq_writer_utils
   from benchmark_db_writer import dataclass_bigquery_writer
-  from benchmark_db_writer.run_summary_writer import sample_run_summary_writer
+  from benchmark_db_writer.run_summary_writer import run_summary_writer
   from benchmark_db_writer.schema.workload_benchmark_v2 import workload_benchmark_v2_schema
 
   def get_db_client(
@@ -168,9 +168,9 @@ def get_db_client(
   print(options.model_id)
 
   if (
-      sample_run_summary_writer.validate_model_id(options.model_id, options.is_test)
-      and sample_run_summary_writer.validate_hardware_id(options.hardware_id, options.is_test)
-      and sample_run_summary_writer.validate_software_id(options.software_id, options.is_test)
+      run_summary_writer.validate_model_id(options.model_id, options.is_test)
+      and run_summary_writer.validate_hardware_id(options.hardware_id, options.is_test)
+      and run_summary_writer.validate_software_id(options.software_id, options.is_test)
   ):
     summary = workload_benchmark_v2_schema.WorkloadBenchmarkV2Schema(
         run_id=f"run-{uuid.uuid4()}",
@@ -179,6 +179,7 @@ def get_db_client(
         hardware_id=options.hardware_id,
         hardware_num_chips=number_of_chips,
         hardware_num_nodes=number_of_nodes,
+        hardware_num_slices=options.hardware_num_slices,
         result_success=run_success,
         configs_framework=framework_config_in_json,
         configs_env=env_variables,

benchmarks/globals.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 import os.path
 
 # This is the MaxText root: with "max_utils.py"; &etc. TODO: Replace `os.path.basename` with `os.path.abspath`
-MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "MaxText")
+MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "src/MaxText")
 
 # This is the maxtext repo root: with ".git" folder; "README.md"; "pyproject.toml"; &etc.
 MAXTEXT_REPO_ROOT = os.environ.get(
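
Checkouts that keep the package at the repo root can still point the tooling at the old location via the environment variable; a minimal sketch (the value is just the pre-change default):

```python
import os

# Opt out of the new "src/MaxText" default for a flat checkout; must run
# before benchmarks/globals.py is imported.
os.environ["MAXTEXT_PKG_DIR"] = "MaxText"
```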

benchmarks/maxtext_xpk_runner.py

Lines changed: 6 additions & 1 deletion
@@ -158,6 +158,9 @@ def __post_init__(self):
     else:
       self.num_devices_per_slice = int(self.device_type.split("-")[1]) / 2
       self.topology = ""
+    self.hardware_id = self.device_type.split("-")[0]
+    if self.hardware_id == "v5litepod":
+      self.hardware_id = "v5e"
 
 
 def wait_for_xpk_workload_completion(cluster_config: XpkClusterConfig, workload_name, xpk_path) -> int:
@@ -341,6 +344,7 @@ def _build_args_from_config(wl_config: WorkloadConfig) -> dict:
       "model_id": wl_config.model.model_type,
       "hardware_id": wl_config.hardware_id,
       "software_id": "jax_maxtext",
+      "hardware_num_slices": wl_config.num_slices,
       "number_of_chips": wl_config.num_devices_per_slice * wl_config.num_slices,
       "container_image_name": wl_config.base_docker_image,
       "global_batch_size": per_device_batch_size * wl_config.num_devices_per_slice * wl_config.num_slices,
@@ -445,7 +449,8 @@ def build_user_command(
           f"base_output_directory={wl_config.base_output_directory}",
           f"{vertex_tensorboard}",
           f"{run_name_command}",
-          f"{enable_metrics_cmd}" f"{upload_hlo_dump}",
+          f"{enable_metrics_cmd}",
+          f"{upload_hlo_dump}",
       ]
   )
   return command
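
The new `hardware_id` derivation in `__post_init__` is easy to check in isolation; a minimal sketch, assuming `device_type` strings of the form `<generation>-<chip count>` (e.g. "v5litepod-16"):

```python
def derive_hardware_id(device_type: str) -> str:
  """Mirrors the __post_init__ logic above: take the device_type prefix,
  normalizing "v5litepod" to its product name "v5e"."""
  hardware_id = device_type.split("-")[0]
  if hardware_id == "v5litepod":
    hardware_id = "v5e"
  return hardware_id

assert derive_hardware_id("v5litepod-16") == "v5e"
assert derive_hardware_id("v5p-128") == "v5p"
```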

benchmarks/recipes/runner_utils.py

Lines changed: 3 additions & 0 deletions
@@ -65,6 +65,9 @@ def generate_and_run_workloads(user_config, num_slices_list, num_steps, priority
       xpk_path=user_config.xpk_path,
       num_steps=num_steps,
       priority=priority,
+      generate_metrics_and_upload_to_big_query=user_config.bq_enable,
+      db_project=user_config.bq_db_project,
+      db_dataset=user_config.bq_db_dataset,
   )
 
   # Generate XPK command

benchmarks/recipes/user_configs.py

Lines changed: 5 additions & 0 deletions
@@ -70,6 +70,11 @@ class UserConfig:
   selected_model_names: list[str] = dataclasses.field(default_factory=lambda: ["llama3_1_8b_8192"])
   num_slices_list: list[int] = dataclasses.field(default_factory=lambda: [2])
 
+  # BigQuery configuration
+  bq_enable: bool = False
+  bq_db_project: str = ""
+  bq_db_dataset: str = ""
+
   # other configuration
   xpk_path: str = "~/xpk"
   max_restarts: int = 0
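
Putting the two recipe changes together, a hypothetical sketch of a recipe opting in to BigQuery upload. The field names come from the diff above and the `generate_and_run_workloads` parameters from its hunk header; the project, dataset, `num_steps`, and `priority` values are placeholders, and the remaining `UserConfig` fields are assumed to keep their defaults:

```python
from benchmarks.recipes import runner_utils
from benchmarks.recipes.user_configs import UserConfig

config = UserConfig(
    bq_enable=True,
    bq_db_project="my-gcp-project",  # placeholder
    bq_db_dataset="benchmark_runs",  # placeholder
)

# Passes bq_enable/bq_db_project/bq_db_dataset through to the workload
# config, per the runner_utils hunk above.
runner_utils.generate_and_run_workloads(
    config, config.num_slices_list, num_steps=20, priority="medium"
)
```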

benchmarks/upload_metrics_to_bq.py

Lines changed: 6 additions & 0 deletions
@@ -186,6 +186,12 @@ def add_parser_arguments(parser: argparse.ArgumentParser):
       default=True,
       help="Whether to use the testing project or production project",
   )
+  parser.add_argument(
+      "--hardware_num_slices",
+      type=int,
+      required=False,
+      help="hardware slice number",
+  )
 
 
 def download_metrics_file_locally(metrics_gcs_file: str, local_file: str) -> int:
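
Since the new flag is optional, existing invocations keep working. A minimal standalone check of the parser behavior, mirroring just the added argument rather than the script's full parser:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--hardware_num_slices",
    type=int,
    required=False,
    help="hardware slice number",
)

args = parser.parse_args(["--hardware_num_slices", "2"])
print(args.hardware_num_slices)  # 2 (an int); None when the flag is omitted
```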

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -1,10 +1,11 @@
 absl-py
 aqtp
 array-record
+benchmark_db_writer@git+https://github.com/CIeNET-International/aotc.git@c0bef62eac87c99152ff2e9fd48da1f7d9f3cc04#subdirectory=src/aotc/benchmark_db_writer
 cloud-accelerator-diagnostics
 cloud-tpu-diagnostics
 datasets
-flax
+flax==0.11.1
 gcsfs
 google-api-python-client
 google-cloud-aiplatform
