From 7ad86e84502930bd57c0c1f2486cba1d651368f7 Mon Sep 17 00:00:00 2001
From: depksingh <depksingh@google.com>
Date: Thu, 19 Jun 2025 09:10:05 +0000
Subject: [PATCH 1/3] Fix DLRM_DCNv2 benchmark scripts; update README and protobuf pin

---
 .../benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh        |  8 +++-----
 .../benchmarks/DLRM_DCNv2/train_and_checkpoint.sh      |  7 +++----
 recml/inference/benchmarks/README.md                   | 10 +++++++++-
 requirements.txt                                       |  2 +-
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
index 02422da..514bed9 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
@@ -6,14 +6,14 @@ export XLA_FLAGS=
 
 export TPU_NAME=<TPU_NAME>
 export LEARNING_RATE=0.0034
-export BATCH_SIZE=135168
+export BATCH_SIZE=4224
 export EMBEDDING_SIZE=128
 export MODEL_DIR=/tmp/
 export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
 export NUM_STEPS=28000
 export CHECKPOINT_INTERVAL=1500
 export EVAL_INTERVAL=1500
-export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
+export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
 export EVAL_STEPS=660
 export MODE=eval
 export EMBEDDING_THRESHOLD=21000
@@ -21,9 +21,7 @@ export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
 
-
-python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
-
+python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \
diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
index e32639c..97cb764 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
@@ -6,22 +6,21 @@ export XLA_FLAGS=
 
 export TPU_NAME=<TPU_NAME>
 export LEARNING_RATE=0.0034
-export BATCH_SIZE=135168
+export BATCH_SIZE=4224
 export EMBEDDING_SIZE=128
 export MODEL_DIR=/tmp/
 export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
 export NUM_STEPS=28000
 export CHECKPOINT_INTERVAL=1500
 export EVAL_INTERVAL=1500
-export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
+export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
 export EVAL_STEPS=660
 export MODE=train
 export EMBEDDING_THRESHOLD=21000
 export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
-python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
-
+python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \
diff --git a/recml/inference/benchmarks/README.md b/recml/inference/benchmarks/README.md
index 4c05b5f..5edb833 100644
--- a/recml/inference/benchmarks/README.md
+++ b/recml/inference/benchmarks/README.md
@@ -54,10 +54,18 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z
 gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="pip install -U tensorflow  dm-tree flax google-metrax"
 ```
 
+#### Make script executable
+
+Note: Please update the MODEL_NAME & TASK_NAME before running the below command
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="chmod +x RecML/recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
+```
+
 #### Run workload
 
 Note: Please update the MODEL_NAME & TASK_NAME before running the below command
 
 ```
-gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="TPU_NAME=${TPU_NAME} ./inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="TPU_NAME=${TPU_NAME} RecML/recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
 ```
diff --git a/requirements.txt b/requirements.txt
index 580d6c9..a82b751 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -63,7 +63,7 @@ platformdirs==4.3.7
 pluggy==1.5.0
 pre-commit==4.2.0
 promise==2.3
-protobuf==5.29.4
+protobuf==4.21.12
 psutil==7.0.0
 pyarrow==19.0.1
 pygments==2.19.1

From cafdd510a5b7b9bbc8a2d0de442345bb994de07a Mon Sep 17 00:00:00 2001
From: depksingh <depksingh@google.com>
Date: Fri, 20 Jun 2025 05:52:07 +0000
Subject: [PATCH 2/3] Remove RecML/ prefix from script paths; merge chmod and run steps in README

---
 .../benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh      |  2 +-
 .../benchmarks/DLRM_DCNv2/train_and_checkpoint.sh    |  2 +-
 recml/inference/benchmarks/README.md                 | 12 ++----------
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
index 514bed9..e35a602 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
@@ -21,7 +21,7 @@ export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
 
-python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \
diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
index 97cb764..287b8dc 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
@@ -20,7 +20,7 @@ export EMBEDDING_THRESHOLD=21000
 export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
-python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \
diff --git a/recml/inference/benchmarks/README.md b/recml/inference/benchmarks/README.md
index 5edb833..98306b2 100644
--- a/recml/inference/benchmarks/README.md
+++ b/recml/inference/benchmarks/README.md
@@ -54,18 +54,10 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z
 gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="pip install -U tensorflow  dm-tree flax google-metrax"
 ```
 
-#### Make script executable
+#### Make script executable and run workload
 
 Note: Please update the MODEL_NAME & TASK_NAME before running the below command
 
 ```
-gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="chmod +x RecML/recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
-```
-
-#### Run workload
-
-Note: Please update the MODEL_NAME & TASK_NAME before running the below command
-
-```
-gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="TPU_NAME=${TPU_NAME} RecML/recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="cd RecML && chmod +x ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME> && TPU_NAME=${TPU_NAME} ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
 ```

From 772b27b5073f05675639992e94434188e7f51145 Mon Sep 17 00:00:00 2001
From: depksingh <depksingh@google.com>
Date: Fri, 20 Jun 2025 05:56:06 +0000
Subject: [PATCH 3/3] Drop redundant ./ prefix from dlrm_main.py path in DLRM_DCNv2 scripts

---
 recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh   | 2 +-
 recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
index e35a602..00e68df 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh
@@ -21,7 +21,7 @@ export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
 
-python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \
diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
index 287b8dc..c3b599f 100644
--- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
+++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh
@@ -20,7 +20,7 @@ export EMBEDDING_THRESHOLD=21000
 export LOGGING_INTERVAL=1500
 export RESTORE_CHECKPOINT=true
 
-python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
 --learning_rate=${LEARNING_RATE} \
 --batch_size=${BATCH_SIZE} \
 --embedding_size=${EMBEDDING_SIZE} \