From 7ad86e84502930bd57c0c1f2486cba1d651368f7 Mon Sep 17 00:00:00 2001 From: depksingh Date: Thu, 19 Jun 2025 09:10:05 +0000 Subject: [PATCH 1/3] Updating Documentation and config --- .../benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh | 8 +++----- .../benchmarks/DLRM_DCNv2/train_and_checkpoint.sh | 7 +++---- recml/inference/benchmarks/README.md | 10 +++++++++- requirements.txt | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh index 02422da..514bed9 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh @@ -6,14 +6,14 @@ export XLA_FLAGS= export TPU_NAME= export LEARNING_RATE=0.0034 -export BATCH_SIZE=135168 +export BATCH_SIZE=4224 export EMBEDDING_SIZE=128 export MODEL_DIR=/tmp/ export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-* export NUM_STEPS=28000 export CHECKPOINT_INTERVAL=1500 export EVAL_INTERVAL=1500 -export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-* +export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-* export EVAL_STEPS=660 export MODE=eval export EMBEDDING_THRESHOLD=21000 @@ -21,9 +21,7 @@ export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true - -python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ - +python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \ diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh index e32639c..97cb764 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh @@ -6,22 +6,21 @@ export XLA_FLAGS= export TPU_NAME= export LEARNING_RATE=0.0034 -export BATCH_SIZE=135168 +export BATCH_SIZE=4224 export EMBEDDING_SIZE=128 export MODEL_DIR=/tmp/ export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-* export NUM_STEPS=28000 export CHECKPOINT_INTERVAL=1500 export EVAL_INTERVAL=1500 -export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-* +export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-* export EVAL_STEPS=660 export MODE=train export EMBEDDING_THRESHOLD=21000 export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true -python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ - +python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \ diff --git a/recml/inference/benchmarks/README.md b/recml/inference/benchmarks/README.md index 4c05b5f..5edb833 100644 --- a/recml/inference/benchmarks/README.md +++ b/recml/inference/benchmarks/README.md @@ -54,10 +54,18 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="pip install -U tensorflow dm-tree flax google-metrax" ``` +#### Make script executable + +Note: Please update the MODEL_NAME & TASK_NAME before running the below command + +``` +gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="chmod +x RecML/recml/inference/benchmarks//" +``` + #### Run workload Note: Please update the MODEL_NAME & TASK_NAME before running the below command ``` -gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="TPU_NAME=${TPU_NAME} ./inference/benchmarks//" +gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="TPU_NAME=${TPU_NAME} RecML/recml/inference/benchmarks//" ``` diff --git a/requirements.txt b/requirements.txt index 580d6c9..a82b751 100644 --- a/requirements.txt +++ b/requirements.txt @@ -63,7 +63,7 @@ platformdirs==4.3.7 pluggy==1.5.0 pre-commit==4.2.0 promise==2.3 -protobuf==5.29.4 +protobuf==4.21.12 psutil==7.0.0 pyarrow==19.0.1 pygments==2.19.1 From cafdd510a5b7b9bbc8a2d0de442345bb994de07a Mon Sep 17 00:00:00 2001 From: depksingh Date: Fri, 20 Jun 2025 05:52:07 +0000 Subject: [PATCH 2/3] Removed repo name from internal scripts --- .../benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh | 2 +- .../benchmarks/DLRM_DCNv2/train_and_checkpoint.sh | 2 +- recml/inference/benchmarks/README.md | 12 ++---------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh index 514bed9..e35a602 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh @@ -21,7 +21,7 @@ export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true -python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ +python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \ diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh index 97cb764..287b8dc 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh @@ -20,7 +20,7 @@ export EMBEDDING_THRESHOLD=21000 export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true -python RecML/recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ +python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \ diff --git a/recml/inference/benchmarks/README.md b/recml/inference/benchmarks/README.md index 5edb833..98306b2 100644 --- a/recml/inference/benchmarks/README.md +++ b/recml/inference/benchmarks/README.md @@ -54,18 +54,10 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="pip install -U tensorflow dm-tree flax google-metrax" ``` -#### Make script executable +#### Make script executable and Run workload Note: Please update the MODEL_NAME & TASK_NAME before running the below command ``` -gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="chmod +x RecML/recml/inference/benchmarks//" -``` - -#### Run workload - -Note: Please update the MODEL_NAME & TASK_NAME before running the below command - -``` -gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="TPU_NAME=${TPU_NAME} RecML/recml/inference/benchmarks//" +gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="cd RecML && chmod +x ./recml/inference/benchmarks// && TPU_NAME=${TPU_NAME} ./recml/inference/benchmarks//" ``` From 772b27b5073f05675639992e94434188e7f51145 Mon Sep 17 00:00:00 2001 From: depksingh Date: Fri, 20 Jun 2025 05:56:06 +0000 Subject: [PATCH 3/3] Updated current dir ref --- recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh | 2 +- recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh index e35a602..00e68df 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh @@ -21,7 +21,7 @@ export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true -python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ +python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \ diff --git a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh index 287b8dc..c3b599f 100644 --- a/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh +++ b/recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh @@ -20,7 +20,7 @@ export EMBEDDING_THRESHOLD=21000 export LOGGING_INTERVAL=1500 export RESTORE_CHECKPOINT=true -python ./recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ +python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \ --learning_rate=${LEARNING_RATE} \ --batch_size=${BATCH_SIZE} \ --embedding_size=${EMBEDDING_SIZE} \