5 files changed: +25 -34 lines changed
Original file line number Diff line number Diff line change @@ -9,17 +9,11 @@ do_once() {
99}
1010
1111test_body () {
12- start=` date +%s`
13- (sleep 10 && pkill -HUP ls && true) &
14- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
15- wait
16- end=` date +%s`
17- runtime=$(( end- start))
18- echo " Data access time: $runtime seconds"
19- if [ $runtime -gt 3 ]; then
20- echo " Data access time is greater than 3 seconds, skipping the test"
12+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
13+ echo " Not available locally, skipping the test"
2114 return 0
2215 fi
16+
2317 python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
2418 --cpu_size 2 --gpu_size 2 --fp16 --nhwc
2519 python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
Original file line number Diff line number Diff line change @@ -13,17 +13,11 @@ test_body() {
1313 python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
1414 --workers 3 --prefetch 2 --epochs 3
1515 done
16- start=` date +%s`
17- (sleep 10 && pkill -HUP ls && true) &
18- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
19- wait
20- end=` date +%s`
21- runtime=$(( end- start))
22- echo " Data access time: $runtime seconds"
23- if [ $runtime -gt 3 ]; then
24- echo " Data access time is greater than 3 seconds, skipping the test"
16+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
17+ echo " Not available locally, skipping the test"
2518 return 0
2619 fi
20+
2721 torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
2822 torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
2923 torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
Original file line number Diff line number Diff line change @@ -8,18 +8,11 @@ do_once() {
88}
99
1010test_body () {
11- start=` date +%s`
12- (sleep 10 && pkill -HUP ls && true) &
13- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
14- wait
15- end=` date +%s`
16- runtime=$(( end- start))
17- echo " Data access time: $runtime seconds"
18- if [ $runtime -gt 3 ]; then
19- echo " Data access time is greater than 3 seconds, skipping the test"
11+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
12+ echo " Not available locally, skipping the test"
2013 return 0
2114 fi
22- # test code
15+
2316 python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type " legacy"
2417 python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type " experimental"
2518 python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 16 --workers 3 --prefetch 11 --decoder_type " legacy"
Original file line number Diff line number Diff line change @@ -13,10 +13,10 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50
1313NUM_GPUS=$( nvidia-smi -L | wc -l)
1414
1515if [ ! -d " val" ]; then
16- ln -sf /data_raid/imagenet/val-jpeg/ val
16+ ln -sf /data/imagenet/val-jpeg/ val
1717fi
1818if [ ! -d " train" ]; then
19- ln -sf /data_raid/imagenet/train-jpeg/ train
19+ ln -sf /data/imagenet/train-jpeg/ train
2020fi
2121
2222LOG=dali.log
@@ -26,7 +26,7 @@ SECONDS=0
2626# turn off SHARP to avoid NCCL errors
2727export NCCL_NVLS_ENABLE=0
2828
29- torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee $LOG
29+ torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 2 ./ 2>&1 | tee $LOG
3030
3131RET=${PIPESTATUS[0]}
3232echo " Training ran in $SECONDS seconds"
@@ -57,7 +57,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
5757printf " TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
5858printf " Average perf: %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
5959
60- if [[ " $TOP1_RESULT " == " OK" && " $TOP5_RESULT " == " OK" && " $PERF_RESULT " == " OK" ]]; then
60+ # check perf only if data is locally available
61+ if [ $(stat /data/imagenet/val-jpeg --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
62+ CLEAN_AND_EXIT 4
63+ fi
64+
65+ if [[ " $TOP1_RESULT " == " OK" && " $TOP5_RESULT " == " OK" ]]; then
6166 CLEAN_AND_EXIT 0
6267fi
6368
Original file line number Diff line number Diff line change @@ -6,7 +6,7 @@ mkdir -p idx-files/
66
77NUM_GPUS=$( nvidia-smi -L | wc -l)
88
9- DATA_SET_DIR=/data_raid/imagenet/train-val-tfrecord
9+ DATA_SET_DIR=/data/imagenet/train-val-tfrecord
1010for file in $(ls $DATA_SET_DIR/*-of-*);
1111do
1212 file=$( basename ${file} )
@@ -69,7 +69,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
6969printf " TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
7070printf " mean speed %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
7171
72- if [[ " $TOP1_RESULT " == " OK" && " $TOP5_RESULT " == " OK" && " $PERF_RESULT " == " OK" ]]; then
72+ # check perf only if data is locally available
73+ if [ $(stat /data/imagenet/train-val-tfrecord --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
74+ CLEAN_AND_EXIT 4
75+ fi
76+
77+ if [[ " $TOP1_RESULT " == " OK" && " $TOP5_RESULT " == " OK" ]]; then
7378 CLEAN_AND_EXIT 0
7479fi
7580
You can’t perform that action at this time.
0 commit comments