Skip to content

Commit ff51a3c

Browse files
authored
Benchmarks: Add Feature - Add GPU-Burn as microbenchmark (#324)
**Description** Modifications adding GPU-Burn to SuperBench. - added third party submodule - modified Makefile to make gpu-burn binary - added/modified microbenchmarks to add gpu-burn python scripts - modified default and azure_ndv4 configs to add gpu-burn
1 parent 84359fd commit ff51a3c

File tree

11 files changed

+345
-2
lines changed

11 files changed

+345
-2
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,6 @@
1818
[submodule "third_party/GPCNET"]
1919
path = third_party/GPCNET
2020
url = https://github.com/netbench/GPCNET.git
21+
[submodule "third_party/gpu-burn"]
22+
path = third_party/gpu-burn
23+
url = https://github.com/wilicc/gpu-burn.git

docs/user-tutorial/benchmarks/micro-benchmarks.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,20 @@ The supported percentiles are 50, 90, 95, 99, and 99.9.
142142
| ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. |
143143
| ort-inference/{precision}_{model}_time_{percentile} | time (ms) | The {percentile}th percentile latency to execute one batch of inference. |
144144

145+
### `gpu-burn`
146+
147+
#### Introduction
148+
149+
Multi-GPU CUDA stress test for GPU compute and memory utilization, performed by [gpu-burn](https://github.com/wilicc/gpu-burn).
150+
Supports running with double-precision data types and with tensor cores enabled.
151+
152+
#### Metrics
153+
154+
| Name | Unit | Description |
155+
|--------------------------|------------|-------------------------------------------------------------------------------------|
156+
| gpu-burn/time            | time (s)   | The configured duration (`--time`) of the gpu-burn test.                            |
157+
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
158+
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
145159

146160
## Communication Benchmarks
147161

examples/benchmarks/gpu_burn_test.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
"""Micro benchmark example for GPU-Burn.
5+
6+
Commands to run:
7+
python3 examples/benchmarks/gpu_burn_test.py
8+
"""
9+
10+
from superbench.benchmarks import BenchmarkRegistry, Platform
11+
from superbench.common.utils import logger
12+
13+
if __name__ == '__main__':
    # Describe the run: gpu-burn on CUDA, doubles + tensor cores, for 10 seconds.
    params = '--doubles --tensor_core --time 10'
    context = BenchmarkRegistry.create_benchmark_context('gpu-burn', platform=Platform.CUDA, parameters=params)

    # Launch the benchmark and log a one-line summary when a benchmark object is returned.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)

superbench/benchmarks/micro_benchmarks/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
1818
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
1919
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
20+
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
2021
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
2122
from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
2223
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
@@ -39,6 +40,7 @@
3940
'GPCNetBenchmark',
4041
'GemmFlopsBenchmark',
4142
'GpuCopyBwBenchmark',
43+
'GpuBurnBenchmark',
4244
'IBBenchmark',
4345
'IBLoopbackBenchmark',
4446
'KernelLaunch',
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
"""Module of the GPU-Burn Test."""
5+
6+
import os
7+
8+
from superbench.common.utils import logger
9+
from superbench.benchmarks import BenchmarkRegistry, Platform
10+
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
11+
12+
13+
class GpuBurnBenchmark(MicroBenchmarkWithInvoke):
    """The GPU Burn Test benchmark class.

    Builds a shell command that runs the ``gpu_burn`` binary and turns its output into
    one pass/fail metric per GPU plus an ``abort`` flag.
    """

    # Substrings of an output line that mean the run aborted before per-GPU results were reported.
    _FAILURE_MARKERS = (
        'No clients are alive!',
        "Couldn't init a GPU",
        'Failure during compute',
        'Low mem for result',
    )

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_burn'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--doubles',
            action='store_true',
            default=False,
            help='Use doubles for the data type used in GPU-Burn',
        )
        self._parser.add_argument(
            '--tensor_core',
            action='store_true',
            default=False,
            help='Use tensor cores in GPU-Burn',
        )
        self._parser.add_argument(
            '--time',
            type=int,
            default=10,
            help='Length of time to run GPU-Burn for(in seconds)',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends a single shell command to ``self._commands`` that copies ``compare.ptx`` into the
        working directory, runs ``gpu_burn`` with the requested flags and duration, and removes the copy.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)
        if self._args.doubles:
            command += ' -d'
        if self._args.tensor_core:
            command += ' -tc'
        command += ' {} '.format(self._args.time)

        # copy compare.ptx which needs to be in the working directory
        compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./'
        # remove compare.ptx from working directory
        compare_rm = 'rm ' + 'compare.ptx'

        # NOTE: the steps are chained with '&&', so the cleanup only runs when gpu_burn exits successfully.
        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        The output is scanned line by line until either a known failure message or the first line
        containing 'done' is found. Per-GPU result lines are read starting two lines after that
        'done' line; each yields a '<gpu name>_pass' metric (1 if the line contains 'OK', else 0).
        The 'abort' metric is 1 whenever no per-GPU results could be extracted, 0 otherwise.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        lines = raw_output.splitlines()
        try:
            done_idx = None
            for idx, line in enumerate(lines):
                if any(marker in line for marker in self._FAILURE_MARKERS):
                    # gpu-burn reported a failure: keep the offending line as raw data and flag the abort.
                    self._result.add_raw_data('GPU Burn Failure: ', line)
                    self._result.add_result('abort', 1)
                    return False
                if 'done' in line:
                    done_idx = idx
                    break

            if done_idx is None:
                # Raise a real exception (a bare str cannot be raised) so that the handler
                # below logs this message rather than an unrelated TypeError.
                raise ValueError('The result format invalid')

            # Skip the 'done' line and the line right after it, then keep only the per-GPU lines.
            gpu_res = [
                line.strip('\n').strip('\t') for line in lines[done_idx + 2:]
                if 'Tested' not in line and 'GPU' in line
            ]

            self._result.add_result('time', self._args.time)
            for res in gpu_res:
                # 'GPU 0: OK' -> metric 'gpu_0_pass'
                metric = res.split(':')[0].replace(' ', '_').lower() + '_pass'
                self._result.add_result(metric, 1 if 'OK' in res else 0)
                self._result.add_raw_data('GPU-Burn_result', res)
        except Exception as e:
            # Only ordinary errors are treated as an invalid result; KeyboardInterrupt and
            # SystemExit are deliberately left to propagate.
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            self._result.add_result('abort', 1)
            return False

        self._result.add_result('abort', 0)
        return True
141+
142+
143+
BenchmarkRegistry.register_benchmark('gpu-burn', GpuBurnBenchmark, platform=Platform.CUDA)

superbench/config/azure_ndv4.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ superbench:
3939
<<: *default_local_mode
4040
gemm-flops:
4141
<<: *default_local_mode
42+
gpu-burn:
43+
enable: false
44+
modes:
45+
- name: local
46+
proc_num: 1
47+
parallel: no
48+
parameters:
49+
time: 900
50+
doubles: true
51+
tensor_core: true
4252
nccl-bw:default:
4353
enable: true
4454
modes:

superbench/config/default.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ superbench:
3333
model_action:
3434
- train
3535
benchmarks:
36+
gpu-burn:
37+
enable: true
38+
modes:
39+
- name: local
40+
proc_num: 1
41+
parallel: no
42+
parameters:
43+
time: 300
44+
doubles: true
45+
tensor_core: true
3646
nccl-bw:default:
3747
enable: true
3848
modes:
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""Tests for gpu-burn benchmark."""
5+
6+
import unittest
7+
8+
from tests.helper import decorator
9+
from tests.helper.testcase import BenchmarkTestCase
10+
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
11+
12+
13+
class GpuBurnBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Unit tests for the gpu-burn micro-benchmark."""
    @classmethod
    def setUpClass(cls):
        """Prepare class-level fixtures: mock envs and a mock 'bin/gpu_burn' file."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/gpu_burn'])

    @decorator.load_data('tests/data/gpu_burn.log')
    def test_gpu_burn(self, results):
        """Check command generation and result parsing of the gpu-burn benchmark."""
        name = 'gpu-burn'
        benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CUDA)
        assert benchmark_class

        duration = 10
        benchmark = benchmark_class(name, parameters='--doubles --tensor_core --time ' + str(duration))

        # Basic information.
        assert benchmark
        assert benchmark._preprocess() is True
        assert benchmark.return_code == ReturnCode.SUCCESS
        assert benchmark.name == name
        assert benchmark.type == BenchmarkType.MICRO

        # Parameters must be parsed into the benchmark arguments.
        args = benchmark._args
        assert args.time == duration
        assert args.doubles
        assert args.tensor_core

        # Exactly one command: copy compare.ptx, run gpu_burn with flags, remove compare.ptx.
        copy_cmd = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./'
        remove_cmd = 'rm ' + 'compare.ptx'
        assert len(benchmark._commands) == 1
        command = benchmark._commands[0]
        assert command.startswith(copy_cmd)
        assert '-d' in command
        assert '-tc' in command
        assert str(duration) in command
        assert remove_cmd in command

        # Results parsed from the recorded log.
        assert benchmark._process_raw_result(0, results)
        assert benchmark.result['return_code'][0] == 0
        assert benchmark.result['time'][0] == duration
        for gpu_id in range(8):
            assert benchmark.result['gpu_{}_pass'.format(gpu_id)][0] == 1
        assert benchmark.result['abort'][0] == 0

tests/data/gpu_burn.log

Lines changed: 64 additions & 0 deletions
Large diffs are not rendered by default.

third_party/Makefile

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ RCCL_HOME ?= /opt/rocm/rccl
1010
ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
1111
HPCX_HOME ?= /opt/hpcx
1212

13-
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet
13+
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn
1414

1515
# Build all targets.
1616
all: cuda rocm
17-
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet
17+
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
1818
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
1919
common: fio
2020

@@ -98,3 +98,11 @@ gpcnet: sb_micro_path
9898
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
9999
cp -v ./GPCNET/network_test $(SB_MICRO_PATH)/bin/
100100
cp -v ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/
101+
102+
# Build GPU-Burn from its main branch (the only branch that exists).
103+
cuda_gpuburn: sb_micro_path
104+
ifneq (,$(wildcard gpu-burn/Makefile))
105+
cd ./gpu-burn && make
106+
cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
107+
cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
108+
endif

third_party/gpu-burn

Submodule gpu-burn added at cab8221

0 commit comments

Comments
 (0)