Skip to content

Commit 666e3a9

Browse files
author
Yuting Jiang
authored
Benchmarks: Add Benchmark - Add memory bus bandwidth performance microbenchmark for amd (#153)
**Description** Add memory bus bandwidth performance microbenchmark for amd. **Major Revision** - Add memory bus bandwidth performance microbenchmark for amd. - Add related example and test file.
1 parent 2880f71 commit 666e3a9

File tree

4 files changed

+289
-1
lines changed

4 files changed

+289
-1
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
"""Micro benchmark example for device memory bandwidth performance.
5+
6+
Commands to run:
7+
python3 examples/benchmarks/rocm_memory_bw_performance.py
8+
"""
9+
10+
from superbench.benchmarks import BenchmarkRegistry, Platform
11+
from superbench.common.utils import logger
12+
13+
if __name__ == '__main__':
    # Build the context for the 'mem-bw' benchmark on the ROCm platform.
    context = BenchmarkRegistry.create_benchmark_context('mem-bw', platform=Platform.ROCM)

    # Launch the benchmark; report only when a truthy benchmark object comes back.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)

superbench/benchmarks/micro_benchmarks/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
1616
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
1717
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
18+
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
1819

1920
__all__ = [
2021
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
2122
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'MemBwBenchmark', 'CudaMemBwBenchmark', 'DiskBenchmark',
22-
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark'
23+
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
2324
]
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
"""Module of the ROCm memory performance benchmarks."""
5+
6+
import os
7+
8+
from superbench.common.utils import logger
9+
from superbench.benchmarks import BenchmarkRegistry, Platform
10+
from superbench.benchmarks.micro_benchmarks import MemBwBenchmark
11+
12+
13+
class RocmMemBwBenchmark(MemBwBenchmark):
    """The ROCm memory performance benchmark class.

    Builds one ``hipBusBandwidth`` command line per requested memory type and turns
    the bandwidth rows of the tool's output table into one result metric per size.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'hipBusBandwidth'
        self._mem_types = ['htod', 'dtoh']
        # Substring that identifies the bandwidth rows of each memory type in the raw output.
        self._parse_logline_map = {'htod': 'H2D_Bandwidth', 'dtoh': 'D2H_Bandwidth'}

    def add_parser_arguments(self):
        """Add the specified arguments.

        No ROCm-specific argument is added here; only the parent class arguments are registered.
        """
        super().add_parser_arguments()

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Generates one command per entry of ``self._args.mem_type`` and appends it to ``self._commands``.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Check the arguments and generate the commands
        for mem_type in self._args.mem_type:
            command = os.path.join(self._args.bin_dir, self._bin_name)
            # 'htod' -> '--h2d', 'dtoh' -> '--d2h'.
            command += ' --' + mem_type.replace('to', '2')
            if self._args.memory == 'unpinned':
                command += ' --unpinned'
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        The header row (the one containing 'mean') fixes the column positions of the mean value
        and of the transfer size ('atts'). Every later row containing the bandwidth marker of the
        current memory type yields one metric named '<mem_type>_<size>'.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        mem_type = self._args.mem_type[cmd_idx]
        self._result.add_raw_data('raw_output_' + mem_type, raw_output)

        value_index = -1
        size_index = -1
        # Explicit flag instead of a numeric sentinel, so no parsed value can be mistaken for "nothing parsed".
        parsed_any = False
        content = raw_output.splitlines()
        try:
            parse_logline = self._parse_logline_map[mem_type]
            for line in content:
                if parse_logline in line and value_index != -1:
                    fields = line.split()
                    mem_bw = float(fields[value_index])
                    metric = mem_type + '_' + fields[size_index]
                    self._result.add_result(metric, mem_bw)
                    parsed_any = True
                elif 'mean' in line:
                    fields = line.split()
                    value_index = fields.index('mean')
                    size_index = fields.index('atts')
        except Exception:
            # Any parsing failure marks the output as invalid. KeyboardInterrupt and SystemExit
            # are deliberately not caught, so they keep propagating to the caller.
            parsed_any = False

        if not parsed_any:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        return True
94+
95+
96+
# Register RocmMemBwBenchmark under the 'mem-bw' name for the ROCm platform.
BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.ROCM)
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""Tests for mem-bw benchmark."""
5+
6+
import numbers
7+
from pathlib import Path
8+
import os
9+
import unittest
10+
11+
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
12+
13+
14+
class RocmMemBwTest(unittest.TestCase):
    """Test class for rocm mem-bw benchmark."""
    def setUp(self):
        """Method called to prepare the test fixture."""
        # Create fake binary file just for testing.
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
        Path(os.getenv('SB_MICRO_PATH'), 'bin').mkdir(parents=True, exist_ok=True)
        self.__binary_file = Path(binary_path, 'hipBusBandwidth')
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        self.__binary_file.unlink()

    def test_rocm_memory_bw_performance(self):
        """Test rocm mem-bw benchmark.

        Covers benchmark selection, command generation and parsing of recorded
        hipBusBandwidth output for both the htod and the dtoh direction.
        """
        benchmark_name = 'mem-bw'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name)

        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark)
        assert (benchmark.name == 'mem-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check command list
        expected_command = ['hipBusBandwidth --h2d', 'hipBusBandwidth --d2h']
        for i, expected in enumerate(expected_command):
            # Drop the directory prefix so the check does not depend on the binary location.
            command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
            assert (command == expected)

        # Check results and metrics.
        raw_output = {}
        raw_output[0] = """
Device:Device 738c Mem=32.0GB #CUs=120 Freq=1502Mhz MallocMode=pinned
test atts units median mean stddev min max
H2D_Bandwidth_pinned +064By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
H2D_Bandwidth_pinned +256By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
H2D_Bandwidth_pinned +512By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
H2D_Bandwidth_pinned 1kB GB/sec 0.0414 0.0411 0.0017 0.0189 0.0434
H2D_Bandwidth_pinned 2kB GB/sec 0.0828 0.0824 0.0018 0.0683 0.0862
H2D_Bandwidth_pinned 4kB GB/sec 0.1656 0.1652 0.0032 0.1374 0.1724
H2D_Bandwidth_pinned 8kB GB/sec 0.3268 0.3251 0.0117 0.1880 0.3425
H2D_Bandwidth_pinned 16kB GB/sec 0.6410 0.6365 0.0259 0.3597 0.6757
H2D_Bandwidth_pinned 32kB GB/sec 1.2422 1.2432 0.0278 0.9346 1.2987
H2D_Bandwidth_pinned 64kB GB/sec 2.3968 2.4161 0.1486 0.7242 2.6042
H2D_Bandwidth_pinned 128kB GB/sec 4.6786 4.6339 0.1310 4.1143 4.8162
H2D_Bandwidth_pinned 256kB GB/sec 7.8349 7.8369 0.1150 6.9093 8.0270
H2D_Bandwidth_pinned 512kB GB/sec 11.9963 11.9828 0.1287 11.2158 12.2201
H2D_Bandwidth_pinned 1024kB GB/sec 16.3342 16.3315 0.0956 16.0147 16.5823
H2D_Bandwidth_pinned 2048kB GB/sec 19.9790 19.9770 0.0853 19.7681 20.1635
H2D_Bandwidth_pinned 4096kB GB/sec 22.2706 22.2642 0.0552 22.0644 22.3847
H2D_Bandwidth_pinned 8192kB GB/sec 22.8232 22.7881 0.1669 21.3196 22.8930
H2D_Bandwidth_pinned 16384kB GB/sec 24.1521 24.1411 0.0429 24.0165 24.2162
H2D_Bandwidth_pinned 32768kB GB/sec 24.8695 24.7086 0.7491 20.6288 24.9035
H2D_Bandwidth_pinned 65536kB GB/sec 24.4840 24.0101 2.5769 6.1754 24.5292
H2D_Bandwidth_pinned 131072kB GB/sec 25.0487 24.9593 0.2601 24.1286 25.0711
H2D_Bandwidth_pinned 262144kB GB/sec 25.3280 25.2351 0.1788 24.8746 25.3498
H2D_Bandwidth_pinned 524288kB GB/sec 24.7523 24.6708 0.1586 24.3154 24.7880
H2D_Timepinned +064By ms 0.0245 0.0253 0.0240 0.0232 0.7821
H2D_Timepinned +256By ms 0.0243 0.0244 0.0013 0.0232 0.0546
H2D_Timepinned +512By ms 0.0243 0.0244 0.0014 0.0230 0.0566
H2D_Timepinned 1kB ms 0.0242 0.0244 0.0016 0.0230 0.0530
H2D_Timepinned 2kB ms 0.0242 0.0243 0.0005 0.0232 0.0293
H2D_Timepinned 4kB ms 0.0242 0.0242 0.0005 0.0232 0.0291
H2D_Timepinned 8kB ms 0.0245 0.0247 0.0013 0.0234 0.0426
H2D_Timepinned 16kB ms 0.0250 0.0252 0.0015 0.0237 0.0445
H2D_Timepinned 32kB ms 0.0258 0.0258 0.0006 0.0246 0.0342
H2D_Timepinned 64kB ms 0.0271 0.0272 0.0045 0.0250 0.0898
H2D_Timepinned 128kB ms 0.0280 0.0283 0.0008 0.0272 0.0318
H2D_Timepinned 256kB ms 0.0334 0.0334 0.0005 0.0326 0.0379
H2D_Timepinned 512kB ms 0.0437 0.0437 0.0005 0.0429 0.0467
H2D_Timepinned 1024kB ms 0.0642 0.0642 0.0004 0.0632 0.0654
H2D_Timepinned 2048kB ms 0.1050 0.1050 0.0004 0.1040 0.1061
H2D_Timepinned 4096kB ms 0.1883 0.1884 0.0005 0.1874 0.1901
H2D_Timepinned 8192kB ms 0.3675 0.3681 0.0028 0.3664 0.3934
H2D_Timepinned 16384kB ms 0.6946 0.6950 0.0012 0.6928 0.6986
H2D_Timepinned 32768kB ms 1.3492 1.3595 0.0482 1.3474 1.6266
H2D_Timepinned 65536kB ms 2.7409 2.9163 1.1368 2.7358 10.8670
H2D_Timepinned 131072kB ms 5.3582 5.3780 0.0576 5.3534 5.5626
H2D_Timepinned 262144kB ms 10.5983 10.6379 0.0761 10.5892 10.7915
H2D_Timepinned 524288kB ms 21.6897 21.7622 0.1411 21.6585 22.0794

Note: results marked with (*) had missing values such as
might occur with a mixture of architectural capabilities.
"""
        raw_output[1] = """
Device:Device 738c Mem=32.0GB #CUs=120 Freq=1502Mhz MallocMode=pinned
test atts units median mean stddev min max
D2H_Bandwidth_pinned +064By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
D2H_Bandwidth_pinned +256By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
D2H_Bandwidth_pinned +512By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
D2H_Bandwidth_pinned 1kB GB/sec 0.0428 0.0426 0.0019 0.0114 0.0446
D2H_Bandwidth_pinned 2kB GB/sec 0.0850 0.0844 0.0034 0.0415 0.0893
D2H_Bandwidth_pinned 4kB GB/sec 0.1701 0.1687 0.0084 0.0504 0.1773
D2H_Bandwidth_pinned 8kB GB/sec 0.3378 0.3348 0.0168 0.1085 0.3546
D2H_Bandwidth_pinned 16kB GB/sec 0.6667 0.6606 0.0218 0.5618 0.6897
D2H_Bandwidth_pinned 32kB GB/sec 1.3072 1.2954 0.0663 0.5682 1.3605
D2H_Bandwidth_pinned 64kB GB/sec 2.5550 2.5339 0.0955 2.1382 2.6904
D2H_Bandwidth_pinned 128kB GB/sec 4.8162 4.7807 0.2331 2.0940 4.9621
D2H_Bandwidth_pinned 256kB GB/sec 8.2286 8.2192 0.1671 7.2456 8.5286
D2H_Bandwidth_pinned 512kB GB/sec 12.7930 12.7062 0.4407 7.1196 13.0478
D2H_Bandwidth_pinned 1024kB GB/sec 17.5603 17.4938 0.3921 12.7184 17.7989
D2H_Bandwidth_pinned 2048kB GB/sec 21.6275 21.5591 0.2233 20.6073 21.8076
D2H_Bandwidth_pinned 4096kB GB/sec 24.2708 24.2556 0.0942 23.5724 24.4292
D2H_Bandwidth_pinned 8192kB GB/sec 24.9287 24.9093 0.0733 24.7171 25.0359
D2H_Bandwidth_pinned 16384kB GB/sec 26.4588 26.1976 2.4387 1.9387 26.5191
D2H_Bandwidth_pinned 32768kB GB/sec 27.2939 27.1202 0.7941 23.2086 27.3277
D2H_Bandwidth_pinned 65536kB GB/sec 26.8278 26.7238 0.3894 24.7946 26.9000
D2H_Bandwidth_pinned 131072kB GB/sec 27.4751 27.3457 0.3968 25.4168 27.5098
D2H_Bandwidth_pinned 262144kB GB/sec 27.8236 27.7173 0.3072 26.7977 27.8525
D2H_Bandwidth_pinned 524288kB GB/sec 28.0193 27.9348 0.1912 27.4707 28.0314
D2H_Time_pinned +064By ms 0.0229 0.0246 0.0457 0.0216 1.4690
D2H_Time_pinned +256By ms 0.0232 0.0234 0.0013 0.0221 0.0378
D2H_Time_pinned +512By ms 0.0234 0.0238 0.0063 0.0224 0.2091
D2H_Time_pinned 1kB ms 0.0234 0.0236 0.0028 0.0224 0.0875
D2H_Time_pinned 2kB ms 0.0235 0.0237 0.0014 0.0224 0.0482
D2H_Time_pinned 4kB ms 0.0235 0.0239 0.0031 0.0226 0.0794
D2H_Time_pinned 8kB ms 0.0237 0.0240 0.0027 0.0226 0.0738
D2H_Time_pinned 16kB ms 0.0240 0.0242 0.0009 0.0232 0.0285
D2H_Time_pinned 32kB ms 0.0245 0.0248 0.0021 0.0235 0.0563
D2H_Time_pinned 64kB ms 0.0254 0.0257 0.0011 0.0242 0.0304
D2H_Time_pinned 128kB ms 0.0272 0.0275 0.0026 0.0264 0.0626
D2H_Time_pinned 256kB ms 0.0318 0.0319 0.0007 0.0307 0.0362
D2H_Time_pinned 512kB ms 0.0410 0.0413 0.0024 0.0402 0.0736
D2H_Time_pinned 1024kB ms 0.0597 0.0599 0.0017 0.0589 0.0824
D2H_Time_pinned 2048kB ms 0.0970 0.0973 0.0010 0.0962 0.1018
D2H_Time_pinned 4096kB ms 0.1728 0.1729 0.0007 0.1717 0.1779
D2H_Time_pinned 8192kB ms 0.3365 0.3367 0.0010 0.3350 0.3394
D2H_Time_pinned 16384kB ms 0.6341 0.7147 0.7979 0.6326 8.6538
D2H_Time_pinned 32768kB ms 1.2294 1.2385 0.0420 1.2278 1.4458
D2H_Time_pinned 65536kB ms 2.5014 2.5117 0.0391 2.4947 2.7066
D2H_Time_pinned 131072kB ms 4.8850 4.9092 0.0748 4.8789 5.2806
D2H_Time_pinned 262144kB ms 9.6478 9.6860 0.1106 9.6377 10.0171
D2H_Time_pinned 524288kB ms 19.1607 19.2196 0.1333 19.1525 19.5434

Note: results marked with (*) had missing values such as
might occur with a mixture of architectural capabilities.
"""

        # One representative metric per direction: command 0 yields htod results, command 1 dtoh results.
        for i, metric in enumerate(['htod_524288kB', 'dtoh_524288kB']):
            assert (benchmark._process_raw_result(i, raw_output[i]))
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        # The stored value is the 'mean' column of the 524288kB bandwidth row.
        assert (benchmark.result['htod_524288kB'][0] == 24.6708)
        assert (benchmark.result['dtoh_524288kB'][0] == 27.9348)

0 commit comments

Comments
 (0)