-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathmm_flops.py
60 lines (47 loc) · 2.56 KB
/
mm_flops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import time
import torch
import sys
import numpy as np
import argparse
import os
from utils import Tee, benchmark_mm, print_benchmark_header
# Directory containing this script; used to build the default output path.
file_dir = os.path.abspath(os.path.dirname(__file__))


def build_parser():
    """Build the command-line parser for the GEMM benchmark sweep.

    Each of the three GEMM dimensions must be given exactly once, either as
    an explicit list of sizes (``-m/-n/-k``, at least one value) or as a
    ``start stop step`` triple (``--m_range/--n_range/--k_range``).

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser()
    # Shown in --help / usage so the three expected values are self-describing.
    range_metavar = ("START", "STOP", "STEP")

    m_group = parser.add_mutually_exclusive_group(required=True)
    m_group.add_argument("-m", nargs="+", type=int, help='The first dimension of the GEMM, enter any number of arguments')
    m_group.add_argument("--m_range", nargs=3, type=int, metavar=range_metavar, help="The first dimension of the GEMM, [start,stop,step]")

    # "+" (not "*") so that a bare "-n" / "-k" is rejected instead of silently
    # producing an empty sweep; this also matches how "-m" is declared.
    n_group = parser.add_mutually_exclusive_group(required=True)
    n_group.add_argument("-n", nargs="+", type=int, help='The shared dimension of the GEMM, enter any number of arguments')
    n_group.add_argument("--n_range", nargs=3, type=int, metavar=range_metavar, help="The shared dimension of the GEMM, [start,stop,step]")

    k_group = parser.add_mutually_exclusive_group(required=True)
    k_group.add_argument("-k", nargs="+", type=int, help='The last dimension of the GEMM, enter any number of arguments')
    k_group.add_argument("--k_range", nargs=3, type=int, metavar=range_metavar, help="The last dimension of the GEMM, [start,stop,step]")

    parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each GEMM')
    parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations')
    parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on")
    parser.add_argument("--output_file", type=str, default=f"{file_dir}/results/mm.out")
    parser.add_argument("--notes", type=str, default="", help="benchmark-specific notes to add to the output_file's header")
    parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction, help='log to stdout besides output_file?')
    return parser


def resolve_sizes(values, range_spec):
    """Return the list of sizes to sweep for one GEMM dimension.

    Args:
        values: explicit sizes from ``-m/-n/-k``, or ``None`` when the
            corresponding ``--*_range`` option was used instead.
        range_spec: ``(start, stop, step)`` from ``--*_range``; only read
            when ``values`` is ``None``.

    Returns:
        list[int]: plain Python ints, whichever option supplied them.

    Raises:
        ValueError: if the range has a zero step or selects no sizes.
    """
    if values is not None:
        return list(values)

    start, stop, step = range_spec
    if step == 0:
        raise ValueError("step must not be zero")
    # range() yields the same integers as np.arange() for integer arguments,
    # but as Python ints, matching the type produced by the explicit lists.
    sizes = list(range(start, stop, step))
    if not sizes:
        raise ValueError(f"range [{start},{stop},{step}] selects no sizes")
    return sizes


def main(argv=None):
    """Parse the command line and benchmark every (M, N, K) combination.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    sizes = {}
    for dim in ("m", "n", "k"):
        try:
            sizes[dim] = resolve_sizes(getattr(args, dim), getattr(args, f"{dim}_range"))
        except ValueError as exc:
            # Report bad ranges as a usage error rather than a traceback.
            parser.error(f"--{dim}_range: {exc}")

    # set cuda device
    torch.cuda.set_device(f"cuda:{args.cuda_device}")

    # From here on stdout goes through the project's Tee helper, which is
    # given the output path and the verbose flag (presumably to mirror
    # output into the file -- see utils.Tee).
    sys.stdout = Tee(args.output_file, args.verbose)
    print_benchmark_header(args.notes)

    # loop through all sizes to benchmark
    for M in sizes["m"]:
        for N in sizes["n"]:
            for K in sizes["k"]:
                benchmark_mm(M, N, K, args.num_iterations, args.num_warmup_iterations)


if __name__ == '__main__':
    main()