
Commit b682506

Applied suggestions from comments by @csbnw
1 parent 0cb5e3a commit b682506

File tree: 8 files changed, +5 −65 lines


kernel_tuner/backends/cupy.py (+1 −2)

@@ -47,7 +47,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
-        self.cache_size_L2 = self.devprops["L2CacheSize"]

         self.iterations = iterations
         self.current_module = None
@@ -126,7 +125,7 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
-        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/20ccd63c0acc40969c851b1917dedeb032209e8b/cupy/cuda/compiler.py#L145
+        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145

         options = tuple(compiler_options)
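
The removed attribute remains directly queryable through CuPy's device API, so nothing is lost for users who need it. A minimal standalone sketch (device 0 is an assumption, not part of this commit):

    import cupy as cp

    dev = cp.cuda.Device(0)
    # The backend keeps reading "MaxThreadsPerBlock"; "L2CacheSize" is the
    # attribute this commit stops storing as cache_size_L2.
    print(dev.attributes["MaxThreadsPerBlock"])
    print(dev.attributes["L2CacheSize"])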

kernel_tuner/backends/hip.py (−1)

@@ -59,7 +59,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None

         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        self.cache_size_L2 = self.hipProps.l2CacheSize
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
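
HIP's hipDeviceProp_t still exposes l2CacheSize, so this too stays reachable outside the backend. A rough sketch assuming the pyhip interface this backend wraps (call signature not verified against a specific pyhip release; the struct fields come from hipDeviceProp_t):

    from pyhip import hip

    # Assumed pyhip call mirroring the backend's self.hipProps;
    # the returned object is a ctypes struct, hence the _name field.
    props = hip.hipGetDeviceProperties(0)
    print(props._name.decode('utf-8'))
    print(props.maxThreadsPerBlock, props.l2CacheSize)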

kernel_tuner/backends/nvcuda.py (−4)

@@ -68,10 +68,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
-        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
-        )
-        cuda_error_check(err)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
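
The removed query can still be issued directly with NVIDIA's cuda-python bindings; a minimal sketch with simplified error handling (an assert stands in for the backend's cuda_error_check helper, and device 0 is assumed):

    from cuda import cudart

    err, l2_size = cudart.cudaDeviceGetAttribute(
        cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, 0
    )
    assert err == cudart.cudaError_t.cudaSuccess
    print(f"L2 cache size: {l2_size} bytes")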

kernel_tuner/backends/opencl.py (−4)

@@ -45,10 +45,6 @@ def __init__(
         self.max_threads = self.ctx.devices[0].get_info(
             cl.device_info.MAX_WORK_GROUP_SIZE
         )
-        # TODO the L2 cache size request fails
-        # self.cache_size_L2 = self.ctx.devices[0].get_info(
-        #     cl.device_affinity_domain.L2_CACHE
-        # )
         self.compiler_options = compiler_options or []

         # observer stuff
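
The TODO removed here failed because cl.device_affinity_domain is a device-partitioning enum, not a device_info query token, so get_info rejects it. OpenCL has no L2-specific attribute; the closest standard query is the global memory cache size, which on most GPUs corresponds to the L2 cache. A minimal sketch:

    import pyopencl as cl

    ctx = cl.create_some_context()
    # CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: typically the L2 size on GPU devices.
    print(ctx.devices[0].get_info(cl.device_info.GLOBAL_MEM_CACHE_SIZE))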

kernel_tuner/backends/pycuda.py (−1)

@@ -101,7 +101,6 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        self.cache_size_L2 = devprops["L2_CACHE_SIZE"]
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
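
With PyCUDA the same attribute is available via the device_attribute enum; a standalone sketch (the default device chosen by pycuda.autoinit is assumed):

    import pycuda.autoinit  # creates a context on the default device
    import pycuda.driver as drv

    attrs = drv.Context.get_device().get_attributes()
    # Keys are pycuda.driver.device_attribute enum members, not strings.
    print(attrs[drv.device_attribute.MAX_THREADS_PER_BLOCK])
    print(attrs[drv.device_attribute.L2_CACHE_SIZE])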

kernel_tuner/core.py (+2 −50)

@@ -340,62 +340,14 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)

-        if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']:
-            # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
-            flush_gpu_string = """
-            __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
-                const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-                const char val = d_flush[idx];
-                if (do_write * val) {
-                    d_flush2[idx] = val;
-                }
-            }
-            """
-            cache_size = self.dev.cache_size_L2
-            d_flush = np.ones((cache_size), order='F').astype(np.float32)
-            d_flush2 = np.ones((cache_size), order='F').astype(np.float32)
-            self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)]
-
-            from kernel_tuner.interface import Options
-            options = {
-                'kernel_name': 'flush_gpu',
-                'lang': 'CUDA',
-                'arguments': self.flush_kernel_gpu_args,
-                'problem_size': cache_size,
-                'grid_div_x': None,
-                'grid_div_y': None,
-                'grid_div_z': None,
-                'block_size_names': None,
-            }
-            options = Options(options)
-            flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY'
-            flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang)
-            self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet)
-            self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet)
-            self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args)
-
-            # from kernel_tuner.kernelbuilder import PythonKernel
-            # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
-
-    def flush_cache(self):
-        """This special function can be called to flush the L2 cache."""
-        if hasattr(self, 'flush_kernel'):
-            return
-        self.dev.synchronize()
-        assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance)
-        # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
-        self.dev.synchronize()
-
-    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
-        """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
+    def benchmark_default(self, func, gpu_args, threads, grid, result):
+        """Benchmark one kernel execution at a time."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]

         self.dev.synchronize()
         for _ in range(self.iterations):
-            if flush_cache:
-                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
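
For readers who still need the removed behaviour, the flushing idea is easy to reproduce outside kernel_tuner: launch a throwaway kernel that streams a buffer at least as large as the L2 cache, evicting cached data between timed runs. A minimal CuPy sketch of the same flush_gpu kernel (names and launch configuration are illustrative, not part of the kernel_tuner API; the buffers are padded to the grid size, which the removed code did not do):

    import numpy as np
    import cupy as cp

    flush_src = r'''
    extern "C" __global__
    void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        const char val = d_flush[idx];
        if (do_write * val) {
            d_flush2[idx] = val;
        }
    }
    '''

    def flush_l2(device=0):
        """Evict the L2 cache by streaming an L2-sized buffer through it."""
        with cp.cuda.Device(device) as dev:
            threads = 256
            blocks = (dev.attributes["L2CacheSize"] + threads - 1) // threads
            n = blocks * threads  # pad to the grid size: every thread reads in bounds
            d_flush = cp.ones(n, dtype=cp.int8)
            d_flush2 = cp.ones(n, dtype=cp.int8)
            kernel = cp.RawKernel(flush_src, "flush_gpu")
            kernel((blocks,), (threads,), (d_flush, d_flush2, np.bool_(True)))
            dev.synchronize()

Calling a function like this before each timed launch approximates the removed flush_cache=True path of benchmark_default, at the cost of one extra kernel launch per iteration.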

test/test_pycuda_mocked.py (+1 −2)

@@ -13,8 +13,7 @@ def setup_mock(drv):
    context = Mock()
    devprops = {'MAX_THREADS_PER_BLOCK': 1024,
                'COMPUTE_CAPABILITY_MAJOR': 5,
-               'COMPUTE_CAPABILITY_MINOR': 5,
-               'L2_CACHE_SIZE': 4096}
+               'COMPUTE_CAPABILITY_MINOR': 5,}
    context.return_value.get_device.return_value.get_attributes.return_value = devprops
    context.return_value.get_device.return_value.compute_capability.return_value = "55"
    drv.Device.return_value.retain_primary_context.return_value = context()

test/test_util_functions.py (+1 −1)

@@ -153,7 +153,7 @@ def test_to_valid_nvrtc_gpu_arch_cc():
    assert to_valid_nvrtc_gpu_arch_cc("40") == "52"
    assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
    assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
-   assert to_valid_nvrtc_gpu_arch_cc("10123001") == "52"
+   assert to_valid_nvrtc_gpu_arch_cc("1234") == "52"
    with pytest.raises(ValueError):
        assert to_valid_nvrtc_gpu_arch_cc("")
        assert to_valid_nvrtc_gpu_arch_cc("1")
