diff --git a/src/neural/backends/cuda/common_kernels.cu b/src/neural/backends/cuda/common_kernels.cu
index 61a791b036..f4e8906fca 100644
--- a/src/neural/backends/cuda/common_kernels.cu
+++ b/src/neural/backends/cuda/common_kernels.cu
@@ -70,6 +70,7 @@ void addVectors(T* c, T* a, T* b, int size, int asize, int bsize,
   const int kBlockSize = 256;
   int blocks = DivUp(size, kBlockSize);
 
+  CUDA_KERNEL_LAUNCH_LOG(addVectors_kernel, blocks, kBlockSize, 0, stream);
   addVectors_kernel<<<blocks, kBlockSize, 0, stream>>>(c, a, b, size, asize,
                                                        bsize, activation);
   ReportCUDAErrors(cudaGetLastError());
@@ -98,6 +99,7 @@ template <typename T>
 void addVectorsHNC_NHC(T* a, T* b, int N, int H, int C, cudaStream_t stream) {
   const int kBlockSize = 256;
   int blocks = DivUp(N * H * C, kBlockSize);
+  CUDA_KERNEL_LAUNCH_LOG(addVectorsHNC_NHC_kernel, blocks, kBlockSize, 0, stream);
   addVectorsHNC_NHC_kernel<<<blocks, kBlockSize, 0, stream>>>(a, b, N, H, C);
 
   ReportCUDAErrors(cudaGetLastError());
@@ -171,26 +173,32 @@ void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
 
   switch (activation) {
     case ACTIVATION_NONE:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_NONE, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_NONE>
           <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
     case ACTIVATION_SELU:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_SELU, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_SELU>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
     case ACTIVATION_MISH:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_MISH, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_MISH>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
     case ACTIVATION_RELU:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_RELU, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_RELU>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
     case ACTIVATION_SWISH:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_SWISH, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_SWISH>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
     case ACTIVATION_RELU_2:  // square relu
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_RELU_2, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_RELU_2>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C);
       break;
@@ -271,31 +279,37 @@ void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
 
   switch (activation) {
     case ACTIVATION_NONE:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_NONE, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_NONE>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
       break;
     case ACTIVATION_SELU:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_SELU, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_SELU>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
       break;
     case ACTIVATION_MISH:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_MISH, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_MISH>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
       break;
     case ACTIVATION_RELU:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_RELU, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_RELU>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
       break;
     case ACTIVATION_SWISH:
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_SWISH, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_SWISH>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
       break;
     case ACTIVATION_RELU_2:  // square relu
+      CUDA_KERNEL_LAUNCH_LOG(addBiasBatched_kernel_RELU_2, gridDim, blockDim, 0, stream);
       addBiasBatched_kernel<T, ACTIVATION_RELU_2>
          <<<gridDim, blockDim, 0, stream>>>(output, input, bias, N, C,
                                             Nstride);
@@ -336,6 +350,7 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W,
   const int kBlockSize = 256;
   int blocks = DivUp(size, kBlockSize);
 
+  CUDA_KERNEL_LAUNCH_LOG(addBias_NCHW_kernel, blocks, kBlockSize, 0, stream);
   addBias_NCHW_kernel<<<blocks, kBlockSize, 0, stream>>>(c, a, b, N, C, H, W,
                                                          activation);
   ReportCUDAErrors(cudaGetLastError());
@@ -387,6 +402,7 @@ void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor,
   size_t numElements = Nout * Cout * H * W;
   const int blockSize = 256;
   int blocks = DivUp(numElements, blockSize);
+  CUDA_KERNEL_LAUNCH_LOG(NCHWtoNHWC_kernel, blocks, blockSize, 0, stream);
   NCHWtoNHWC_kernel<<<blocks, blockSize, 0, stream>>>(
       output_tensor, input_tensor, Nin, Cin, Nout, Cout, H, W);
 }
@@ -405,6 +421,7 @@ template <typename DstType, typename SrcType>
 void copyTypeConverted(DstType* op, SrcType* ip, int N, cudaStream_t stream) {
   const int kBlockSize = 256;
   int blocks = DivUp(N, kBlockSize);
+  CUDA_KERNEL_LAUNCH_LOG(copyTypeConverted_kernel, blocks, kBlockSize, 0, stream);
   copyTypeConverted_kernel<<<blocks, kBlockSize, 0, stream>>>(op, ip, N);
 }
 
@@ -444,6 +461,7 @@ void batchNorm(T* output, const T* input, const T* skipInput, int N, int C,
   const int kBlockSize = 256;
   int blocks = DivUp(total_elements, kBlockSize);
 
+  CUDA_KERNEL_LAUNCH_LOG(batchNorm_kernel, blocks, kBlockSize, 0, stream);
   batchNorm_kernel<<<blocks, kBlockSize, 0, stream>>>(
       output, input, skipInput, N, C, H, W, means, var_multipliers, activation);
 
@@ -483,6 +501,7 @@ void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks,
   int threads = n * 8 * 8 / 2;  // Each thread writes two elements.
   const int blockSize = 256;
   int blocks = DivUp(threads, blockSize);
+  CUDA_KERNEL_LAUNCH_LOG(expandPlanes_kernel_Fp32_NCHW, blocks, blockSize, 0, stream);
   expandPlanes_kernel_Fp32_NCHW<<<blocks, blockSize, 0, stream>>>(output, masks,
                                                                   values, n);
   ReportCUDAErrors(cudaGetLastError());
@@ -517,6 +536,7 @@ void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks,
   int threads = n * 8 * 8;  // Each thread writes a single element.
   const int kBlockSize = 256;
   int blocks = DivUp(threads, kBlockSize);
+  CUDA_KERNEL_LAUNCH_LOG(expandPlanes_kernel_Fp16_NHWC, blocks, kBlockSize, 0, stream);
   expandPlanes_kernel_Fp16_NHWC<<<blocks, kBlockSize, 0, stream>>>(
       output, masks, values, n);
   ReportCUDAErrors(cudaGetLastError());
@@ -558,6 +578,7 @@ void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks,
   unsigned threads = n * 8 * 8 / 2;  // each thread writes two elements.
   const int blockSize = 256;
   unsigned blocks = DivUp(threads, blockSize);
+  CUDA_KERNEL_LAUNCH_LOG(expandPlanes_kernel_Fp16_NCHW, blocks, blockSize, 0, stream);
   expandPlanes_kernel_Fp16_NCHW<<<blocks, blockSize, 0, stream>>>(output, masks,
                                                                   values, n);
   ReportCUDAErrors(cudaGetLastError());
@@ -704,6 +725,7 @@ void globalAvgPool(int N, int C, T* output, const T* input,
   if (nhwc) {
     assert((std::is_same<half, T>::value));
     // For NHWC fp16, simply launch N blocks, each with C threads.
+ CUDA_KERNEL_LAUNCH_LOG(globalAvgPool_kernel_NHWC_fp16, N, C, 0, stream); globalAvgPool_kernel_NHWC_fp16<<>>( (half*)output, (half*)input, (half*)prevLayerBias, N * C * kPlaneSize, N * C); @@ -717,6 +739,7 @@ void globalAvgPool(int N, int C, T* output, const T* input, const int kBlockSize = kWarpsPerBlock * 32; int blocks = DivUp(kTotalWarps, kWarpsPerBlock); + CUDA_KERNEL_LAUNCH_LOG(globalAvgPool_kernel, blocks, kBlockSize, 0, stream); globalAvgPool_kernel<<>>( output, input, prevLayerBias, N * C * kPlaneSize, N * C, C); } @@ -733,10 +756,12 @@ void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, if (nhwc) { assert((std::is_same::value)); + CUDA_KERNEL_LAUNCH_LOG(globalScale_kernel_fp16_nhwc, kBlocks, kBlockSize, 0, stream); globalScale_kernel_fp16_nhwc<<>>( (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, activation); } else { + CUDA_KERNEL_LAUNCH_LOG(globalScale_kernel, kBlocks, kBlockSize, 0, stream); globalScale_kernel<<>>( output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C, activation); } @@ -770,6 +795,7 @@ void PolicyMap(int N, T* output, const T* input, const short* indices, const int kBlockSize = 256; const int kBlocks = DivUp(N * usedSize, kBlockSize); + CUDA_KERNEL_LAUNCH_LOG(policyMap_kernel, kBlocks, kBlockSize, 0, stream); policyMap_kernel<<>>( (T*)output, (T*)input, (short*)indices, N, inputSize, usedSize, outputSize); @@ -785,6 +811,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, // Each thread processes entire chess board if (use_se == false) { dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1); + CUDA_KERNEL_LAUNCH_LOG(OutputTransform_relu_InputTransform_kernel, grid_dim, kOpInpTransformBlockSize, 0, stream); OutputTransform_relu_InputTransform_kernel <<>>(N, C, output, input, @@ -794,6 +821,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, "res block fusing opt not supported for the given data type and no " "of filters\n"); } else { + CUDA_KERNEL_LAUNCH_LOG(OutputTransform_SE_relu_InputTransform_kernel, N, C, 0, stream); OutputTransform_SE_relu_InputTransform_kernel <<>>(N, C, se_K, output, input, (float*)skip, bias, w1, @@ -934,9 +962,11 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, int size = N * 32; // Total no of threads needed const int kBlockSize = 256; int blocks = DivUp(size, kBlockSize); + CUDA_KERNEL_LAUNCH_LOG(softmax_opt_64_kernel, blocks, kBlockSize, 0, stream); softmax_opt_64_kernel <<>>(output, input, input2, size); } else { + CUDA_KERNEL_LAUNCH_LOG(softmax_kernel, N, C, 0, stream); softmax_kernel<<>>(output, input, input2); } @@ -1143,6 +1173,7 @@ void LayerNorm(int N, int C, T* output, const T* input, const T* bias, gridDim.y = 1; gridDim.z = 1; + CUDA_KERNEL_LAUNCH_LOG(layer_norm_kernel, gridDim, blockDim, 0, stream); layer_norm_kernel<<>>( N, C, output, input, bias, skip, gammas, betas, ep, alpha, act); @@ -1239,6 +1270,7 @@ void ComputePromotionLogits(int N, int C, T* output, const T* keys, // 8 * 24 threads // Each thread computes a single output element dim3 blockDim(24, 8, 1); + CUDA_KERNEL_LAUNCH_LOG(promotion_logits_kernel, N, blockDim, 0, stream); promotion_logits_kernel <<>>(C, output, keys, ppo, policy_attn_logits); } @@ -1281,6 +1313,7 @@ void inputPreprocessForAttentionBody(T* output, const T* input, // Each thread computes a single output element dim3 gridSize = dim3(N, 64); int blockSize = input_size + encoding_size; + 
CUDA_KERNEL_LAUNCH_LOG(preprocess_for_attention_body_kernel, gridSize, blockSize, 0, stream); preprocess_for_attention_body_kernel<<>>( output, input, encoding, input_size, encoding_size, is_pe_dense_embedding); @@ -1317,6 +1350,7 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, gridSize.x = DivUp(C, blockSize.x); gridSize.y = 1; gridSize.z = N; + CUDA_KERNEL_LAUNCH_LOG(input_gating_kernel, gridSize, blockSize, 0, stream); input_gating_kernel <<>>(output, input, mult, add, HW, C); diff --git a/src/neural/backends/cuda/cuda_common.h b/src/neural/backends/cuda/cuda_common.h index 450bdb8a8f..9fae21e840 100644 --- a/src/neural/backends/cuda/cuda_common.h +++ b/src/neural/backends/cuda/cuda_common.h @@ -38,6 +38,9 @@ typedef void* cudnnHandle_t; #endif +// Include CUDA wrapper functions for debug logging +#include "cuda_wrapper.h" + #if CUBLAS_VER_MAJOR < 11 #define CUBLAS_PEDANTIC_MATH CUBLAS_DEFAULT_MATH #endif diff --git a/src/neural/backends/cuda/cuda_wrapper.h b/src/neural/backends/cuda/cuda_wrapper.h new file mode 100644 index 0000000000..2fc0dc006d --- /dev/null +++ b/src/neural/backends/cuda/cuda_wrapper.h @@ -0,0 +1,764 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2024 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. 
+*/
+
+#pragma once
+
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#endif
+
+#include <sstream>
+#include <string>
+#include "utils/logging.h"
+
+// Debug wrapper functions for CUDA, cuBLAS, and cuDNN calls.
+// These wrappers log function calls with their input and output parameters.
+
+namespace lczero {
+namespace cudnn_backend {
+
+// Enable/disable debug logging at compile time.
+#ifndef CUDA_WRAPPER_DEBUG
+#define CUDA_WRAPPER_DEBUG 0
+#endif
+
+// Helper macro for conditional logging.
+#if CUDA_WRAPPER_DEBUG
+#define CUDA_DEBUG_LOG(msg) LOGFILE << "[CUDA_WRAPPER] " << msg
+#else
+#define CUDA_DEBUG_LOG(msg) do {} while(0)
+#endif
+
+// Helper function to convert a pointer to a hex string.
+template <typename T>
+inline std::string PtrToStr(T* ptr) {
+  std::ostringstream oss;
+  oss << "0x" << std::hex << reinterpret_cast<uintptr_t>(ptr);
+  return oss.str();
+}
+
+// CUDA Runtime API Wrappers
+
+template <typename T>
+inline cudaError_t cudaMalloc(T** devPtr, size_t size) {
+  CUDA_DEBUG_LOG("cudaMalloc(devPtr=" << PtrToStr(devPtr)
+                 << ", size=" << size << ")");
+  cudaError_t result = ::cudaMalloc(reinterpret_cast<void**>(devPtr), size);
+  CUDA_DEBUG_LOG("cudaMalloc -> " << cudaGetErrorString(result)
+                 << ", *devPtr=" << PtrToStr(*devPtr));
+  return result;
+}
+
+template <typename T>
+inline cudaError_t cudaFree(T* devPtr) {
+  CUDA_DEBUG_LOG("cudaFree(devPtr=" << PtrToStr(devPtr) << ")");
+  cudaError_t result = ::cudaFree(devPtr);
+  CUDA_DEBUG_LOG("cudaFree -> " << cudaGetErrorString(result));
+  return result;
+}
+
+template <typename T1, typename T2>
+inline cudaError_t cudaMemcpy(T1* dst, const T2* src, size_t count,
+                              cudaMemcpyKind kind) {
+  const char* kind_str =
+      (kind == cudaMemcpyHostToDevice) ? "HostToDevice" :
+      (kind == cudaMemcpyDeviceToHost) ? "DeviceToHost" :
+      (kind == cudaMemcpyDeviceToDevice) ? "DeviceToDevice" :
+      (kind == cudaMemcpyHostToHost) ? "HostToHost" : "Unknown";
+  CUDA_DEBUG_LOG("cudaMemcpy(dst=" << PtrToStr(dst)
+                 << ", src=" << PtrToStr(src)
+                 << ", count=" << count
+                 << ", kind=" << kind_str << ")");
+  cudaError_t result = ::cudaMemcpy(dst, src, count, kind);
+  CUDA_DEBUG_LOG("cudaMemcpy -> " << cudaGetErrorString(result));
+  return result;
+}
+
+template <typename T1, typename T2>
+inline cudaError_t cudaMemcpyAsync(T1* dst, const T2* src, size_t count,
+                                   cudaMemcpyKind kind, cudaStream_t stream) {
+  const char* kind_str =
+      (kind == cudaMemcpyHostToDevice) ? "HostToDevice" :
+      (kind == cudaMemcpyDeviceToHost) ? "DeviceToHost" :
+      (kind == cudaMemcpyDeviceToDevice) ? "DeviceToDevice" :
+      (kind == cudaMemcpyHostToHost) ?
"HostToHost" : "Unknown"; + CUDA_DEBUG_LOG("cudaMemcpyAsync(dst=" << PtrToStr(dst) + << ", src=" << PtrToStr(src) + << ", count=" << count + << ", kind=" << kind_str + << ", stream=" << PtrToStr(stream) << ")"); + cudaError_t result = ::cudaMemcpyAsync(dst, src, count, kind, stream); + CUDA_DEBUG_LOG("cudaMemcpyAsync -> " << cudaGetErrorString(result)); + return result; +} + +template +inline cudaError_t cudaMemset(T* devPtr, int value, size_t count) { + CUDA_DEBUG_LOG("cudaMemset(devPtr=" << PtrToStr(devPtr) + << ", value=" << value + << ", count=" << count << ")"); + cudaError_t result = ::cudaMemset(devPtr, value, count); + CUDA_DEBUG_LOG("cudaMemset -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, + unsigned int flags) { + CUDA_DEBUG_LOG("cudaStreamCreateWithFlags(pStream=" << PtrToStr(pStream) + << ", flags=" << flags << ")"); + cudaError_t result = ::cudaStreamCreateWithFlags(pStream, flags); + CUDA_DEBUG_LOG("cudaStreamCreateWithFlags -> " << cudaGetErrorString(result) + << ", *pStream=" << PtrToStr(*pStream)); + return result; +} + +inline cudaError_t cudaStreamDestroy(cudaStream_t stream) { + CUDA_DEBUG_LOG("cudaStreamDestroy(stream=" << PtrToStr(stream) << ")"); + cudaError_t result = ::cudaStreamDestroy(stream); + CUDA_DEBUG_LOG("cudaStreamDestroy -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { + CUDA_DEBUG_LOG("cudaStreamSynchronize(stream=" << PtrToStr(stream) << ")"); + cudaError_t result = ::cudaStreamSynchronize(stream); + CUDA_DEBUG_LOG("cudaStreamSynchronize -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaStreamSetAttribute(cudaStream_t stream, + cudaStreamAttrID attr, + const cudaStreamAttrValue* value) { + CUDA_DEBUG_LOG("cudaStreamSetAttribute(stream=" << PtrToStr(stream) + << ", attr=" << attr << ")"); + cudaError_t result = ::cudaStreamSetAttribute(stream, attr, value); + CUDA_DEBUG_LOG("cudaStreamSetAttribute -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, + unsigned int flags) { + CUDA_DEBUG_LOG("cudaStreamWaitEvent(stream=" << PtrToStr(stream) + << ", event=" << PtrToStr(event) + << ", flags=" << flags << ")"); + cudaError_t result = ::cudaStreamWaitEvent(stream, event, flags); + CUDA_DEBUG_LOG("cudaStreamWaitEvent -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, + unsigned int flags) { + CUDA_DEBUG_LOG("cudaEventCreateWithFlags(event=" << PtrToStr(event) + << ", flags=" << flags << ")"); + cudaError_t result = ::cudaEventCreateWithFlags(event, flags); + CUDA_DEBUG_LOG("cudaEventCreateWithFlags -> " << cudaGetErrorString(result) + << ", *event=" << PtrToStr(*event)); + return result; +} + +inline cudaError_t cudaEventDestroy(cudaEvent_t event) { + CUDA_DEBUG_LOG("cudaEventDestroy(event=" << PtrToStr(event) << ")"); + cudaError_t result = ::cudaEventDestroy(event); + CUDA_DEBUG_LOG("cudaEventDestroy -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { + CUDA_DEBUG_LOG("cudaEventRecord(event=" << PtrToStr(event) + << ", stream=" << PtrToStr(stream) << ")"); + cudaError_t result = ::cudaEventRecord(event, stream); + CUDA_DEBUG_LOG("cudaEventRecord -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t 
cudaEventRecordWithFlags(cudaEvent_t event, + cudaStream_t stream, + unsigned int flags) { + CUDA_DEBUG_LOG("cudaEventRecordWithFlags(event=" << PtrToStr(event) + << ", stream=" << PtrToStr(stream) + << ", flags=" << flags << ")"); + cudaError_t result = ::cudaEventRecordWithFlags(event, stream, flags); + CUDA_DEBUG_LOG("cudaEventRecordWithFlags -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaEventSynchronize(cudaEvent_t event) { + CUDA_DEBUG_LOG("cudaEventSynchronize(event=" << PtrToStr(event) << ")"); + cudaError_t result = ::cudaEventSynchronize(event); + CUDA_DEBUG_LOG("cudaEventSynchronize -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaGetDeviceCount(int* count) { + CUDA_DEBUG_LOG("cudaGetDeviceCount(count=" << PtrToStr(count) << ")"); + cudaError_t result = ::cudaGetDeviceCount(count); + CUDA_DEBUG_LOG("cudaGetDeviceCount -> " << cudaGetErrorString(result) + << ", *count=" << *count); + return result; +} + +inline cudaError_t cudaSetDevice(int device) { + CUDA_DEBUG_LOG("cudaSetDevice(device=" << device << ")"); + cudaError_t result = ::cudaSetDevice(device); + CUDA_DEBUG_LOG("cudaSetDevice -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) { + CUDA_DEBUG_LOG("cudaGetDeviceProperties(prop=" << PtrToStr(prop) + << ", device=" << device << ")"); + cudaError_t result = ::cudaGetDeviceProperties(prop, device); + CUDA_DEBUG_LOG("cudaGetDeviceProperties -> " << cudaGetErrorString(result) + << ", name=" << (result == cudaSuccess ? prop->name : "N/A")); + return result; +} + +inline cudaError_t cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, + int device) { + CUDA_DEBUG_LOG("cudaDeviceGetAttribute(value=" << PtrToStr(value) + << ", attr=" << attr + << ", device=" << device << ")"); + cudaError_t result = ::cudaDeviceGetAttribute(value, attr, device); + CUDA_DEBUG_LOG("cudaDeviceGetAttribute -> " << cudaGetErrorString(result) + << ", *value=" << *value); + return result; +} + +inline cudaError_t cudaGetLastError() { + cudaError_t result = ::cudaGetLastError(); + CUDA_DEBUG_LOG("cudaGetLastError() -> " << cudaGetErrorString(result)); + return result; +} + +inline const char* cudaGetErrorString(cudaError_t error) { + return ::cudaGetErrorString(error); +} + +inline cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) { + CUDA_DEBUG_LOG("cudaRuntimeGetVersion(runtimeVersion=" + << PtrToStr(runtimeVersion) << ")"); + cudaError_t result = ::cudaRuntimeGetVersion(runtimeVersion); + CUDA_DEBUG_LOG("cudaRuntimeGetVersion -> " << cudaGetErrorString(result) + << ", *runtimeVersion=" << *runtimeVersion); + return result; +} + +inline cudaError_t cudaDriverGetVersion(int* driverVersion) { + CUDA_DEBUG_LOG("cudaDriverGetVersion(driverVersion=" + << PtrToStr(driverVersion) << ")"); + cudaError_t result = ::cudaDriverGetVersion(driverVersion); + CUDA_DEBUG_LOG("cudaDriverGetVersion -> " << cudaGetErrorString(result) + << ", *driverVersion=" << *driverVersion); + return result; +} + +template +inline cudaError_t cudaFuncSetAttribute(const T* func, cudaFuncAttribute attr, + int value) { + CUDA_DEBUG_LOG("cudaFuncSetAttribute(func=" << PtrToStr(func) + << ", attr=" << attr + << ", value=" << value << ")"); + cudaError_t result = ::cudaFuncSetAttribute(reinterpret_cast(func), attr, value); + CUDA_DEBUG_LOG("cudaFuncSetAttribute -> " << cudaGetErrorString(result)); + return result; +} + +inline cudaError_t cudaCtxResetPersistingL2Cache() { + 
CUDA_DEBUG_LOG("cudaCtxResetPersistingL2Cache()"); + cudaError_t result = ::cudaCtxResetPersistingL2Cache(); + CUDA_DEBUG_LOG("cudaCtxResetPersistingL2Cache -> " << cudaGetErrorString(result)); + return result; +} + +// cuBLAS API Wrappers + +inline cublasStatus_t cublasCreate(cublasHandle_t* handle) { + CUDA_DEBUG_LOG("cublasCreate(handle=" << PtrToStr(handle) << ")"); + cublasStatus_t result = ::cublasCreate(handle); + CUDA_DEBUG_LOG("cublasCreate -> " << static_cast(result) + << ", *handle=" << PtrToStr(*handle)); + return result; +} + +inline cublasStatus_t cublasDestroy(cublasHandle_t handle) { + CUDA_DEBUG_LOG("cublasDestroy(handle=" << PtrToStr(handle) << ")"); + cublasStatus_t result = ::cublasDestroy(handle); + CUDA_DEBUG_LOG("cublasDestroy -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasSetStream(cublasHandle_t handle, + cudaStream_t streamId) { + CUDA_DEBUG_LOG("cublasSetStream(handle=" << PtrToStr(handle) + << ", streamId=" << PtrToStr(streamId) << ")"); + cublasStatus_t result = ::cublasSetStream(handle, streamId); + CUDA_DEBUG_LOG("cublasSetStream -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasSetMathMode(cublasHandle_t handle, + cublasMath_t mode) { + CUDA_DEBUG_LOG("cublasSetMathMode(handle=" << PtrToStr(handle) + << ", mode=" << static_cast(mode) << ")"); + cublasStatus_t result = ::cublasSetMathMode(handle, mode); + CUDA_DEBUG_LOG("cublasSetMathMode -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float* alpha, const float* A, int lda, + const float* B, int ldb, const float* beta, + float* C, int ldc) { + CUDA_DEBUG_LOG("cublasSgemm(handle=" << PtrToStr(handle) + << ", transa=" << static_cast(transa) + << ", transb=" << static_cast(transb) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", alpha=" << *alpha << ", lda=" << lda + << ", ldb=" << ldb << ", beta=" << *beta + << ", ldc=" << ldc << ")"); + cublasStatus_t result = ::cublasSgemm(handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + CUDA_DEBUG_LOG("cublasSgemm -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasHgemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const __half* alpha, const __half* A, int lda, + const __half* B, int ldb, const __half* beta, + __half* C, int ldc) { + CUDA_DEBUG_LOG("cublasHgemm(handle=" << PtrToStr(handle) + << ", transa=" << static_cast(transa) + << ", transb=" << static_cast(transb) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", lda=" << lda << ", ldb=" << ldb + << ", ldc=" << ldc << ")"); + cublasStatus_t result = ::cublasHgemm(handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + CUDA_DEBUG_LOG("cublasHgemm -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float* alpha, + const float* const Aarray[], int lda, + const float* const Barray[], int ldb, + const float* beta, + float* const Carray[], int ldc, + int batchCount) { + CUDA_DEBUG_LOG("cublasSgemmBatched(handle=" << PtrToStr(handle) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", batchCount=" << batchCount << ")"); + cublasStatus_t result = ::cublasSgemmBatched(handle, transa, transb, m, n, k, + alpha, Aarray, lda, Barray, ldb, 
+ beta, Carray, ldc, batchCount); + CUDA_DEBUG_LOG("cublasSgemmBatched -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const __half* alpha, + const __half* const Aarray[], int lda, + const __half* const Barray[], int ldb, + const __half* beta, + __half* const Carray[], int ldc, + int batchCount) { + CUDA_DEBUG_LOG("cublasHgemmBatched(handle=" << PtrToStr(handle) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", batchCount=" << batchCount << ")"); + cublasStatus_t result = ::cublasHgemmBatched(handle, transa, transb, m, n, k, + alpha, Aarray, lda, Barray, ldb, + beta, Carray, ldc, batchCount); + CUDA_DEBUG_LOG("cublasHgemmBatched -> " << static_cast(result)); + return result; +} + +inline cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float* alpha, + const float* A, int lda, + long long int strideA, + const float* B, int ldb, + long long int strideB, + const float* beta, + float* C, int ldc, + long long int strideC, + int batchCount) { + CUDA_DEBUG_LOG("cublasSgemmStridedBatched(handle=" << PtrToStr(handle) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", batchCount=" << batchCount << ")"); + cublasStatus_t result = ::cublasSgemmStridedBatched(handle, transa, transb, + m, n, k, alpha, A, lda, + strideA, B, ldb, strideB, + beta, C, ldc, strideC, + batchCount); + CUDA_DEBUG_LOG("cublasSgemmStridedBatched -> " << static_cast(result)); + return result; +} + +template +inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const T1* alpha, + const T2* A, cudaDataType Atype, + int lda, long long int strideA, + const T3* B, cudaDataType Btype, + int ldb, long long int strideB, + const T1* beta, + T4* C, cudaDataType Ctype, + int ldc, long long int strideC, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) { + CUDA_DEBUG_LOG("cublasGemmStridedBatchedEx(handle=" << PtrToStr(handle) + << ", m=" << m << ", n=" << n << ", k=" << k + << ", batchCount=" << batchCount << ")"); + cublasStatus_t result = ::cublasGemmStridedBatchedEx(handle, transa, transb, + m, n, k, alpha, A, Atype, + lda, strideA, B, Btype, + ldb, strideB, beta, C, + Ctype, ldc, strideC, + batchCount, computeType, + algo); + CUDA_DEBUG_LOG("cublasGemmStridedBatchedEx -> " << static_cast(result)); + return result; +} + +// cuDNN API Wrappers + +#ifdef USE_CUDNN + +inline cudnnStatus_t cudnnCreate(cudnnHandle_t* handle) { + CUDA_DEBUG_LOG("cudnnCreate(handle=" << PtrToStr(handle) << ")"); + cudnnStatus_t result = ::cudnnCreate(handle); + CUDA_DEBUG_LOG("cudnnCreate -> " << cudnnGetErrorString(result) + << ", *handle=" << PtrToStr(*handle)); + return result; +} + +inline cudnnStatus_t cudnnDestroy(cudnnHandle_t handle) { + CUDA_DEBUG_LOG("cudnnDestroy(handle=" << PtrToStr(handle) << ")"); + cudnnStatus_t result = ::cudnnDestroy(handle); + CUDA_DEBUG_LOG("cudnnDestroy -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { + CUDA_DEBUG_LOG("cudnnSetStream(handle=" << PtrToStr(handle) + << ", streamId=" << PtrToStr(streamId) << ")"); + cudnnStatus_t result = ::cudnnSetStream(handle, streamId); + CUDA_DEBUG_LOG("cudnnSetStream -> " << cudnnGetErrorString(result)); + return 
result; +} + +inline size_t cudnnGetVersion() { + size_t version = ::cudnnGetVersion(); + CUDA_DEBUG_LOG("cudnnGetVersion() -> " << version); + return version; +} + +inline const char* cudnnGetErrorString(cudnnStatus_t status) { + return ::cudnnGetErrorString(status); +} + +inline cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t* tensorDesc) { + CUDA_DEBUG_LOG("cudnnCreateTensorDescriptor(tensorDesc=" + << PtrToStr(tensorDesc) << ")"); + cudnnStatus_t result = ::cudnnCreateTensorDescriptor(tensorDesc); + CUDA_DEBUG_LOG("cudnnCreateTensorDescriptor -> " << cudnnGetErrorString(result) + << ", *tensorDesc=" << PtrToStr(*tensorDesc)); + return result; +} + +inline cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + CUDA_DEBUG_LOG("cudnnDestroyTensorDescriptor(tensorDesc=" + << PtrToStr(tensorDesc) << ")"); + cudnnStatus_t result = ::cudnnDestroyTensorDescriptor(tensorDesc); + CUDA_DEBUG_LOG("cudnnDestroyTensorDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int n, int c, int h, int w) { + CUDA_DEBUG_LOG("cudnnSetTensor4dDescriptor(tensorDesc=" << PtrToStr(tensorDesc) + << ", format=" << static_cast(format) + << ", dataType=" << static_cast(dataType) + << ", n=" << n << ", c=" << c + << ", h=" << h << ", w=" << w << ")"); + cudnnStatus_t result = ::cudnnSetTensor4dDescriptor(tensorDesc, format, dataType, + n, c, h, w); + CUDA_DEBUG_LOG("cudnnSetTensor4dDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t* filterDesc) { + CUDA_DEBUG_LOG("cudnnCreateFilterDescriptor(filterDesc=" + << PtrToStr(filterDesc) << ")"); + cudnnStatus_t result = ::cudnnCreateFilterDescriptor(filterDesc); + CUDA_DEBUG_LOG("cudnnCreateFilterDescriptor -> " << cudnnGetErrorString(result) + << ", *filterDesc=" << PtrToStr(*filterDesc)); + return result; +} + +inline cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + CUDA_DEBUG_LOG("cudnnDestroyFilterDescriptor(filterDesc=" + << PtrToStr(filterDesc) << ")"); + cudnnStatus_t result = ::cudnnDestroyFilterDescriptor(filterDesc); + CUDA_DEBUG_LOG("cudnnDestroyFilterDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, + cudnnTensorFormat_t format, + int k, int c, int h, int w) { + CUDA_DEBUG_LOG("cudnnSetFilter4dDescriptor(filterDesc=" << PtrToStr(filterDesc) + << ", dataType=" << static_cast(dataType) + << ", format=" << static_cast(format) + << ", k=" << k << ", c=" << c + << ", h=" << h << ", w=" << w << ")"); + cudnnStatus_t result = ::cudnnSetFilter4dDescriptor(filterDesc, dataType, format, + k, c, h, w); + CUDA_DEBUG_LOG("cudnnSetFilter4dDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnCreateConvolutionDescriptor( + cudnnConvolutionDescriptor_t* convDesc) { + CUDA_DEBUG_LOG("cudnnCreateConvolutionDescriptor(convDesc=" + << PtrToStr(convDesc) << ")"); + cudnnStatus_t result = ::cudnnCreateConvolutionDescriptor(convDesc); + CUDA_DEBUG_LOG("cudnnCreateConvolutionDescriptor -> " << cudnnGetErrorString(result) + << ", *convDesc=" << PtrToStr(*convDesc)); + return result; +} + +inline cudnnStatus_t cudnnDestroyConvolutionDescriptor( + cudnnConvolutionDescriptor_t 
convDesc) { + CUDA_DEBUG_LOG("cudnnDestroyConvolutionDescriptor(convDesc=" + << PtrToStr(convDesc) << ")"); + cudnnStatus_t result = ::cudnnDestroyConvolutionDescriptor(convDesc); + CUDA_DEBUG_LOG("cudnnDestroyConvolutionDescriptor -> " + << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, + int pad_h, int pad_w, int u, int v, int dilation_h, int dilation_w, + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + CUDA_DEBUG_LOG("cudnnSetConvolution2dDescriptor(convDesc=" << PtrToStr(convDesc) + << ", pad_h=" << pad_h << ", pad_w=" << pad_w + << ", u=" << u << ", v=" << v + << ", dilation_h=" << dilation_h << ", dilation_w=" << dilation_w + << ", mode=" << static_cast(mode) + << ", computeType=" << static_cast(computeType) << ")"); + cudnnStatus_t result = ::cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, + u, v, dilation_h, + dilation_w, mode, + computeType); + CUDA_DEBUG_LOG("cudnnSetConvolution2dDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + CUDA_DEBUG_LOG("cudnnSetConvolutionMathType(convDesc=" << PtrToStr(convDesc) + << ", mathType=" << static_cast(mathType) << ")"); + cudnnStatus_t result = ::cudnnSetConvolutionMathType(convDesc, mathType); + CUDA_DEBUG_LOG("cudnnSetConvolutionMathType -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnCreateActivationDescriptor( + cudnnActivationDescriptor_t* activationDesc) { + CUDA_DEBUG_LOG("cudnnCreateActivationDescriptor(activationDesc=" + << PtrToStr(activationDesc) << ")"); + cudnnStatus_t result = ::cudnnCreateActivationDescriptor(activationDesc); + CUDA_DEBUG_LOG("cudnnCreateActivationDescriptor -> " << cudnnGetErrorString(result) + << ", *activationDesc=" << PtrToStr(*activationDesc)); + return result; +} + +inline cudnnStatus_t cudnnDestroyActivationDescriptor( + cudnnActivationDescriptor_t activationDesc) { + CUDA_DEBUG_LOG("cudnnDestroyActivationDescriptor(activationDesc=" + << PtrToStr(activationDesc) << ")"); + cudnnStatus_t result = ::cudnnDestroyActivationDescriptor(activationDesc); + CUDA_DEBUG_LOG("cudnnDestroyActivationDescriptor -> " + << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) { + CUDA_DEBUG_LOG("cudnnSetActivationDescriptor(activationDesc=" + << PtrToStr(activationDesc) + << ", mode=" << static_cast(mode) + << ", reluNanOpt=" << static_cast(reluNanOpt) + << ", coef=" << coef << ")"); + cudnnStatus_t result = ::cudnnSetActivationDescriptor(activationDesc, mode, + reluNanOpt, coef); + CUDA_DEBUG_LOG("cudnnSetActivationDescriptor -> " << cudnnGetErrorString(result)); + return result; +} + +inline cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t* sizeInBytes) { + CUDA_DEBUG_LOG("cudnnGetConvolutionForwardWorkspaceSize(handle=" + << PtrToStr(handle) << ", algo=" << static_cast(algo) << ")"); + cudnnStatus_t result = ::cudnnGetConvolutionForwardWorkspaceSize( + handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); + 
CUDA_DEBUG_LOG("cudnnGetConvolutionForwardWorkspaceSize -> " + << cudnnGetErrorString(result) + << ", *sizeInBytes=" << *sizeInBytes); + return result; +} + +template +inline cudnnStatus_t cudnnConvolutionForward( + cudnnHandle_t handle, + const T1* alpha, + const cudnnTensorDescriptor_t xDesc, const T2* x, + const cudnnFilterDescriptor_t wDesc, const T3* w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + T4* workSpace, size_t workSpaceSizeInBytes, + const T1* beta, + const cudnnTensorDescriptor_t yDesc, T5* y) { + CUDA_DEBUG_LOG("cudnnConvolutionForward(handle=" << PtrToStr(handle) + << ", algo=" << static_cast(algo) + << ", workSpaceSizeInBytes=" << workSpaceSizeInBytes << ")"); + cudnnStatus_t result = ::cudnnConvolutionForward(handle, alpha, xDesc, x, + wDesc, w, convDesc, algo, + workSpace, workSpaceSizeInBytes, + beta, yDesc, y); + CUDA_DEBUG_LOG("cudnnConvolutionForward -> " << cudnnGetErrorString(result)); + return result; +} + +template +inline cudnnStatus_t cudnnConvolutionBiasActivationForward( + cudnnHandle_t handle, + const T1* alpha1, + const cudnnTensorDescriptor_t xDesc, const T2* x, + const cudnnFilterDescriptor_t wDesc, const T3* w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + T4* workSpace, size_t workSpaceSizeInBytes, + const T1* alpha2, + const cudnnTensorDescriptor_t zDesc, const T5* z, + const cudnnTensorDescriptor_t biasDesc, const T6* bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, T7* y) { + CUDA_DEBUG_LOG("cudnnConvolutionBiasActivationForward(handle=" << PtrToStr(handle) + << ", algo=" << static_cast(algo) + << ", workSpaceSizeInBytes=" << workSpaceSizeInBytes << ")"); + cudnnStatus_t result = ::cudnnConvolutionBiasActivationForward( + handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, + yDesc, y); + CUDA_DEBUG_LOG("cudnnConvolutionBiasActivationForward -> " + << cudnnGetErrorString(result)); + return result; +} + +template +inline cudnnStatus_t cudnnActivationForward( + cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const T1* alpha, + const cudnnTensorDescriptor_t xDesc, const T2* x, + const T1* beta, + const cudnnTensorDescriptor_t yDesc, T3* y) { + CUDA_DEBUG_LOG("cudnnActivationForward(handle=" << PtrToStr(handle) << ")"); + cudnnStatus_t result = ::cudnnActivationForward(handle, activationDesc, alpha, + xDesc, x, beta, yDesc, y); + CUDA_DEBUG_LOG("cudnnActivationForward -> " << cudnnGetErrorString(result)); + return result; +} + +template +inline cudnnStatus_t cudnnAddTensor( + cudnnHandle_t handle, + const T1* alpha, + const cudnnTensorDescriptor_t aDesc, const T2* A, + const T1* beta, + const cudnnTensorDescriptor_t cDesc, T3* C) { + CUDA_DEBUG_LOG("cudnnAddTensor(handle=" << PtrToStr(handle) << ")"); + cudnnStatus_t result = ::cudnnAddTensor(handle, alpha, aDesc, A, beta, cDesc, C); + CUDA_DEBUG_LOG("cudnnAddTensor -> " << cudnnGetErrorString(result)); + return result; +} + +#endif // USE_CUDNN + +// Kernel Launch Logging Macros +// These macros wrap CUDA kernel launches to log execution parameters + +#if CUDA_WRAPPER_DEBUG +// Log kernel launch with grid, block, shared memory, and stream information +#define CUDA_KERNEL_LAUNCH_LOG(kernel_name, grid_dim, block_dim, shared_mem, stream_ptr) \ + do { \ + dim3 _grid = (grid_dim); \ + dim3 _block = (block_dim); \ + LOGFILE << "[CUDA_WRAPPER] Kernel launch: " << #kernel_name \ + << " 
grid=(" << _grid.x << "," << _grid.y << "," << _grid.z << ")" \ + << " block=(" << _block.x << "," << _block.y << "," << _block.z << ")" \ + << " smem=" << (shared_mem) \ + << " stream=" << PtrToStr(stream_ptr); \ + } while(0) +#else +#define CUDA_KERNEL_LAUNCH_LOG(kernel_name, grid_dim, block_dim, shared_mem, stream_ptr) do {} while(0) +#endif + +} // namespace cudnn_backend +} // namespace lczero diff --git a/src/neural/backends/cuda/fp16_kernels.cu b/src/neural/backends/cuda/fp16_kernels.cu index 37827ba0eb..ca139cee50 100644 --- a/src/neural/backends/cuda/fp16_kernels.cu +++ b/src/neural/backends/cuda/fp16_kernels.cu @@ -142,6 +142,7 @@ bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, // TODO: Think of more elegant way to avoid this hardcoding :-/ if (numFc1Out == 16) { if (C == 64) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_64_16, N, C, 0, stream); SE_Layer_NHWC<64, 16><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else { @@ -150,24 +151,31 @@ bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, } } else if (numFc1Out == 32) { if (C == 64) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_64_32, N, C, 0, stream); SE_Layer_NHWC<64, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 128) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_128_32, N, C, 0, stream); SE_Layer_NHWC<128, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 192) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_192_32, N, C, 0, stream); SE_Layer_NHWC<192, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 256) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_256_32, N, C, 0, stream); SE_Layer_NHWC<256, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 320) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_320_32, N, C, 0, stream); SE_Layer_NHWC<320, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 352) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_352_32, N, C, 0, stream); SE_Layer_NHWC<352, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 384) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_384_32, N, C, 0, stream); SE_Layer_NHWC<384, 32><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else { @@ -176,21 +184,27 @@ bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, } } else if (numFc1Out == 64) { if (C == 64) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_64_64, N, C, 0, stream); SE_Layer_NHWC<64, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 128) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_128_64, N, C, 0, stream); SE_Layer_NHWC<128, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 192) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_192_64, N, C, 0, stream); SE_Layer_NHWC<192, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 256) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_256_64, N, C, 0, stream); SE_Layer_NHWC<256, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 320) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_320_64, N, C, 0, stream); SE_Layer_NHWC<320, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else if (C == 384) { + CUDA_KERNEL_LAUNCH_LOG(SE_Layer_NHWC_384_64, N, C, 0, stream); SE_Layer_NHWC<384, 64><<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); } else { @@ -443,6 +457,7 @@ void OutputInputTransform(int N, int C, int 
se_K, T* output, const T* input, // Each thread processes entire chess board. if (use_se == false) { dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1); + CUDA_KERNEL_LAUNCH_LOG(OutputTransform_relu_InputTransform_kernel, grid_dim, kOpInpTransformBlockSize, 0, stream); OutputTransform_relu_InputTransform_kernel <<>>(N, C, output, input, @@ -455,6 +470,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, OutputInputTransformKernel_fp16_shmem_board, cudaFuncAttributeMaxDynamicSharedMemorySize, 72 * C * sizeof(half)); + CUDA_KERNEL_LAUNCH_LOG(OutputInputTransformKernel_fp16_shmem_board, N, C, 72 * C * sizeof(half), stream); OutputInputTransformKernel_fp16_shmem_board <<>>( @@ -466,6 +482,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, "of filters\n"); } } else { + CUDA_KERNEL_LAUNCH_LOG(OutputTransform_SE_relu_InputTransform_kernel, N, C, 0, stream); OutputTransform_SE_relu_InputTransform_kernel <<>>(N, C, se_K, output, input, (half*)skip, bias, w1,
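
For reference, the pattern this patch applies at every launch site is sketched below. This sketch is not part of the diff: the kernel, the surrounding function, and the cuda_common.h include path are hypothetical, used only to illustrate the intended usage. The logging macro is placed immediately before the launch with the same grid, block, shared-memory, and stream arguments, and it compiles to a no-op unless the backend is built with CUDA_WRAPPER_DEBUG set to a non-zero value (for example -DCUDA_WRAPPER_DEBUG=1).

// Minimal usage sketch (illustrative only; not part of the patch).
#include "cuda_common.h"  // assumed include path; pulls in cuda_wrapper.h

namespace lczero {
namespace cudnn_backend {

// Hypothetical elementwise kernel, used only to show the logging pattern.
__global__ void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void scaleTensor(float* data, float factor, int n, cudaStream_t stream) {
  const int kBlockSize = 256;
  int blocks = DivUp(n, kBlockSize);
  // Log the launch configuration, then launch with identical parameters.
  CUDA_KERNEL_LAUNCH_LOG(scale_kernel, blocks, kBlockSize, 0, stream);
  scale_kernel<<<blocks, kBlockSize, 0, stream>>>(data, factor, n);
  ReportCUDAErrors(cudaGetLastError());
}

}  // namespace cudnn_backend
}  // namespace lczero

Because the macro only writes a LOGFILE line and does not modify the launch itself, enabling CUDA_WRAPPER_DEBUG changes no numerical behaviour; the cost is one log line per kernel launch.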