Commit 1256074

cleanup

amarin16 committed Oct 22, 2024
1 parent 1b64388 commit 1256074
Showing 3 changed files with 0 additions and 147 deletions.
88 changes: 0 additions & 88 deletions .vscode/settings.json
@@ -22,93 +22,5 @@
"-build/include_subdir",
"-runtime/references"
],
<<<<<<< HEAD
"files.associations": {
"span": "cpp",
"*.inc": "cpp",
"numeric": "cpp",
"span_ext": "cpp",
"string_span": "cpp",
"array": "cpp",
"deque": "cpp",
"forward_list": "cpp",
"list": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"unordered_set": "cpp",
"vector": "cpp",
"string_view": "cpp",
"hash_map": "cpp",
"initializer_list": "cpp",
"regex": "cpp",
"valarray": "cpp",
"bitset": "cpp",
"utility": "cpp",
"cmath": "cpp",
"pointers": "cpp",
"any": "cpp",
"atomic": "cpp",
"bit": "cpp",
"*.tcc": "cpp",
"cctype": "cpp",
"cfenv": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"cinttypes": "cpp",
"clocale": "cpp",
"codecvt": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"csignal": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"map": "cpp",
"set": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"fstream": "cpp",
"future": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"scoped_allocator": "cpp",
"semaphore": "cpp",
"shared_mutex": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"typeindex": "cpp",
"typeinfo": "cpp",
"variant": "cpp"
}
=======
"C_Cpp.autoAddFileAssociations": false
>>>>>>> main
}
21 changes: 0 additions & 21 deletions onnxruntime/core/providers/cuda/math/matmul.cc
@@ -245,19 +245,6 @@ Status ComputeUsingFp8(OpKernelContext* ctx, MatMulComputeHelper& helper, cudaS
 // https://docs.nvidia.com/cuda/cublas/index.html?highlight=cublasLtMatmul#cublasltmatmul
 float beta = 0;
 
-// TODO delete
-const int left_X_num_elems = left_X->SizeInBytes() / sizeof(MLFloat16);
-const int right_X_num_elems = right_X->SizeInBytes() / sizeof(MLFloat16);
-printf("\nPrinting tensor data for left_X:\n");
-PrintTensorData<MLFloat16>(stream, left_X->DataRaw(), left_X_num_elems, 4);
-printf("\nPrinting tensor data for right_X:\n");
-PrintTensorData<MLFloat16>(stream, right_X->DataRaw(), right_X_num_elems, 4);
-
-printf("\nPrinting tensor data for left_X_fp8:\n");
-PrintTensorData<Float8E4M3FN>(stream, left_X_fp8.get(), left_X_num_elems, 4);
-printf("\nPrinting tensor data for right_X_fp8 tranposed:\n");
-PrintTensorData<Float8E4M3FN>(stream, right_X_fp8.get(), right_X_num_elems, 4);
-
 const void* p_input_a = right_X_fp8.get();
 const void* p_input_b = left_X_fp8.get();
 const void* p_input_c = C.get();
@@ -334,9 +321,6 @@ PrintTensorData<Float8E4M3FN>(stream, right_X_fp8.get(), right_X_num_elems, 4);
 return status;
 float scale_y = 1.0f;
 
-// TODO delete
-printf("scale_a = %f, scale_b = %f\n", scale_a, scale_b);
-
 // Create the on device scale values.
 IAllocatorUniquePtr<void> p_scale_a = IAllocator::MakeUniquePtr<void>(allocator, sizeof(float), false, ctx->GetComputeStream());
 IAllocatorUniquePtr<void> p_scale_b = IAllocator::MakeUniquePtr<void>(allocator, sizeof(float), false, ctx->GetComputeStream());
@@ -382,8 +366,6 @@ CUBLAS_RETURN_IF_ERROR(cublasLtMatrixLayoutCreate(&Cdesc, y_cuda_type, N, M, ldc
 cublasStatus_t cuda_status = cublasLtMatmulAlgoGetHeuristic(
 cublasLt, operationDesc, Adesc, Bdesc, Cdesc, Ydesc, preference, 1, &heuristicResult, &returnedResults);
 
-printf("returnedResults = %d, cuda_status == CUBLAS_STATUS_SUCCESS = %d\n", returnedResults, cuda_status == CUBLAS_STATUS_SUCCESS);
-
 int n_inputs = ctx->InputCount();
 ORT_ENFORCE(
 returnedResults > 0 && cuda_status == CUBLAS_STATUS_SUCCESS,
@@ -429,9 +411,6 @@ printf("returnedResults = %d, cuda_status == CUBLAS_STATUS_SUCCESS = %d\n", retu
 workspaceSize,
 stream); /* stream */
 
-printf("\nPrinting tensor data for Y:\n");
-PrintTensorData<MLFloat16>(stream, Y->DataRaw(), M * N, M * N);
-
 ORT_ENFORCE(
 cuda_status == CUBLAS_STATUS_SUCCESS,
 " Unable to run cublasLtMatmul due to ", onnxruntime::cuda::cublasGetErrorEnum(cuda_status),
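Every line removed from matmul.cc above is a one-off printf / PrintTensorData diagnostic. If similar dumps are needed during future FP8 work, a compile-time guard keeps them out of release builds without requiring another cleanup commit. A minimal sketch, reusing the PrintTensorData utility from matmul_utils.cu; the MATMUL_FP8_DEBUG flag and FP8_DEBUG_DUMP macro are hypothetical, not part of the repository:

// Hypothetical guard: diagnostics compile away unless MATMUL_FP8_DEBUG is defined.
#ifdef MATMUL_FP8_DEBUG
#define FP8_DEBUG_DUMP(T, stream, data, num_elems, last_index) \
  PrintTensorData<T>((stream), (data), (num_elems), (last_index))
#else
#define FP8_DEBUG_DUMP(T, stream, data, num_elems, last_index) ((void)0)
#endif

// Usage at one of the call sites this commit deleted:
// FP8_DEBUG_DUMP(MLFloat16, stream, left_X->DataRaw(), left_X_num_elems, 4);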
38 changes: 0 additions & 38 deletions onnxruntime/core/providers/cuda/math/matmul_utils.cu
@@ -13,44 +13,11 @@
 namespace onnxruntime {
 namespace cuda {
 
-// namespace {
-
-// constexpr int kElementsPerThread = GridDim::maxElementsPerThread;
-
-// } // namespace
-
 template <typename CudaFp16T, typename CudaFp8T>
 __global__ void MLFloat16ToFloat8E4M3FNKernel(const CudaFp16T* src_data, CudaFp8T* dest_data, int num_elems)
 {
-// const auto kElementsPerBlock = kElementsPerThread * blockDim.x;
-// const auto input_base_idx = kElementsPerBlock * blockIdx.x + threadIdx.x;
-// const auto element_stride = blockDim.x;
-
-// CudaFp16T local_src[kElementsPerThread];
-
-// {
-// auto input_idx = input_base_idx;
-// #pragma unroll
-// for (int element_idx = 0; element_idx < kElementsPerThread; ++element_idx) {
-// local_src[element_idx] = src_data[input_idx];
-// input_idx += element_stride;
-// }
-// }
-
-// {
-// auto input_idx = input_base_idx;
-// #pragma unroll
-// for (int element_idx = 0; element_idx < kElementsPerThread; ++element_idx) {
-// // if (input_idx < num_elems) {
-// dest_data[input_idx] = CudaFp8T(src_data[element_idx]);
-// input_idx += element_stride;
-// // }
-// }
-// }
-
 int i = blockIdx.x * blockDim.x + threadIdx.x;
 if (i < num_elems) {
-// dest_data[i] = CudaFp8T(src_data[i]);
 dest_data[i] = CudaFp8T(static_cast<unsigned char>(__nv_cvt_halfraw_to_fp8(src_data[i], __NV_SATFINITE, __NV_E4M3)),
 CudaFp8T::FromBits());
 }
@@ -65,13 +32,10 @@ Status MLFloat16ToFloat8E4M3FN(cudaStream_t stream, const Tensor* src, void* des
 typedef typename ToCudaType<Float8E4M3FN>::MappedType CudaFp8T;
 CudaFp8T* dest_data = reinterpret_cast<CudaFp8T*>(dest);
 
-// TODO optimize using: constexpr int kElementsPerThread = GridDim::maxElementsPerThread;
-// https://github.com/microsoft/onnxruntime/blob/7df8776322bc66bda9bb1bff1502fcceb8596efc/onnxruntime/contrib_ops/cuda/math/bias_gelu_impl.cu#L16
 int num_elems = src->SizeInBytes() / sizeof(MLFloat16);
 int blocks_per_grid = static_cast<int>((num_elems + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
 int threads_per_block = GridDim::maxThreadsPerBlock;
 MLFloat16ToFloat8E4M3FNKernel<CudaFp16T, CudaFp8T><<<blocks_per_grid, threads_per_block, 0, stream>>>(
-// MLFloat16ToFloat8E4M3FNKernel<CudaFp16T, CudaFp16T><<<blocks_per_grid, threads_per_block, 0, stream>>>(
 src_data, dest_data, num_elems);
 
 CUDA_RETURN_IF_ERROR(cudaGetLastError());
@@ -80,7 +44,6 @@ Status MLFloat16ToFloat8E4M3FN(cudaStream_t stream, const Tensor* src, void* des
 }
 
 
-
 template <typename CudaT>
 __global__ void ComputeStdDevCoefficientsForScaleKernel(const CudaT* tensor_data, CudaT* d_scale_coef)
 {
@@ -116,7 +79,6 @@ Status ComputeStdDevCoefficientsForScale(cudaStream_t stream, const Tensor* tens
 }
 
 
-
 // Debugging utility that prints values for all indexes < last_index.
 // If last_index is -1, then it prints values for all indexes.
 template <typename CudaT>

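The TODO removed from MLFloat16ToFloat8E4M3FN, together with the commented-out draft deleted from the kernel body above, pointed at the elements-per-thread layout used in bias_gelu_impl.cu. A minimal sketch of that pattern follows; kElementsPerThread (assumed to come from GridDim::maxElementsPerThread), the kernel name, and the launch math are illustrative assumptions. Note that the deleted draft had its bounds check commented out and indexed src_data by element_idx where local_src[element_idx] was apparently intended; the sketch keeps the check, which the final partial block needs.

// Sketch only: vectorized variant of MLFloat16ToFloat8E4M3FNKernel in which each
// thread converts kElementsPerThread elements. Names and launch math are assumptions.
template <typename CudaFp16T, typename CudaFp8T, int kElementsPerThread>
__global__ void MLFloat16ToFloat8E4M3FNVectorizedKernel(const CudaFp16T* src_data, CudaFp8T* dest_data, int num_elems) {
  const int elements_per_block = kElementsPerThread * blockDim.x;
  int idx = elements_per_block * blockIdx.x + threadIdx.x;
#pragma unroll
  for (int e = 0; e < kElementsPerThread; ++e) {
    if (idx < num_elems) {  // bounds check keeps the tail block in range
      // Same saturating half -> e4m3 conversion as the one-element-per-thread kernel.
      dest_data[idx] = CudaFp8T(static_cast<unsigned char>(__nv_cvt_halfraw_to_fp8(src_data[idx], __NV_SATFINITE, __NV_E4M3)),
                                CudaFp8T::FromBits());
    }
    idx += blockDim.x;  // stride by blockDim.x so each warp's accesses stay coalesced
  }
}

// Launch sketch: one block now covers kElementsPerThread * threads_per_block elements.
// const int elements_per_block = GridDim::maxElementsPerThread * GridDim::maxThreadsPerBlock;
// int blocks_per_grid = (num_elems + elements_per_block - 1) / elements_per_block;
// MLFloat16ToFloat8E4M3FNVectorizedKernel<CudaFp16T, CudaFp8T, GridDim::maxElementsPerThread>
//     <<<blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream>>>(src_data, dest_data, num_elems);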