
Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories.

base repository: NVIDIA/TransformerEngine
base: 4a6ac72d54b21b620b2cc09cb0c3e69f754767a6
..
head repository: NVIDIA/TransformerEngine
compare: 9063a52dc3f44afdc7f8e676abe6a901ca4258cc
Showing with 4,039 additions and 854 deletions.
  1. +1 −0 .github/workflows/trigger-ci.yml
  2. +1 −0 .gitignore
  3. +1 −1 docs/examples/attention/attention.ipynb
  4. 0 qa/L0_cppunittest/test.sh
  5. +1 −0 qa/L0_pytorch_unittest/test.sh
  6. +2 −0 tests/cpp/operator/CMakeLists.txt
  7. +4 −0 tests/cpp/operator/test_cast.cu
  8. +214 −0 tests/cpp/operator/test_cast_current_scaling.cu
  9. +4 −0 tests/cpp/operator/test_cast_transpose.cu
  10. +210 −0 tests/cpp/operator/test_cast_transpose_current_scaling.cu
  11. +14 −14 tests/cpp/test_common.cu
  12. +4 −0 tests/cpp/test_common.h
  13. +123 −2 tests/pytorch/distributed/run_numerics.py
  14. +3 −1 tests/pytorch/distributed/test_numerics.py
  15. +105 −0 tests/pytorch/references/ref_per_tensor_cs.py
  16. +44 −16 tests/pytorch/test_cpu_offloading.py
  17. +802 −0 tests/pytorch/test_float8_current_scaling_exact.py
  18. +119 −1 tests/pytorch/test_float8tensor.py
  19. +7 −0 tests/pytorch/test_numerics.py
  20. +1 −0 tests/pytorch/test_recipe.py
  21. +50 −0 tests/pytorch/test_sanity.py
  22. +1 −0 transformer_engine/common/CMakeLists.txt
  23. +4 −2 transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
  24. +33 −16 transformer_engine/common/common.h
  25. +23 −0 transformer_engine/common/include/transformer_engine/recipe.h
  26. +109 −3 transformer_engine/common/include/transformer_engine/transformer_engine.h
  27. +94 −0 transformer_engine/common/recipe/__init__.py
  28. +237 −0 transformer_engine/common/recipe/current_scaling.cu
  29. +79 −5 transformer_engine/common/transformer_engine.cpp
  30. +4 −1 transformer_engine/common/transpose/cast_transpose.cu
  31. +16 −10 transformer_engine/common/util/cast_kernels.cuh
  32. +1 −1 transformer_engine/common/utils.cuh
  33. +232 −575 transformer_engine/pytorch/attention.py
  34. +10 −0 transformer_engine/pytorch/constants.py
  35. +78 −19 transformer_engine/pytorch/cpu_offload.py
  36. +9 −2 transformer_engine/pytorch/csrc/common.cpp
  37. +26 −0 transformer_engine/pytorch/csrc/common.h
  38. +30 −1 transformer_engine/pytorch/csrc/extensions/activation.cpp
  39. +23 −0 transformer_engine/pytorch/csrc/extensions/cast.cpp
  40. +10 −1 transformer_engine/pytorch/csrc/extensions/gemm.cpp
  41. +88 −74 transformer_engine/pytorch/csrc/extensions/normalization.cpp
  42. +3 −0 transformer_engine/pytorch/csrc/extensions/pybind.cpp
  43. +117 −0 transformer_engine/pytorch/csrc/extensions/quantizer.cpp
  44. +1 −1 transformer_engine/pytorch/csrc/extensions/swizzle.cpp
  45. +2 −1 transformer_engine/pytorch/csrc/extensions/type_converters.cpp
  46. +11 −4 transformer_engine/pytorch/csrc/pybind.h
  47. +13 −5 transformer_engine/pytorch/distributed.py
  48. +457 −0 transformer_engine/pytorch/dot_product_attention/utils.py
  49. +62 −8 transformer_engine/pytorch/fp8.py
  50. +13 −1 transformer_engine/pytorch/module/base.py
  51. +3 −0 transformer_engine/pytorch/module/grouped_linear.py
  52. +95 −8 transformer_engine/pytorch/module/layernorm_linear.py
  53. +172 −34 transformer_engine/pytorch/module/layernorm_mlp.py
  54. +99 −7 transformer_engine/pytorch/module/linear.py
  55. +1 −8 transformer_engine/pytorch/tensor/_internal/float8_tensor_base.py
  56. +1 −8 transformer_engine/pytorch/tensor/_internal/mxfp8_tensor_base.py
  57. +166 −14 transformer_engine/pytorch/tensor/float8_tensor.py
  58. +0 −9 transformer_engine/pytorch/tensor/mxfp8_tensor.py
  59. +1 −1 transformer_engine/pytorch/tensor/quantized_tensor.py
  60. +5 −0 transformer_engine/pytorch/transformer.py
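
Taken together, the files listed above add FP8 per-tensor ("current") scaling support: new CUDA kernels and C++ tests for the cast and cast-transpose paths, a new recipe implementation under transformer_engine/common/recipe, and PyTorch-side plumbing in fp8.py and the modules. A minimal usage sketch, assuming the new recipe is exposed as Float8CurrentScaling (the class name is inferred from the recipe module changes listed above and should be treated as an assumption, not something this diff confirms):

```python
# Hypothetical sketch of enabling the per-tensor (current) scaling recipe added
# in this change set. The recipe class name Float8CurrentScaling is an assumption
# inferred from the transformer_engine/common/recipe changes listed above.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

model = te.Linear(1024, 1024, bias=True).cuda()
inp = torch.randn(16, 1024, device="cuda")

fp8_recipe = recipe.Float8CurrentScaling()  # assumed class name
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = model(inp)
out.sum().backward()
```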
1 change: 1 addition & 0 deletions .github/workflows/trigger-ci.yml
@@ -43,6 +43,7 @@ jobs:
|| github.actor == 'youngeunkwon0405'
|| github.actor == 'KshitijLakhani'
|| github.actor == 'jberchtold-nvidia'
|| github.actor == 'sanandaraj5597'
|| github.actor == 'negvet'
)
steps:
1 change: 1 addition & 0 deletions .gitignore
@@ -38,3 +38,4 @@ downloads/
.pytest_cache/
compile_commands.json
.nfs
tensor_dumps/
2 changes: 1 addition & 1 deletion docs/examples/attention/attention.ipynb
@@ -458,7 +458,7 @@
" </tr>\n",
"</table>\n",
"\n",
"Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n",
"Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.dot_product_attention.utils.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n",
"\n",
"<div class=\"alert alert-info\">\n",
"<b>Note</b>\n",
Empty file modified qa/L0_cppunittest/test.sh
100644 → 100755
Empty file.
1 change: 1 addition & 0 deletions qa/L0_pytorch_unittest/test.sh
@@ -24,6 +24,7 @@ pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/test_cpu_offloading.py || FAIL=1
NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 pytest -o log_cli=true --log-cli-level=INFO -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || FAIL=1

exit $FAIL
2 changes: 2 additions & 0 deletions tests/cpp/operator/CMakeLists.txt
@@ -4,6 +4,7 @@

add_executable(test_operator
test_cast.cu
test_cast_current_scaling.cu
test_cast_dbias.cu
test_cast_dbias_dgelu.cu
test_cast_gated_swiglu.cu
@@ -13,6 +14,7 @@ add_executable(test_operator
test_dequantize_mxfp8.cu
test_transpose.cu
test_cast_transpose.cu
test_cast_transpose_current_scaling.cu
test_cast_transpose_dbias.cu
test_cast_transpose_dbias_dgelu.cu
test_cast_transpose_dgeglu.cu
4 changes: 4 additions & 0 deletions tests/cpp/operator/test_cast.cu
@@ -35,6 +35,8 @@ void compute_ref(const InputType *data, OutputType *output_c,
*amax = current_max;
}


// delayed tensor scaling test
template <typename InputType, typename OutputType>
void performTest(const std::vector<size_t>& shape) {
using namespace test;
@@ -55,6 +57,7 @@ void performTest(const std::vector<size_t>& shape) {
nvte_quantize(input.data(), output_c.data(), 0);

float ref_amax;

compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
full_size, &ref_amax, output_c.scale());

@@ -105,6 +108,7 @@ TEST_P(CastTestSuite, TestCast) {

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
// delayed tensor scaling
performTest<InputType, OutputType>(size);
);
);
214 changes: 214 additions & 0 deletions tests/cpp/operator/test_cast_current_scaling.cu
@@ -0,0 +1,214 @@
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/

#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>

#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

#include <transformer_engine/cast.h>
#include <transformer_engine/recipe.h>
#include "../test_common.h"

using namespace transformer_engine;

namespace {

template <typename InputType, typename OutputType>
void compute_ref(const InputType *data, OutputType *output_c,
const size_t size,
float *amax, float scale) {
using compute_t = float;
compute_t current_max = -1e100;
for (size_t i = 0; i < size; ++i) {
compute_t current = static_cast<compute_t>(data[i]);
current_max = fmaxf(current_max, fabsf(current));
output_c[i] = OutputType(scale * current);
}
}


template <typename InputType, typename OutputType>
void compute_amax_scale_ref(const InputType *data,
const size_t size,
float *amax_ptr, float *scale_ptr, float* scale_inv_ptr,
float max_fp8, float epsilon) {
using compute_t = float;
compute_t current_max = -1e100;
for (size_t i = 0; i < size; ++i) {
compute_t current = static_cast<compute_t>(data[i]);
current_max = fmaxf(current_max, fabsf(current));
}
*amax_ptr = current_max;

// compute scale from amax
float clamp_amax = current_max;
if (current_max <= epsilon){
clamp_amax = epsilon;
}

float scale = 1.f;
float scale_inv = 1.f;

if (isinf(clamp_amax) || clamp_amax == 0.f) {
*scale_ptr = scale;
*scale_inv_ptr = scale_inv;
return;
}

// use ieee_div in CPU
scale = max_fp8 / clamp_amax;

// The amax is too small that the scale becoming infinite in FP32. In other word,
// the scale is not representable in FP32.
if (isinf(scale)) {
scale = std::numeric_limits<float>::max();
}

if (isnan(scale)) {
scale = 1.f;
}

scale_inv = 1.0f / scale;

*scale_ptr = scale;
*scale_inv_ptr = scale_inv;
}

// current tensor scaling test
template <typename InputType, typename OutputType>
void performTest(const std::vector<size_t>& shape) {
using namespace test;

const size_t full_size = product(shape);

DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;

bool is_out_fp8 = isFp8Type(otype);

// find out max fp8 value
float max_fp8;
if (is_out_fp8){
switch (otype) {
case DType::kFloat8E5M2: {
max_fp8 = Quantized_Limits<fp8e5m2>::max();
} break;
case DType::kFloat8E4M3: {
max_fp8 = Quantized_Limits<fp8e4m3>::max();
} break;
default:
NVTE_ERROR("Invalid type.");
}
}

Tensor input("input", shape, itype);
Tensor output_c("output_c", shape, otype, true, false);

std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(full_size);

fillUniform(&input);

// compute amax
float amax_to_check = 0.0f;
if (is_out_fp8){
nvte_compute_amax(input.data(), output_c.data(), 0);
QuantizationConfigWrapper config;
nvte_compute_scale_from_amax(output_c.data(), config, 0);
// avoid atomic amax update in cuda cast kernels because of current per-tensor scaling
amax_to_check = output_c.amax();
output_c.set_tensor_amax_nullptr();
}
nvte_quantize(input.data(), output_c.data(), 0);

float ref_amax;
float ref_scale;
float ref_scale_inv;
if (is_out_fp8){
compute_amax_scale_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(),
full_size, &ref_amax, &ref_scale, &ref_scale_inv, max_fp8, 0.0f);
}

compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );

cudaDeviceSynchronize();

auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
if (isFp8Type(otype)) {
auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, 0.0f, rtol_fp32);
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_c", output_c, ref_output_c.get(), true, 0.0f, rtol);
}

std::vector<std::vector<size_t>> test_cases = {
{16},
{16000},
{128, 128},
{256, 256},
{768, 1024},
{256, 65536},
{2048, 12288},
{65536, 128},
{65536, 160},
{16384, 1616},
{1, 128},
{1, 1296},
{1, 16},
{5, 160},
{5, 4, 3, 160},
{217, 256},
};
} // namespace

class CastCSTestSuite : public ::testing::TestWithParam<std::tuple<transformer_engine::DType,
transformer_engine::DType,
std::vector<size_t>>> {};

TEST_P(CastCSTestSuite, TestCastCS) {
using namespace transformer_engine;
using namespace test;

const DType input_type = std::get<0>(GetParam());
const DType output_type = std::get<1>(GetParam());
const auto size = std::get<2>(GetParam());

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
// current tensor scaling
performTest<InputType, OutputType>(size);
);
);
}



INSTANTIATE_TEST_SUITE_P(
OperatorTest,
CastCSTestSuite,
::testing::Combine(
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Values(DType::kFloat8E4M3, DType::kFloat8E5M2),
::testing::ValuesIn(test_cases)),
[](const testing::TestParamInfo<CastCSTestSuite::ParamType>& info) {
std::string name = test::typeName(std::get<0>(info.param)) + "X" +
test::typeName(std::get<1>(info.param));
const auto& shape = std::get<2>(info.param);
for ( const auto& s: shape) {
name += "X" + std::to_string(s);
}
return name;
});
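
Unlike the delayed-scaling tests above, which reuse a scale computed ahead of time from an amax history, the current-scaling test derives the scale from the amax of the tensor being quantized. The scale computation that compute_amax_scale_ref mirrors reduces to a few lines of arithmetic; a plain-Python restatement of the same clamping and overflow handling (with max_fp8 = 448 for E4M3 and 57344 for E5M2) is shown below as a worked example:

```python
import math

def scale_from_amax(amax: float, max_fp8: float, epsilon: float = 0.0):
    """Plain-Python restatement of compute_amax_scale_ref from the test above."""
    clamp_amax = max(amax, epsilon)
    scale, scale_inv = 1.0, 1.0
    if math.isinf(clamp_amax) or clamp_amax == 0.0:
        return scale, scale_inv           # degenerate amax: keep identity scaling
    scale = max_fp8 / clamp_amax
    if math.isinf(scale):                 # amax so small the scale overflows FP32
        scale = 3.4028234663852886e38     # FLT_MAX
    if math.isnan(scale):
        scale = 1.0
    scale_inv = 1.0 / scale
    return scale, scale_inv

# Example: an amax of 3.5 with the E4M3 maximum of 448 gives scale = 128, scale_inv = 1/128.
print(scale_from_amax(3.5, 448.0))
```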
4 changes: 4 additions & 0 deletions tests/cpp/operator/test_cast_transpose.cu
@@ -38,6 +38,8 @@ void compute_ref(const InputType *data, OutputType *output_c, OutputType *output
*amax = current_max;
}


// delayed tensor scaling test
template <typename InputType, typename OutputType>
void performTest(const size_t N, const size_t H) {
using namespace test;
@@ -75,6 +77,7 @@ void performTest(const size_t N, const size_t H) {
compareResults("output_t", output, ref_output_t.get(), false, atol, rtol);
}


std::vector<std::pair<size_t, size_t>> test_cases = {{2048, 12288},
{768, 1024},
{256, 65536},
@@ -101,6 +104,7 @@ TEST_P(CTTestSuite, TestCastTranspose) {

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
// delayed tensor scaling
performTest<InputType, OutputType>(size.first, size.second);
);
);