diff --git a/third_party/nvfuser/csrc/executor.cpp b/third_party/nvfuser/csrc/executor.cpp index b5b5289094fd..2ddbf6f025dd 100644 --- a/third_party/nvfuser/csrc/executor.cpp +++ b/third_party/nvfuser/csrc/executor.cpp @@ -236,6 +236,10 @@ void FusionExecutor::compileFusion( } } + if (isDebugDumpEnabled(DebugDumpOption::FusionDebug)) { + fusion->printDebug(); + } + if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { fusion->print(); } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) { diff --git a/third_party/nvfuser/csrc/expr_simplifier.cpp b/third_party/nvfuser/csrc/expr_simplifier.cpp index 53ea426a57c2..0c31a9ea664c 100644 --- a/third_party/nvfuser/csrc/expr_simplifier.cpp +++ b/third_party/nvfuser/csrc/expr_simplifier.cpp @@ -411,7 +411,9 @@ class FlattenedAssocCommOp : public Expr { return other_inputs.empty(); } - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; indent(ss, indent_size) << getOpString() << "("; bool needs_comma = false; @@ -426,7 +428,9 @@ class FlattenedAssocCommOp : public Expr { return ss.str(); } - std::string toInlineString(int = 0) const override { + std::string toInlineString( + int = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; ss << getOpString() << "("; bool needs_comma = false; diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp index 31bb763ac559..300effdf4590 100644 --- a/third_party/nvfuser/csrc/fusion.cpp +++ b/third_party/nvfuser/csrc/fusion.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -344,26 +345,150 @@ void Fusion::validateInputs() { } } -void Fusion::print() { +void Fusion::serialize(std::ostream& out, SerializationFormat fmt) { + FUSER_PERF_SCOPE("Fusion::serialize"); + + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + out << "\n%kernel {\n"; + IrMathPrinter op_exprs(out); + op_exprs.handle(this); + out << "\nTransformPrinter : \n"; + IrTransformPrinter t_exprs(out); + t_exprs.handle(this); + out << "}\n\n"; + break; + } + case SerializationFormat::Debug: { + break; + } + case SerializationFormat::EndOfOption: + break; + } +} + +void Fusion::printDebug(std::ostream& out) { + FUSER_PERF_SCOPE("Fusion::printDebug"); + + out << "Fusion DEBUG INFO {"; + out << "\n inputs_ = {"; + for (auto& it : inputs_) { + out << "\n " << it->toString(0, SerializationFormat::Debug); + } + out << " }\n"; + out << "\n outputs_ = {"; + for (auto& it : outputs_) { + out << "\n " << it->toString(0, SerializationFormat::Debug); + auto a = getOutputAlias(it); + if (a != nullptr) { + out << " ALIASES " << a->toString(0, SerializationFormat::NameOnly); + } + } + out << " }\n"; + out << "\n all_tv_uses_valid_ = " << all_tv_uses_valid_; + out << "\n is_during_update_uses_ = " << is_during_update_uses_; + out << "\n io_alias_ = {"; + for (auto& it : io_alias_) { // NOTE: ordering arbitrary + out << "\n " << it.first->toString(0, SerializationFormat::Debug) + << " => " << it.second->toString(0, SerializationFormat::Debug); + } + out << " }\n"; + out << "\n permuted_input_map_ = {"; + for (auto& it : permuted_input_map_) { // NOTE: ordering arbitrary + out << "\n " << it.first << " => " << it.second; + } + out << " }\n"; + out << "\n permuted_output_map_ = {"; + for (auto& it : permuted_output_map_) { // NOTE: ordering arbitrary + out << "\n " << it.first << " => " << it.second; + } + out << " }\n"; + + auto ind = " "; + out << ind << " expr_name_counter = " << expr_name_counter_; + out << ind << "\n vals_ (" << vals_.size() << ") = ["; + std::vector<std::string> valstrs; + std::vector<std::tuple<int, std::string, std::string>> all_logs; + for (auto& it : vals_) { // NOTE: ordering arbitrary + std::stringstream obss; + obss << it->toString(3, SerializationFormat::Debug); + auto log = it->getLogMessages(); + if (log.size() > 0) { + auto val_name = it->toString(0, SerializationFormat::NameOnly); + for (auto num_msg : + it->getLogMessages()) { // pair log sequence number & message + all_logs.push_back({num_msg.first, val_name, num_msg.second}); + } + } + valstrs.push_back(obss.str()); + } + std::sort(valstrs.begin(), valstrs.end()); + for (auto& it : valstrs) { // sorted + out << ind << "\n " << it; + } + out << ind << " ]\n"; + out << ind << "\n exprs_ (" << exprs_.size() << ") = [\n"; + std::vector<std::string> expstrs; + for (auto& it : exprs_) { // NOTE: ordering arbitrary + expstrs.push_back(it->toString(3, SerializationFormat::NameOnly)); + } + std::sort(expstrs.begin(), expstrs.end()); + for (auto& it : expstrs) { // sorted + out << ind << it; + } + out << ind << " ]\n"; + out << ind << "\n val_type_name_map_ (" << val_type_name_map_.size() + << ") = {"; + for (auto& it : val_type_name_map_) { // NOTE: ordering arbitrary + out << ind << "\n " << (int)it.first << " => " << it.second; + } + out << ind << " }\n"; + std::sort(all_logs.begin(), all_logs.end()); + out << ind << "\n Logged operations:"; +#ifdef NDEBUG + std::cerr << "WARNING: Fusion operations are only logged in Debug builds." + << std::endl; +#endif + // Actual lognum may be large if there are multiple Fusions defined in this + // process. Instead, just print a local counter for the log messages + // appearing in this Fusion's vals_. + int local_lognum = 0; + for (auto entry : all_logs) { + int lognum; + std::string val_name; + std::string msg; + std::tie(lognum, val_name, msg) = entry; + out << ind << "\n " << local_lognum++ << ") " << val_name << " : " + << msg; + } + out << "\n}\n"; +} + +void Fusion::print(std::ostream& out, SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::print"); FusionGuard fg(this); - std::cout << "\n%kernel {\n"; - IrMathPrinter op_exprs(std::cout); + out << "\n%kernel {\n"; + IrMathPrinter op_exprs(out, fmt); op_exprs.handle(this); - std::cout << "\nTransformPrinter : \n"; - IrTransformPrinter t_exprs(std::cout); + out << "\nTransformPrinter : \n"; + IrTransformPrinter t_exprs(out, fmt); t_exprs.handle(this); - std::cout << "}\n\n"; + out << "}\n\n"; }
", "This would require lowering during lowering."); - std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); + out << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); } std::unordered_map> Fusion::bankConflictInfo( @@ -380,38 +505,54 @@ std::unordered_map> Fusion::bankConflictInfo( return result; } -void Fusion::printMath(bool from_outputs_only) { +void Fusion::printMath( + bool from_outputs_only, + std::ostream& out, + SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::printMath"); - FusionGuard fg(this); - auto exprs_for_print = exprs(); - std::cout << "Inputs:" << std::endl; - for (auto inp : inputs()) { - std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl; - } + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion Math"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + auto exprs_for_print = exprs(); + out << "Inputs:" << std::endl; + for (auto inp : inputs()) { + out << " " << inp << ", " << inp->getDataType().value() << std::endl; + } - std::cout << "Outputs:" << std::endl; - for (auto out : outputs()) { - std::cout << " " << out << ", " << out->getDataType().value() << std::endl; - } + out << "Outputs:" << std::endl; + for (auto output : outputs()) { + out << " " << output << ", " << output->getDataType().value() + << std::endl; + } - // If we want everything in the fusion, grab all values without uses to - // traverse from. - if (!from_outputs_only) { - std::vector leaf_vals; - for (auto val : deterministic_vals()) { - if (val->uses().empty()) { - leaf_vals.push_back(val); + // If we want everything in the fusion, grab all values without uses to + // traverse from. + if (!from_outputs_only) { + std::vector leaf_vals; + for (auto val : deterministic_vals()) { + if (val->uses().empty()) { + leaf_vals.push_back(val); + } + } + exprs_for_print = StmtSort::getExprs(this, leaf_vals); } - } - exprs_for_print = StmtSort::getExprs(this, leaf_vals); - } - std::cout << "\n%kernel_math {\n"; - for (auto expr : exprs_for_print) { - std::cout << expr; + out << "\n%kernel_math {\n"; + for (auto expr : exprs_for_print) { + out << expr; + } + out << "}\n\n"; + break; + } + case SerializationFormat::Debug: + break; + case SerializationFormat::EndOfOption: + break; } - std::cout << "}\n\n"; } std::vector Fusion::inputsAndCreated() { @@ -427,12 +568,25 @@ std::vector Fusion::inputsAndCreated() { return result; } -void Fusion::printTransforms() { +void Fusion::printTransforms(std::ostream& out, SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::printTransforms"); - FusionGuard fg(this); - IrTransformPrinter t_exprs(std::cout); - t_exprs.handle(this); + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion Transforms"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + IrTransformPrinter t_exprs(out); + t_exprs.handle(this); + break; + } + case SerializationFormat::Debug: { + out << "DEBUG OUTPUT:" << std::endl; + } + case SerializationFormat::EndOfOption: + break; + } } void Fusion::registerVal(Val* val) { @@ -660,6 +814,12 @@ bool Fusion::isAliasCompatible(Val* left, Val* right) { } void Fusion::aliasOutputToInput(Val* output, Val* input) { + VAL_LOG_EXPLICIT( + output, + "Fusion::aliasOutputToInput", + output->toString(0, SerializationFormat::NameOnly), + input->toString(0, SerializationFormat::NameOnly), ); + // Because we could cast output when input is cast. 
TORCH_INTERNAL_ASSERT( !output->isFusionOutput(), diff --git a/third_party/nvfuser/csrc/fusion.h b/third_party/nvfuser/csrc/fusion.h index d8cef33fda0d..5cc867c66643 100644 --- a/third_party/nvfuser/csrc/fusion.h +++ b/third_party/nvfuser/csrc/fusion.h @@ -123,18 +123,36 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { //! Assert that all leaves found from outputs are registered as an input void validateInputs(); + //! Serialize in text or binary form using one of many formats + void serialize(std::ostream& out, SerializationFormat fmt); + + //! Deserialize from the given format + void deserialize(std::istream& in, SerializationFormat fmt); + + //! Print detailed debug information about this fusion to the console + void printDebug(std::ostream& out = std::cout); + //! Print this fusion to the console - void print(); + void print( + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Print Arith exprs //! \param from_outputs_only Only print exprs reachable from outputs - void printMath(bool from_outputs_only = true); + void printMath( + bool from_outputs_only = true, + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Print transformations used in fusion (can be very verbose) - void printTransforms(); + void printTransforms( + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Lower the fusion and print a kernel - void printKernel(DataType index_type = DataType::Int); + void printKernel( + DataType index_type = DataType::Int, + std::ostream& out = std::cout); //! Returns if this fusion is noop, for example, trivially forwarding inputs, //! or all outputs are size-0 tensors, etc. diff --git a/third_party/nvfuser/csrc/ir_base_nodes.cpp b/third_party/nvfuser/csrc/ir_base_nodes.cpp index fdc520109a96..01f8c7a0ad5a 100644 --- a/third_party/nvfuser/csrc/ir_base_nodes.cpp +++ b/third_party/nvfuser/csrc/ir_base_nodes.cpp @@ -60,12 +60,14 @@ bool Statement::lessThan(const Statement* stmt1, const Statement* stmt2) { return stmt1->name() < stmt2->name(); } -std::string Statement::toString(int indent_size) const { +std::string Statement::toString(int indent_size, SerializationFormat fmt) + const { TORCH_INTERNAL_ASSERT( false, "toString for IR node ", typeid(*this).name(), " is not defined"); } -std::string Statement::toInlineString(int indent_size) const { +std::string Statement::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_INTERNAL_ASSERT( false, "toInlineString for IR node ", diff --git a/third_party/nvfuser/csrc/ir_base_nodes.h b/third_party/nvfuser/csrc/ir_base_nodes.h index a3a03ea447be..190a200f9b99 100644 --- a/third_party/nvfuser/csrc/ir_base_nodes.h +++ b/third_party/nvfuser/csrc/ir_base_nodes.h @@ -166,9 +166,13 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { static bool lessThan(const Statement* stmt1, const Statement* stmt2); - virtual std::string toString(int indent_size = 0) const; + virtual std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const; - virtual std::string toInlineString(int indent_size = 0) const; + virtual std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const; virtual Statement* clone(IrCloner* ir_cloner) const; @@ -361,6 +365,37 @@ class TORCH_CUDA_CU_API Val : public Statement { void resolveIndexDtype(); + //! 
Get vector of log messages for this object + auto getLogMessages() const { + return log_messages_; + } + + //! Write a message to this object's log + void log(std::string op_name, std::vector arg_strings) { +#ifdef NDEBUG + static bool warned = false; + if (!warned) { + std::cout + << " WARNING: Logging is slow and should be disabled in Release builds!" + << std::endl; + warned = true; + } +#endif + static int op_num = 0; + std::stringstream ss; + ss << op_name << "("; + ss << this->toString(0, SerializationFormat::NameOnly) << "):"; + bool skip_comma = true; + for (auto a : arg_strings) { + if (!skip_comma) { + ss << ","; + } + ss << " " << a; + skip_comma = false; + } + log_messages_.push_back({op_num++, ss.str()}); + } + NVFUSER_DECLARE_CLONE protected: @@ -399,6 +434,9 @@ class TORCH_CUDA_CU_API Val : public Statement { // Expr evaluator idx; int evaluator_index_ = -1; + + //! Holds log messages + std::vector> log_messages_; }; //! A Val object that stores a plain data. Note that this class is only intended @@ -425,11 +463,11 @@ class TORCH_CUDA_CU_API Attribute : public Val { return false; } - virtual std::string toString(int) const override { + virtual std::string toString(int, SerializationFormat) const override { return Printer::toString(value); } - virtual std::string toInlineString(int) const override { + virtual std::string toInlineString(int, SerializationFormat) const override { return Printer::toString(value); } }; diff --git a/third_party/nvfuser/csrc/ir_interface_nodes.h b/third_party/nvfuser/csrc/ir_interface_nodes.h index bcdbf45571c2..8e1347016267 100644 --- a/third_party/nvfuser/csrc/ir_interface_nodes.h +++ b/third_party/nvfuser/csrc/ir_interface_nodes.h @@ -62,6 +62,7 @@ class TORCH_CUDA_CU_API Scalar : public Val { (c10::is_complex::value && isComplexType(dtype)), "Invalid data type: ", dtype); + VAL_LOG("Scalar::Scalar", "IrBuilderPasskey", typePrefix(dtype)); } explicit Scalar( @@ -78,14 +79,26 @@ class TORCH_CUDA_CU_API Scalar : public Val { (c10::is_complex::value && isComplexType(dtype)), "Invalid data type: ", dtype); + VAL_LOG( + "Scalar::Scalar", + "IrBuilderPasskey", + "UnderlyingType", + typePrefix(dtype)); } Scalar(const Scalar* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} + : Val(src, ir_cloner), maybe_value_(src->maybe_value_) { + VAL_LOG( + "Scalar::Scalar", + src->toString(0, SerializationFormat::NameOnly), + "IrCloner"); + } NVFUSER_DECLARE_CLONE - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; if (isSymbolic()) { ss << ir_utils::varName(this); @@ -125,13 +138,15 @@ class TORCH_CUDA_CU_API Scalar : public Val { return ss.str(); } - std::string toInlineString(int indent_size = 0) const override { + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { if (definition() != nullptr) { std::stringstream ss; ss << "( " << definition()->toInlineString(indent_size) << " )"; return ss.str(); } else { - return toString(indent_size); + return toString(indent_size, fmt); } } @@ -173,6 +188,16 @@ using ComplexDouble = Scalar>; //! computeAt position as needed during traversal, most inlined will increase //! the compute at position to maximum possible through traversal. 
enum class ComputeAtMode { Standard, BestEffort, MostInlined }; +inline std::string compute_at_mode_to_string(ComputeAtMode mode) { + switch (mode) { + case ComputeAtMode::Standard: + return "Standard"; + case ComputeAtMode::BestEffort: + return "BestEffort"; + case ComputeAtMode::MostInlined: + return "MostInlined"; + } +} class TransformPropagator; struct MostInlinedTransformPropagator; @@ -231,9 +256,13 @@ class TORCH_CUDA_CU_API TensorView : public Val { NVFUSER_DECLARE_CLONE - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorDomain* domain() const { return domain_; diff --git a/third_party/nvfuser/csrc/ir_internal_nodes.h b/third_party/nvfuser/csrc/ir_internal_nodes.h index aa07a7836d05..ebc4b1608c4c 100644 --- a/third_party/nvfuser/csrc/ir_internal_nodes.h +++ b/third_party/nvfuser/csrc/ir_internal_nodes.h @@ -42,8 +42,12 @@ class TORCH_CUDA_CU_API FullOp : public Expr { return "FullOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* getFillValue() const { return inputs().back(); @@ -67,8 +71,12 @@ class TORCH_CUDA_CU_API SelectOp : public Expr { return "SelectOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* getSelectAxis() const { return attribute(0)->as(); @@ -97,8 +105,12 @@ class TORCH_CUDA_CU_API IndexSelectOp : public Expr { return "IndexSelectOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* getSelectAxis() const { return attribute(0)->as(); @@ -125,8 +137,12 @@ class TORCH_CUDA_CU_API TorchGatherOp : public Expr { return "TorchGatherOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* lookupTv() const { return input(0)->as(); @@ -163,8 +179,12 @@ class TORCH_CUDA_CU_API ARangeOp : public Expr { return "ARangeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = 
SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; DataType dtype() const { return attribute(0)->as>()->value; @@ -213,8 +233,12 @@ class TORCH_CUDA_CU_API EyeOp : public Expr { return "EyeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; DataType dtype() const { return attribute(0)->as>()->value; @@ -242,8 +266,12 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -279,8 +307,12 @@ class TORCH_CUDA_CU_API BinaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -325,8 +357,12 @@ class TORCH_CUDA_CU_API TernaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -391,8 +427,12 @@ class TORCH_CUDA_CU_API RNGOp : public Expr { return "RNGOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; RNGOpType getRNGOpType() const { return attribute(0)->as>()->value.rtype; @@ -448,8 +488,12 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { return "BroadcastOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -495,8 +539,12 @@ class TORCH_CUDA_CU_API SqueezeOp : public Expr { return "SqueezeOp"; } - 
std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -543,8 +591,12 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { return "ReductionOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -589,8 +641,12 @@ class TORCH_CUDA_CU_API GroupedReductionOp : public Expr { return "GroupedReductionOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Number of expressions grouped horizontally. It does not reflect //! iteration grouping. @@ -781,8 +837,12 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return "WelfordOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return outputTriplet().avg(); @@ -876,8 +936,12 @@ class TORCH_CUDA_CU_API GroupedWelfordOp : public Expr { return "GroupedWelfordOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Number of expressions grouped horizontally. It does not reflect //! iteration grouping. 
As horizontal grouping is not supported, @@ -1017,8 +1081,12 @@ class TORCH_CUDA_CU_API MmaOp : public Expr { return "MmaOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1063,8 +1131,12 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr { return "TransposeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* out() const { return output(0)->as(); @@ -1097,8 +1169,12 @@ class TORCH_CUDA_CU_API ExpandOp : public Expr { return "ExpandOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* out() const { return output(0)->as(); @@ -1134,8 +1210,12 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { return "ShiftOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1183,8 +1263,12 @@ class TORCH_CUDA_CU_API GatherOp : public Expr { return "GatherOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1230,8 +1314,12 @@ class TORCH_CUDA_CU_API ViewAsScalar : public Expr { return "ViewAsScalar"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1264,8 +1352,12 @@ class TORCH_CUDA_CU_API ViewOp : public Expr { return "ViewOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const 
override; Val* out() const { return output(0); @@ -1294,8 +1386,12 @@ class TORCH_CUDA_CU_API LoadStoreOp : public Expr { return "LoadStoreOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1391,9 +1487,13 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool sameAs(const Statement* other) const override; - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Returns a new IterDomain matching properties of this //! @@ -1721,9 +1821,13 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { const std::vector& lhs, const std::vector& rhs); - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; const std::vector& domain() const { return domain_; @@ -1921,8 +2025,12 @@ class TORCH_CUDA_CU_API Split : public Expr { return "Split"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* outer() const { return output(0)->as(); @@ -1979,8 +2087,12 @@ class TORCH_CUDA_CU_API Merge : public Expr { return "Merge"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* out() const { return output(0)->as(); @@ -2013,8 +2125,12 @@ class TORCH_CUDA_CU_API Swizzle2D : public Expr { return "Swizzle2D"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; // Output iterdomain pair corresponding // to the original input iterdomain pair. 
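Every Expr override above follows the same mechanical pattern: widen both printing signatures with a defaulted SerializationFormat parameter, thread fmt through to operand printing, and keep rejecting inline printing for tensor-producing ops. A sketch of that pattern for a hypothetical new op — MyOp and my_op are invented for illustration and mirror FullOp::toString earlier in this patch:

std::string MyOp::toString(int indent_size, SerializationFormat fmt) const {
  std::stringstream ss;
  indent(ss, indent_size) << output(0)->toString() << "\n";
  indent_size++;
  indent(ss, indent_size) << " = my_op("
                          << input(0)->toInlineString(indent_size, fmt)
                          << ");\n";
  return ss.str();
}

std::string MyOp::toInlineString(int indent_size, SerializationFormat fmt)
    const {
  // Tensor ops refuse inline printing, matching the overrides above.
  TORCH_CHECK(false, "Tensor op can not be printed inline");
}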
@@ -2109,11 +2225,15 @@ class TORCH_CUDA_CU_API NamedScalar : public Val { bool sameAs(const Statement* other) const override; - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { return name_; } - std::string toInlineString(int indent_size = 0) const override { + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { return name_; } diff --git a/third_party/nvfuser/csrc/ir_iostream.cpp b/third_party/nvfuser/csrc/ir_iostream.cpp index 643306c96f88..71c73177f728 100644 --- a/third_party/nvfuser/csrc/ir_iostream.cpp +++ b/third_party/nvfuser/csrc/ir_iostream.cpp @@ -36,7 +36,7 @@ void IrPrinter::handle(Fusion* fusion) { FUSER_PERF_SCOPE("IrPrinter"); resetIndent(); for (const Expr* expr : fusion->exprs()) { - os_ << expr->toString(); + os_ << expr->toString(indent_size_, fmt_); } } diff --git a/third_party/nvfuser/csrc/ir_iostream.h b/third_party/nvfuser/csrc/ir_iostream.h index d8ca3647e0ab..fcb7a2f15df1 100644 --- a/third_party/nvfuser/csrc/ir_iostream.h +++ b/third_party/nvfuser/csrc/ir_iostream.h @@ -3,6 +3,7 @@ #include #include +#include #include @@ -38,8 +39,11 @@ inline std::ostream& indent(std::ostream& os, int indent_size) { //! class TORCH_CUDA_CU_API IrPrinter { public: - explicit IrPrinter(std::ostream& os, int indent_size = 0) - : os_(os), indent_size_(indent_size) {} + explicit IrPrinter( + std::ostream& os, + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) + : os_(os), indent_size_(indent_size), fmt_(fmt) {} virtual ~IrPrinter() {} void resetIndent() { @@ -76,6 +80,7 @@ class TORCH_CUDA_CU_API IrPrinter { std::ostream& os_; bool print_inline_ = false; int indent_size_ = 0; + SerializationFormat fmt_ = SerializationFormat::Default; }; TORCH_CUDA_CU_API std::ostream& operator<<( diff --git a/third_party/nvfuser/csrc/ir_nodes.cpp b/third_party/nvfuser/csrc/ir_nodes.cpp index 4e4b1f5b7e8a..814eb9a71618 100644 --- a/third_party/nvfuser/csrc/ir_nodes.cpp +++ b/third_party/nvfuser/csrc/ir_nodes.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -105,7 +106,7 @@ FullOp::FullOp(IrBuilderPasskey passkey, Val* out, Val* fill_value) addOutput(out); } -std::string FullOp::toString(int indent_size) const { +std::string FullOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -117,13 +118,14 @@ std::string FullOp::toString(int indent_size) const { if (i > 0) { ss << ", "; } - ss << input(i)->toInlineString(indent_size); + ss << input(i)->toInlineString(indent_size, fmt); } ss << ");\n"; return ss.str(); } -std::string FullOp::toInlineString(int indent_size) const { +std::string FullOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -143,7 +145,7 @@ SelectOp::SelectOp( addAttribute(index); } -std::string SelectOp::toString(int indent_size) const { +std::string SelectOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -153,7 +155,8 @@ std::string SelectOp::toString(int indent_size) const { return ss.str(); } -std::string SelectOp::toInlineString(int indent_size) const { +std::string SelectOp::toInlineString(int indent_size, SerializationFormat fmt) + 
const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -174,7 +177,8 @@ IndexSelectOp::IndexSelectOp( addAttribute(IrBuilder::create>(passkey.ir_container_, dim)); } -std::string IndexSelectOp::toString(int indent_size) const { +std::string IndexSelectOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -188,7 +192,9 @@ std::string IndexSelectOp::toString(int indent_size) const { return ss.str(); } -std::string IndexSelectOp::toInlineString(int indent_size) const { +std::string IndexSelectOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -209,7 +215,8 @@ TorchGatherOp::TorchGatherOp( addAttribute(IrBuilder::create>(passkey.ir_container_, dim)); } -std::string TorchGatherOp::toString(int indent_size) const { +std::string TorchGatherOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -223,7 +230,9 @@ std::string TorchGatherOp::toString(int indent_size) const { return ss.str(); } -std::string TorchGatherOp::toInlineString(int indent_size) const { +std::string TorchGatherOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -245,7 +254,7 @@ ARangeOp::ARangeOp( IrBuilder::create>(passkey.ir_container_, dtype)); } -std::string ARangeOp::toString(int indent_size) const { +std::string ARangeOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString(); ss << "\n"; @@ -256,7 +265,8 @@ std::string ARangeOp::toString(int indent_size) const { return ss.str(); } -std::string ARangeOp::toInlineString(int indent_size) const { +std::string ARangeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -276,7 +286,7 @@ EyeOp::EyeOp(IrBuilderPasskey passkey, Val* out, DataType dtype) IrBuilder::create>(passkey.ir_container_, dtype)); } -std::string EyeOp::toString(int indent_size) const { +std::string EyeOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -285,7 +295,8 @@ std::string EyeOp::toString(int indent_size) const { return ss.str(); } -std::string EyeOp::toInlineString(int indent_size) const { +std::string EyeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -363,7 +374,7 @@ void UnaryOp::printHelper(std::stringstream& ss, std::string input) const { } } -std::string UnaryOp::toString(int indent_size) const { +std::string UnaryOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size) << out()->toString(); @@ -378,10 +389,11 @@ std::string UnaryOp::toString(int indent_size) const { return ss.str(); } -std::string UnaryOp::toInlineString(int indent_size) const { +std::string UnaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; - printHelper(ss, in()->toInlineString()); + printHelper(ss, in()->toInlineString(indent_size, fmt)); return ss.str(); } @@ -507,7 +519,7 @@ void BinaryOp::printHelper( } } -std::string 
BinaryOp::toString(int indent_size) const { +std::string BinaryOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size) << out(); @@ -525,11 +537,15 @@ std::string BinaryOp::toString(int indent_size) const { return ss.str(); } -std::string BinaryOp::toInlineString(int indent_size) const { +std::string BinaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; printHelper( - ss, indent_size, lhs()->toInlineString(), rhs()->toInlineString()); + ss, + indent_size, + lhs()->toInlineString(0, fmt), + rhs()->toInlineString(0, fmt)); return ss.str(); } @@ -591,7 +607,8 @@ void TernaryOp::printHelper( ss << ", " << in3 << ")"; } -std::string TernaryOp::toString(int indent_size) const { +std::string TernaryOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size); @@ -611,15 +628,16 @@ std::string TernaryOp::toString(int indent_size) const { return ss.str(); } -std::string TernaryOp::toInlineString(int indent_size) const { +std::string TernaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; printHelper( ss, indent_size, - in1()->toInlineString(), - in2()->toInlineString(), - in3()->toInlineString()); + in1()->toInlineString(0, fmt), + in2()->toInlineString(0, fmt), + in3()->toInlineString(0, fmt)); return ss.str(); } @@ -650,7 +668,7 @@ RNGOp::RNGOp( addAttribute(philox_index); } -std::string RNGOp::toString(int indent_size) const { +std::string RNGOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size); ss << output(0)->toString() << "\n"; @@ -676,7 +694,8 @@ std::string RNGOp::toString(int indent_size) const { return ss.str(); } -std::string RNGOp::toInlineString(int indent_size) const { +std::string RNGOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -748,14 +767,17 @@ BroadcastOp::BroadcastOp( "The dimensions of output tensor and does not match with is_broadcast_dims and input tensor"); } -std::string BroadcastOp::toString(int indent_size) const { +std::string BroadcastOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << "\n"; indent(ss, indent_size) << " = broadcast( " << in()->toString() << " )\n"; return ss.str(); } -std::string BroadcastOp::toInlineString(int indent_size) const { +std::string BroadcastOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -817,14 +839,16 @@ SqueezeOp::SqueezeOp( "The dimensions of output tensor and does not match with is_squeeze_dims and input tensor"); } -std::string SqueezeOp::toString(int indent_size) const { +std::string SqueezeOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << "\n"; indent(ss, indent_size) << " = squeeze( " << in()->toString() << " )\n"; return ss.str(); } -std::string SqueezeOp::toInlineString(int indent_size) const { +std::string SqueezeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -869,7 +893,8 @@ ReductionOp::ReductionOp( 
IrBuilder::create>(passkey.ir_container_, is_allreduce)); } -std::string ReductionOp::toString(int indent_size) const { +std::string ReductionOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out() << "\n"; indent(ss, indent_size) << " = reduction( " << in()->toString() @@ -880,7 +905,9 @@ std::string ReductionOp::toString(int indent_size) const { return ss.str(); } -std::string ReductionOp::toInlineString(int indent_size) const { +std::string ReductionOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -912,7 +939,9 @@ GroupedReductionOp::GroupedReductionOp( } } -std::string GroupedReductionOp::toString(int indent_size) const { +std::string GroupedReductionOp::toString( + int indent_size, + SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << "GroupedReductionOp(\n"; ++indent_size; @@ -927,7 +956,9 @@ std::string GroupedReductionOp::toString(int indent_size) const { return ss.str(); } -std::string GroupedReductionOp::toInlineString(int indent_size) const { +std::string GroupedReductionOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1098,7 +1129,8 @@ std::vector WelfordOp::getInitVals() const { return init_vals; } -std::string WelfordOp::toString(int indent_size) const { +std::string WelfordOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << outAvg()->toString() << "(Avg),\n" << outVar()->toString() << "(Var),\n" @@ -1119,7 +1151,8 @@ std::string WelfordOp::toString(int indent_size) const { return ss.str(); } -std::string WelfordOp::toInlineString(int indent_size) const { +std::string WelfordOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1237,7 +1270,8 @@ GroupedWelfordOp::GroupedWelfordOp( } } -std::string GroupedWelfordOp::toString(int indent_size) const { +std::string GroupedWelfordOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << "GroupedWelford(\n"; ++indent_size; @@ -1262,7 +1296,9 @@ std::string GroupedWelfordOp::toString(int indent_size) const { return ss.str(); } -std::string GroupedWelfordOp::toInlineString(int indent_size) const { +std::string GroupedWelfordOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1328,7 +1364,7 @@ MmaOp::MmaOp( attribute(1)->as>()->value = options; } -std::string MmaOp::toString(int indent_size) const { +std::string MmaOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = mma(" << inA()->toString() << "," << inB()->toString(); @@ -1336,7 +1372,8 @@ std::string MmaOp::toString(int indent_size) const { return ss.str(); } -std::string MmaOp::toInlineString(int indent_size) const { +std::string MmaOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1391,14 +1428,17 @@ TransposeOp::TransposeOp( passkey.ir_container_, std::move(new2old))); } -std::string TransposeOp::toString(int indent_size) const { +std::string TransposeOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) 
<< out()->toString() << " = transpose( " << in()->toString() << " )\n"; return ss.str(); } -std::string TransposeOp::toInlineString(int indent_size) const { +std::string TransposeOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1430,7 +1470,7 @@ ExpandOp::ExpandOp( } } -std::string ExpandOp::toString(int indent_size) const { +std::string ExpandOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = expand( " << in() << ", {"; @@ -1444,7 +1484,8 @@ std::string ExpandOp::toString(int indent_size) const { return ss.str(); } -std::string ExpandOp::toInlineString(int indent_size) const { +std::string ExpandOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1490,7 +1531,7 @@ ShiftOp::ShiftOp( passkey.ir_container_, std::move(pad_width))); } -std::string ShiftOp::toString(int indent_size) const { +std::string ShiftOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = shift( " << in()->toString() << ", {" << offsets() << "}, {" @@ -1498,7 +1539,8 @@ std::string ShiftOp::toString(int indent_size) const { return ss.str(); } -std::string ShiftOp::toInlineString(int indent_size) const { +std::string ShiftOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1545,7 +1587,7 @@ GatherOp::GatherOp( passkey.ir_container_, std::move(pad_width))); } -std::string GatherOp::toString(int indent_size) const { +std::string GatherOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = gather( " << in()->toString() << ", {"; @@ -1570,7 +1612,8 @@ std::string GatherOp::toString(int indent_size) const { return ss.str(); } -std::string GatherOp::toInlineString(int indent_size) const { +std::string GatherOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1598,7 +1641,8 @@ ViewAsScalar::ViewAsScalar( addAttribute(index); } -std::string ViewAsScalar::toString(int indent_size) const { +std::string ViewAsScalar::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = view_as_scalar( " << in()->toString() << ", " << vector_id()->toString() @@ -1606,7 +1650,9 @@ std::string ViewAsScalar::toString(int indent_size) const { return ss.str(); } -std::string ViewAsScalar::toInlineString(int indent_size) const { +std::string ViewAsScalar::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1617,14 +1663,15 @@ ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) { addInput(in); } -std::string ViewOp::toString(int indent_size) const { +std::string ViewOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = view( " << in()->toString() << " )\n"; return ss.str(); } -std::string ViewOp::toInlineString(int indent_size) const { +std::string ViewOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1642,14 +1689,17 @@ 
LoadStoreOp::LoadStoreOp( passkey.ir_container_, op_type)); } -std::string LoadStoreOp::toString(int indent_size) const { +std::string LoadStoreOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = " << opType() << "( " << in()->toString() << " )\n"; return ss.str(); } -std::string LoadStoreOp::toInlineString(int indent_size) const { +std::string LoadStoreOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1849,11 +1899,15 @@ bool IterDomain::sameAs(const Statement* other) const { return is_same; } -std::string IterDomain::toString(int indent_size) const { +std::string IterDomain::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; ss << getIterType(); ss << getParallelType(); ss << name(); + if (fmt == SerializationFormat::NameOnly) { + return ss.str(); + } ss << "{"; if (!start()->isZeroInt()) { ss << start()->toInlineString() << " : "; @@ -1872,11 +1926,28 @@ std::string IterDomain::toString(int indent_size) const { if (hasPaddingToMultipleOfWarp()) { ss << "_p"; } + + if (fmt == SerializationFormat::Debug) { + ss << "s" << start_ << "e" << extent_; + if (is_simple_) { + ss << "_simple_"; + } + if (is_mma_swizzled_) { + ss << "_mmaswiz_"; + } + if (expanded_extent_ != nullptr) { + ss << " expanded=" << expanded_extent_->toString(0, fmt); + } + if (stop_offset_ != nullptr) { + ss << " stop_offset_=" << stop_offset_->toString(0, fmt); + } + } return ss.str(); } -std::string IterDomain::toInlineString(int indent_size) const { - return toString(indent_size); +std::string IterDomain::toInlineString(int indent_size, SerializationFormat fmt) + const { + return toString(indent_size, fmt); } // Returns a new IterDomain matching properties of this except for @@ -2062,6 +2133,12 @@ std::pair IterDomain::split( Val* factor, bool inner_split, bool trim_out_of_bounds) { + VAL_LOG_EXPLICIT( + in, + "IterDomain::split", + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); auto start_offset = trim_out_of_bounds ? in->start() : nullptr; auto stop_offset = trim_out_of_bounds ? 
in->stopOffset() : nullptr; return IterDomain::split(in, factor, inner_split, start_offset, stop_offset); @@ -2156,6 +2233,8 @@ void IterDomain::parallelize(ParallelType t) { "Parallel type other than serial, tidx, vectorize not allowed for mma swizzled ids"); } + VAL_LOG("IterDomain::parallelize", stringifyThread(t), ); + parallel_type_ = t; } @@ -2373,25 +2452,62 @@ bool TensorDomain::sameAs( return true; } -std::string TensorDomain::toString(int indent_size) const { +std::string TensorDomain::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; + + if (fmt == SerializationFormat::Debug) { + ss << "TD" << name(); + } + if (nDims() == 0) { ss << "[ 0 ]"; return ss.str(); } ss << "[ "; for (const auto i : c10::irange(nDims())) { - ss << axis(i)->toString(); + ss << axis(i)->toInlineString(0, fmt); if (i != nDims() - 1) { ss << ", "; } } ss << " ]"; + + if (fmt == SerializationFormat::Debug) { + ss << " root_domain_:"; + for (auto& it : root_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " domain_:"; + for (auto& it : domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " no_bcast_domain_:"; + for (auto& it : no_bcast_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " no_reduction_domain_:"; + for (auto& it : no_reduction_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " rfactor_domain_:"; + for (auto& it : rfactor_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " contiguity_:"; + for (auto it : contiguity_) { + ss << " " << it; + } + ss << " has_reduction_:" << has_reduction_; + } + return ss.str(); } -std::string TensorDomain::toInlineString(int indent_size) const { - return toString(indent_size); +std::string TensorDomain::toInlineString( + int indent_size, + SerializationFormat fmt) const { + return toString(indent_size, fmt); } void TensorDomain::setContiguity(const std::vector& contig) { @@ -2524,6 +2640,13 @@ void TensorDomain::split( !id->isMmaSwizzled(), "Further transformation on warp mapped id's not allowed."); + VAL_LOG( + "TensorDomain::split", + std::to_string(axis_), + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); + auto split_ids = IterDomain::split(id, factor, inner_split, trim_out_of_bounds); domain_.erase(domain_.begin() + axis_); @@ -2550,6 +2673,9 @@ void TensorDomain::merge(int axis_o, int axis_i) { axis_o != axis_i, "Invalid merge detected, axes provided are the same axis."); + VAL_LOG( + "TensorDomain::merge", std::to_string(axis_o), std::to_string(axis_i), ); + if (axis_o > axis_i) { auto tmp = axis_i; axis_i = axis_o; @@ -2576,6 +2702,13 @@ void TensorDomain::reorder(const std::unordered_map& old2new_) { TORCH_INTERNAL_ASSERT( !(nDims() == 0 && old2new_.size() > 0), "Tried to reorder a 0-dim domain"); +#ifndef NDEBUG + std::stringstream ss; + for (auto on : old2new_) { + ss << " " << on.first << "->" << on.second; + } + VAL_LOG("TensorDomain::reorder", ss.str(), ); +#endif domain_ = orderedAs(domain_, old2new_); resetDomains(); } @@ -2708,6 +2841,11 @@ TensorDomain* TensorDomain::view(const AnalyzeViewResult& view_analysis) { } TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) { + VAL_LOG( + "TensorDomain::flatten", + std::to_string(start_dim), + std::to_string(end_dim), ); + auto inp_domain = noReductions(getMaybeRFactorDomain()); if (start_dim < 0) { @@ -2814,7 +2952,7 @@ Split::Split( addAttribute(stop_offset); } -std::string Split::toString(int indent_size) const { +std::string 
+std::string Split::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   ss << (innerSplit() ? "Split: " : "Outer split: ");
   ss << in()->toString();
@@ -2834,7 +2972,8 @@ std::string Split::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Split::toInlineString(int indent_size) const {
+std::string Split::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Split can not be printed inline");
 }
 
@@ -2865,7 +3004,7 @@ Merge::Merge(
   addInput(inner);
 }
 
-std::string Merge::toString(int indent_size) const {
+std::string Merge::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   ss << "Merge: ";
   ss << outer()->toString();
@@ -2877,7 +3016,8 @@ std::string Merge::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Merge::toInlineString(int indent_size) const {
+std::string Merge::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -2902,7 +3042,8 @@ Swizzle2D::Swizzle2D(
       passkey.ir_container_, swizzle_mode));
 }
 
-std::string Swizzle2D::toString(int indent_size) const {
+std::string Swizzle2D::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << swizzleType() << "(2D): ";
   ss << inX()->toString();
@@ -2916,7 +3057,8 @@ std::string Swizzle2D::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Swizzle2D::toInlineString(int indent_size) const {
+std::string Swizzle2D::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
diff --git a/third_party/nvfuser/csrc/ir_printer.h b/third_party/nvfuser/csrc/ir_printer.h
index 8579730568b9..b4b5aff4f895 100644
--- a/third_party/nvfuser/csrc/ir_printer.h
+++ b/third_party/nvfuser/csrc/ir_printer.h
@@ -28,7 +28,10 @@ namespace cuda {
 //!
 class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
  public:
-  IrMathPrinter(std::ostream& os) : IrPrinter(os) {}
+  IrMathPrinter(
+      std::ostream& os,
+      SerializationFormat fmt = SerializationFormat::Default)
+      : IrPrinter(os, 0, fmt) {}
 
   using IrPrinter::handle;
 
@@ -43,7 +46,10 @@ class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
 //!
 class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
  public:
-  IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}
+  IrTransformPrinter(
+      std::ostream& os,
+      SerializationFormat fmt = SerializationFormat::Default)
+      : IrPrinter(os, 0, fmt) {}
 
   using IrPrinter::handle;
 
diff --git a/third_party/nvfuser/csrc/kernel_ir.cpp b/third_party/nvfuser/csrc/kernel_ir.cpp
index 1983156c4dd5..6b124c39430b 100644
--- a/third_party/nvfuser/csrc/kernel_ir.cpp
+++ b/third_party/nvfuser/csrc/kernel_ir.cpp
@@ -61,14 +61,16 @@ Predicate::Predicate(IrBuilderPasskey passkey, Bool* value)
   TORCH_INTERNAL_ASSERT(value != nullptr);
 }
 
-std::string Predicate::toString(int indent_size) const {
+std::string Predicate::toString(int indent_size, SerializationFormat fmt)
+    const {
   if (predicate_type() == PredicateType::Manual) {
-    return value()->toString();
+    return value()->toString(0, fmt);
   }
   return predicate_type2string(predicate_type());
 }
 
-std::string Predicate::toInlineString(int indent_size) const {
+std::string Predicate::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   if (predicate_type() == PredicateType::Manual) {
     return value()->toInlineString();
   }
@@ -90,7 +92,8 @@ TensorIndex::TensorIndex(
       "Cannot index with a value other than an int.");
 }
 
-std::string TensorIndex::toString(int indent_size) const {
+std::string TensorIndex::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << ir_utils::varName(this);
   switch (view()->getMemoryType()) {
@@ -107,14 +110,16 @@ std::string TensorIndex::toString(int indent_size) const {
       TORCH_INTERNAL_ASSERT(false, "Unknown tensor memory type.");
   }
   ss << "[";
-  ss << index()->toInlineString(indent_size);
+  ss << index()->toInlineString(indent_size, fmt);
   ss << "]";
   ss << " view( " << ir_utils::varName(view()) << " )";
   return ss.str();
 }
 
-std::string TensorIndex::toInlineString(int indent_size) const {
-  return toString(indent_size);
+std::string TensorIndex::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
+  return toString(indent_size, fmt);
 }
 
 Allocate::Allocate(
@@ -191,7 +196,7 @@ Allocate::Allocate(
       "IR type only valid for Kernel container.");
 }
 
-std::string Allocate::toString(int indent_size) const {
+std::string Allocate::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << buffer()->toString();
   ss << " = ALLOCATE("
@@ -207,7 +212,8 @@ std::string Allocate::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Allocate::toInlineString(int indent_size) const {
+std::string Allocate::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -221,14 +227,16 @@ BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync) : Expr(passkey) {
       IrBuilder::create<Attribute<bool>>(passkey.ir_container_, war_sync));
 }
 
-std::string BlockSync::toString(int indent_size) const {
+std::string BlockSync::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "BLOCKSYNC(war_hazard="
                           << boolLiteral(isWarHazardSync()) << ")\n";
   return ss.str();
 }
 
-std::string BlockSync::toInlineString(int indent_size) const {
+std::string BlockSync::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -244,14 +252,15 @@ GridSync::GridSync(
   addAttribute(sync_buffer);
 }
 
-std::string GridSync::toString(int indent_size) const {
+std::string GridSync::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "GRIDSYNC(" << syncDims().toString() << ", "
                           << syncBuffer()->toString() << ")\n";
   return ss.str();
 }
 
-std::string GridSync::toInlineString(int indent_size) const {
+std::string GridSync::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -266,13 +275,16 @@ CpAsyncWait::CpAsyncWait(IrBuilderPasskey passkey, unsigned int keep_stages)
       passkey.ir_container_, keep_stages));
 }
 
-std::string CpAsyncWait::toString(int indent_size) const {
+std::string CpAsyncWait::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "CPASYNC_WAIT(" << keepStages() << ")\n";
   return ss.str();
 }
 
-std::string CpAsyncWait::toInlineString(int indent_size) const {
+std::string CpAsyncWait::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -284,13 +296,16 @@ CpAsyncCommit::CpAsyncCommit(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string CpAsyncCommit::toString(int indent_size) const {
+std::string CpAsyncCommit::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "CPASYNC_WAIT()\n";
   return ss.str();
 }
 
-std::string CpAsyncCommit::toInlineString(int indent_size) const {
+std::string CpAsyncCommit::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -302,13 +317,16 @@ InitMagicZero::InitMagicZero(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string InitMagicZero::toString(int indent_size) const {
+std::string InitMagicZero::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "NVFUSER_DEFINE_MAGIC_ZERO\n";
   return ss.str();
 }
 
-std::string InitMagicZero::toInlineString(int indent_size) const {
+std::string InitMagicZero::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -320,22 +338,25 @@ UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string UpdateMagicZero::toString(int indent_size) const {
+std::string UpdateMagicZero::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "NVFUSER_UPDATE_MAGIC_ZERO\n";
   return ss.str();
 }
 
-std::string UpdateMagicZero::toInlineString(int indent_size) const {
+std::string UpdateMagicZero::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(UpdateMagicZero)
 
-std::string Scope::toString(int indent_size) const {
+std::string Scope::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   for (auto expr : exprs()) {
-    ss << expr->toString(indent_size);
+    ss << expr->toString(indent_size, fmt);
   }
   return ss.str();
 }
 
@@ -479,7 +500,7 @@ ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other)
       "IR type only valid for Kernel container.");
 }
 
-std::string ForLoop::toString(int indent_size) const {
+std::string ForLoop::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "FOR " << index()->toString() << " in "
                           << iter_domain()->toString() << ":\n"
@@ -487,7 +508,8 @@ std::string ForLoop::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string ForLoop::toInlineString(int indent_size) const {
+std::string ForLoop::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -672,7 +694,8 @@ IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond)
       IrBuilder::create<Attribute<Scope>>(passkey.ir_container_, this));
 }
 
-std::string IfThenElse::toString(int indent_size) const {
+std::string IfThenElse::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "IF " << predicate()->toString() << ":\n"
                           << thenBody().toString(indent_size + 1);
@@ -683,7 +706,8 @@ std::string IfThenElse::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string IfThenElse::toInlineString(int indent_size) const {
+std::string IfThenElse::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -716,7 +740,8 @@ GridReduction::GridReduction(
       IrBuilder::create<Attribute<ParallelTypeBitmap>>(passkey.ir_container_));
 }
 
-std::string GridReduction::toString(int indent_size) const {
+std::string GridReduction::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = reduction( "
                           << in()->toString()
@@ -749,7 +774,9 @@ std::string GridReduction::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridReduction::toInlineString(int indent_size) const {
+std::string GridReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -792,7 +819,9 @@ GroupedGridReduction::GroupedGridReduction(
   }
 }
 
-std::string GroupedGridReduction::toString(int indent_size) const {
+std::string GroupedGridReduction::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "GroupedGridReduction(\n";
   ++indent_size;
@@ -828,7 +857,9 @@ std::string GroupedGridReduction::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GroupedGridReduction::toInlineString(int indent_size) const {
+std::string GroupedGridReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -848,7 +879,8 @@ GridBroadcast::GridBroadcast(
   addAttribute(sync_buffer);
 }
 
-std::string GridBroadcast::toString(int indent_size) const {
+std::string GridBroadcast::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   const auto* broadcast_op = this->broadcast_op();
   indent(ss, indent_size) << broadcast_op->out()->toString() << " = "
@@ -861,7 +893,9 @@ std::string GridBroadcast::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridBroadcast::toInlineString(int indent_size) const {
+std::string GridBroadcast::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -891,7 +925,8 @@ GridWelford::GridWelford(
       IrBuilder::create<Attribute<ParallelTypeBitmap>>(passkey.ir_container_));
 }
 
-std::string GridWelford::toString(int indent_size) const {
+std::string GridWelford::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   const auto* welford_op = this->welford_op();
   indent(ss, indent_size) << welford_op->outAvg()->toString() << " (Avg),\n";
@@ -952,7 +987,9 @@ std::string GridWelford::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridWelford::toInlineString(int indent_size) const {
+std::string GridWelford::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -1043,7 +1080,9 @@ int GroupedGridWelford::getSmemBufferSize(int bdimx, int bdimy, int bdimz)
   return buf_size_for_avg_var * 2 + buf_size_for_N;
 }
 
-std::string GroupedGridWelford::toString(int indent_size) const {
+std::string GroupedGridWelford::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "GroupedGridWelford(\n";
   ++indent_size;
@@ -1095,7 +1134,9 @@ std::string GroupedGridWelford::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GroupedGridWelford::toInlineString(int indent_size) const {
+std::string GroupedGridWelford::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -1127,14 +1168,18 @@ AllocateFusedReduction::AllocateFusedReduction(
   addAttribute(grid_expr);
 }
 
-std::string AllocateFusedReduction::toString(int indent_size) const {
+std::string AllocateFusedReduction::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "AllocateFusedReduction(reduction buffer="
                           << out()->toString() << ")\n";
   return ss.str();
 }
 
-std::string AllocateFusedReduction::toInlineString(int indent_size) const {
+std::string AllocateFusedReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
diff --git a/third_party/nvfuser/csrc/kernel_ir.h b/third_party/nvfuser/csrc/kernel_ir.h
index f71553d75447..56846059767f 100644
--- a/third_party/nvfuser/csrc/kernel_ir.h
+++ b/third_party/nvfuser/csrc/kernel_ir.h
@@ -60,9 +60,13 @@ class TORCH_CUDA_CU_API Predicate final : public Val {
 
   explicit Predicate(IrBuilderPasskey passkey, Bool* value);
 
-  std::string toString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   PredicateType predicate_type() const {
     return ptype_;
@@ -140,9 +144,13 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val {
     return const_cast<TensorView*>(view_); // NOLINT
   }
 
-  std::string toString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
  private:
   const TensorView* view_ = nullptr;
@@ -184,8 +192,12 @@ class TORCH_CUDA_CU_API Allocate final : public Expr {
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Val* buffer() const {
     return attributeVal(0);
@@ -237,8 +249,12 @@ class TORCH_CUDA_CU_API BlockSync final : public Expr {
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   // TODO: war_sync_ is only used for testing/validation purposes.
   bool isWarHazardSync() const {
@@ -263,8 +279,12 @@ class TORCH_CUDA_CU_API GridSync final : public Expr {
     return "GridSync";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   ParallelTypeBitmap syncDims() const {
     return attribute(0)->as<Attribute<ParallelTypeBitmap>>()->value;
@@ -288,8 +308,12 @@ class TORCH_CUDA_CU_API CpAsyncWait final : public Expr {
     return "CpAsyncWait";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   //! Returns the remaining number of stages that are not synchronized
   //! after this op.
@@ -313,8 +337,12 @@ class TORCH_CUDA_CU_API CpAsyncCommit final : public Expr {
     return "CpAsyncCommit";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero
@@ -331,8 +359,12 @@ class TORCH_CUDA_CU_API InitMagicZero final : public Expr {
     return "InitMagicZero";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero
@@ -349,8 +381,12 @@ class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr {
     return "UpdateMagicZero";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // TODO(kir): promote to IR node
@@ -358,7 +394,9 @@ class TORCH_CUDA_CU_API Scope {
  public:
   explicit Scope(Expr* owner) : owner_(owner) {}
 
-  std::string toString(int indent_size = 0) const;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const;
 
   const std::vector<Expr*>& exprs() const {
     return exprs_;
   }
@@ -473,8 +511,12 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr {
     return "ForLoop";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Val* index() const {
     return input(0);
@@ -562,8 +604,12 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr {
     return "IfThenElse";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Scope& thenBody() {
     return attribute(0)->as<Attribute<Scope>>()->value;
@@ -616,8 +662,12 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
     return "GridReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Allocate* reduction_buffer() const {
     return attribute(num_reduction_op_attr)->as<Allocate>();
@@ -688,8 +738,12 @@ class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp {
     return "GroupedGridReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   std::vector<Allocate*> reduction_buffers() const {
     auto offset = numGroupedReductionOpAttr() + 5;
@@ -771,8 +825,12 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr {
     return "GridBroadcast";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   BroadcastOp* broadcast_op() const {
     return attribute(0)->as<BroadcastOp>();
@@ -816,8 +874,12 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
     return "GridWelford";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   WelfordOp* welford_op() const {
     return attribute(0)->as<WelfordOp>();
@@ -894,8 +956,12 @@ class TORCH_CUDA_CU_API GroupedGridWelford final : public GroupedWelfordOp {
     return "GroupedGridWelford";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   std::array<std::vector<Allocate*>, 3> reduction_buffers() const {
     auto offset = numGroupedWelfordOpAttr() + 5;
@@ -1036,8 +1102,12 @@ class TORCH_CUDA_CU_API AllocateFusedReduction final : public Expr {
     return "AllocateFusedReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   //! GridReduction, GridWelford, GroupedGridReduction or GroupedGridWelford
   Expr* gridExpr() const {
diff --git a/third_party/nvfuser/csrc/scheduler/utils.cpp b/third_party/nvfuser/csrc/scheduler/utils.cpp
index d0ddbe8a7922..4ae0fd02afca 100644
--- a/third_party/nvfuser/csrc/scheduler/utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/utils.cpp
@@ -233,6 +233,38 @@ void parallelizeAllLike(
     std::vector<TensorView*> selected_tvs,
     const std::unordered_set<ParallelType>& selected_parallel_types,
     bool propagate_padding) {
+#ifndef NDEBUG
+  {
+    std::stringstream ss;
+    ss << "{";
+    bool comma = false;
+    for (auto t : selected_tvs) {
+      if (comma)
+        ss << ", ";
+      ss << t->toString(0, SerializationFormat::NameOnly);
+      comma = true;
+    }
+    ss << " }";
+    std::stringstream ssp;
+    ssp << "{";
+    comma = false;
+    for (auto t : selected_parallel_types) {
+      if (comma)
+        ssp << ", ";
+      ssp << stringifyThread(t);
+      comma = true;
+    }
+    ssp << " }";
+    VAL_LOG_EXPLICIT(
+        reference_tv,
+        "scheduler_utils::parallelizeAllLike",
+        std::to_string(pos),
+        ss.str(),
+        ssp.str(),
+        std::to_string(propagate_padding));
+  }
+#endif
+
   FusionGuard fg(reference_tv->fusion());
 
   if (pos < 0) {
diff --git a/third_party/nvfuser/csrc/tensor_view.cpp b/third_party/nvfuser/csrc/tensor_view.cpp
index 8c736dc3f681..40c2a08f3737 100644
--- a/third_party/nvfuser/csrc/tensor_view.cpp
+++ b/third_party/nvfuser/csrc/tensor_view.cpp
@@ -36,7 +36,14 @@ TensorView::TensorView(
     MemoryType mtype)
     : Val(passkey, ValType::TensorView, dtype),
       domain_(domain),
-      memory_type_(mtype) {}
+      memory_type_(mtype) {
+  VAL_LOG(
+      "TensorView::TensorView",
+      "IrBuilderPasskey",
+      domain->toString(0, SerializationFormat::NameOnly),
+      typePrefix(dtype),
+      "MemoryType");
+}
 
 TensorView::TensorView(
     IrBuilderPasskey passkey,
@@ -44,6 +51,7 @@ TensorView::TensorView(
     : Val(passkey,
           ValType::TensorView,
           aten_opt_type_map(tensor_type->scalarType())) {
+  VAL_LOG("TensorView::TensorView", "IrBuilderPasskey", "TensorType");
   TORCH_INTERNAL_ASSERT(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
@@ -135,6 +143,7 @@ TensorView::TensorView(
     IrBuilderPasskey passkey,
     const std::shared_ptr<Value>& jit_value)
     : TensorView(passkey, jit_value->type()->cast<TensorType>()) {
+  VAL_LOG("TensorView::TensorView", "IrBuilderPasskey", "Value");
   TORCH_INTERNAL_ASSERT(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
@@ -142,7 +151,8 @@ TensorView::TensorView(
 
 NVFUSER_DEFINE_CLONE(TensorView)
 
-std::string TensorView::toString(int indent_size) const {
+std::string TensorView::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << ir_utils::varName(this);
   switch (getMemoryType()) {
@@ -158,7 +168,10 @@ std::string TensorView::toString(int indent_size) const {
     default:
       TORCH_INTERNAL_ASSERT(false, "Unknown tensor memory type.");
   }
-  ss << domain()->toString(indent_size);
+  if (fmt == SerializationFormat::NameOnly) {
+    return ss.str();
+  }
+  ss << domain()->toString(indent_size, fmt);
 
   if (getComputeAtPosition() > 0) {
     ss << " ca_pos( ";
@@ -191,11 +204,20 @@ std::string TensorView::toString(int indent_size) const {
     ss << getMaybeMaxProducerPosition();
     ss << " )";
   }
+  if (fmt == SerializationFormat::Debug) {
+    // memory_type_ is already encoded in the prefix printed above
+    ss << " db?" << is_double_buffered_;
+    ss << " cb?" << is_circular_buffered_;
+    ss << " cbs=" << circular_buffer_stage_;
+    ss << " cs=" << cpu_scalar_;
+    ss << " swiz?" << has_swizzle_op_;
+  }
   return ss.str();
 }
 
-std::string TensorView::toInlineString(int indent_size) const {
-  return toString(indent_size);
+std::string TensorView::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
+  return toString(indent_size, fmt);
 }
 
 void TensorView::convertRfactorToRootDomain() {
@@ -349,6 +371,11 @@ void TensorView::inlineAt(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
 
+  VAL_LOG(
+      "TensorView::inlineAt",
+      std::to_string(pos),
+      std::to_string(best_effort));
+
   std::unique_ptr<MaxPosCalculator> calc_owner;
   if (calc == nullptr) {
     calc_owner = std::make_unique<MaxPosCalculator>();
@@ -487,6 +514,13 @@ TensorView* TensorView::computeAt(
   // Make sure this and consumer are not the same tensor, that's illegal
   TORCH_CHECK(!sameAs(consumer), "Cannot call this->computeAt(this, ...)");
 
+  VAL_LOG(
+      "TensorView::computeAt",
+      consumer->toString(0, SerializationFormat::NameOnly),
+      std::to_string(position)
+      // ComputeAtMode is intentionally not logged
+      );
+
   // We support negative axes, so increment it by consumer->nDims() + 1 and make
   // sure the result is within consumer->nDims() + 1. being at consumer->nDims()
   // means producer will be computed inline with consumer, hence the +1.
@@ -514,6 +548,11 @@ void TensorView::computeWith(int pos, bool best_effort) {
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
 
+  VAL_LOG(
+      "TensorView::computeWith",
+      std::to_string(pos),
+      std::to_string(best_effort));
+
   if (isFusionInput()) {
     return;
   }
@@ -731,6 +770,13 @@ TensorView* TensorView::split(
Tensor: ", toString()); + VAL_LOG( + "TensorView::split", + std::to_string(axis_), + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); + domain()->split(axis_, factor, inner_split, trim_out_of_bounds); return this; } @@ -829,6 +875,9 @@ TensorView* TensorView::reorder(const std::unordered_map& old2new_) { getMaybeMaxProducerPosition()); } + // we'll leave the logging of individual axis reorders to the next line + VAL_LOG("TensorView::reorder"); + domain()->reorder(old2new_); return this; } @@ -952,6 +1001,14 @@ TensorView* TensorView::rFactor(const std::vector& axes) { !definition()->isA(), "For GroupedReductionOp, use TensorView::rFactor(const std::vector& axes, const std::vector& tvs)"); + { + std::stringstream ss; + for (auto ax : axes) { + ss << " " << ax; + } + VAL_LOG("TensorView::rFactor", ss.str()); + } + // Split tensor view into 2 parts auto domain_pair = domain()->rFactor(axes); diff --git a/third_party/nvfuser/csrc/utils.cpp b/third_party/nvfuser/csrc/utils.cpp index 5eaef09fb4b7..cb84c00285da 100644 --- a/third_party/nvfuser/csrc/utils.cpp +++ b/third_party/nvfuser/csrc/utils.cpp @@ -98,6 +98,7 @@ auto parseEnvOptions( auto parseDebugDumpOptions() { const std::unordered_map available_options = { + {"fusion_debug", DebugDumpOption::FusionDebug}, {"fusion_ir", DebugDumpOption::FusionIr}, {"fusion_ir_math", DebugDumpOption::FusionIrMath}, {"fusion_ir_presched", DebugDumpOption::FusionIrPresched}, @@ -184,8 +185,66 @@ const auto& getEnableOptions() { return options; } +//! Parse environment variable that represents one choice from an enum +//! OptionEnum must be an enum like SerializationFormat +template +OptionEnum parseEnvChoice( + const char* option_env_name, + const std::unordered_map& available_options, + const OptionEnum defaultOption) { + // Make sure available_options includes all of the enum values + TORCH_INTERNAL_ASSERT( + available_options.size() == static_cast(OptionEnum::EndOfOption), + "Invalid available option map"); + + auto option = defaultOption; + + if (const char* option_string = std::getenv(option_env_name)) { + auto option_it = available_options.find(std::string(option_string)); + if (option_it == available_options.end()) { + // get vector of valid option strings for error message + std::vector option_values; + std::transform( + available_options.begin(), + available_options.end(), + std::back_inserter(option_values), + [](const auto& kv) { return kv.first; }); + std::sort(option_values.begin(), option_values.end()); + TORCH_CHECK( + false, + "Parsing ", + option_env_name, + " failed. Invalid option: '", + option_string, + "'\nAvailable options (case-sensitive): ", + toDelimitedString(option_values)); + } else { + option = option_it->second; + } + } + + return option; +} + } // namespace +//! 
+auto parsePrintFormat() {
+  const std::unordered_map<std::string, SerializationFormat>
+      available_options = {
+          {"default", SerializationFormat::Default},
+          {"debug", SerializationFormat::Debug},
+          // name_only completes the map: parseEnvChoice asserts that every
+          // enumerator before EndOfOption has an entry.
+          {"name_only", SerializationFormat::NameOnly}};
+
+  return parseEnvChoice(
+      "PYTORCH_NVFUSER_PRINT_FORMAT",
+      available_options,
+      SerializationFormat::Default);
+}
+
+SerializationFormat getPrintFormat() {
+  static const auto format = parsePrintFormat();
+  return format;
+}
+
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
 void debugPrint(const c10::TensorTypePtr& type) {
   std::stringstream sizes_s;
diff --git a/third_party/nvfuser/csrc/utils.h b/third_party/nvfuser/csrc/utils.h
index 0949ce39ad50..a146fd72b7fa 100644
--- a/third_party/nvfuser/csrc/utils.h
+++ b/third_party/nvfuser/csrc/utils.h
@@ -34,6 +34,8 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs);
 //! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
 //!
 enum class DebugDumpOption {
+  FusionDebug, //!< Dump the entire contents of the Fusion object before
+               //!< lowering
   FusionIr, //!< Dump the Fusion IR before lowering
   FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
   FusionIrPresched, //!< Dump the Fusion IR before it is scheduled.
@@ -118,6 +120,32 @@ TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
 TORCH_CUDA_CU_API const std::vector<std::string>& getEnableOptionArguments(
     EnableOption option);
 
+TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
+TORCH_CUDA_CU_API const std::vector<std::string>& getDebugDumpArguments(
+    DebugDumpOption option);
+
+//! Types of serialization formats
+//!
+//! When dumping IR to screen, the text modes can be set through the
+//! `PYTORCH_NVFUSER_PRINT_FORMAT` environment variable
+//!
+//! Note this list includes some formats that are machine-readable and some
+//! that are not, enabling the same entry points to handle pretty-printing and
+//! serde.
+//!
+enum class SerializationFormat {
+  Default, //! [TEXT] Default text format for printing to screen
+  Debug, //! [TEXT] Dump full IR recursively, including all member variables
+         //! and memory locations
+  NameOnly, //! [TEXT] Only print the name of each object
+  // FlatBuffers, //! [BINARY] Dump full IR using FlatBuffers
+  // JSON, //! [TEXT] Dump full IR recursively in JSON format. Uses FlatBuffers
+  // text format.
+  EndOfOption //! Placeholder for counting the number of elements
+};
+
+TORCH_CUDA_CU_API SerializationFormat getPrintFormat();
+
 // Check if fallback path should be used which will dispatch to eagermode if any
 // errors are encountered. Helpful for debugging.
 bool useFallback();
@@ -341,6 +369,16 @@ std::string toDelimitedString(
   return toDelimitedString(vec.begin(), vec.end(), delim);
 }
 
+#ifndef NDEBUG
+//! These macros just pack the variadic arguments into a vector to pass to the
+//! log function
+#define VAL_LOG_EXPLICIT(obj, op_name, ...) obj->log(op_name, {__VA_ARGS__})
+#define VAL_LOG(op_name, ...) log(op_name, {__VA_ARGS__})
+#else
+#define VAL_LOG_EXPLICIT(obj, op_name, ...)
+#define VAL_LOG(op_name, ...)
+#endif
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
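
Usage notes. The three sketches below are reviewer aids, not part of the patch.

With the pieces above, `PYTORCH_NVFUSER_DUMP=fusion_debug` selects `DebugDumpOption::FusionDebug` (a full pre-lowering dump of the Fusion), and `PYTORCH_NVFUSER_PRINT_FORMAT=debug` switches the `toString` overrides to the member-level `Debug` format via `getPrintFormat()`. A minimal sketch of driving both programmatically; only `Fusion`, `SerializationFormat`, `getPrintFormat()`, `print()`, and `printDebug()` come from this patch, while the helper itself and its use of `std::cout` are assumptions:

#include <iostream>

using namespace torch::jit::fuser::cuda;

// Sketch: route a fusion dump according to PYTORCH_NVFUSER_PRINT_FORMAT.
void dumpFusionForInspection(Fusion* fusion) {
  // getPrintFormat() parses the environment variable on first use and caches
  // the result in a function-local static, so the format is process-wide.
  const SerializationFormat fmt = getPrintFormat();
  if (fmt == SerializationFormat::Debug) {
    // Member-by-member dump, including raw pointers and the per-Val op log.
    fusion->printDebug(std::cout);
  } else {
    // Math + transform printout in the requested text format.
    fusion->print(std::cout, fmt);
  }
}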
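
parseEnvChoice is written generically but lives in the anonymous namespace of utils.cpp, so any second user would sit in the same file. A hedged sketch of such a user; the enum, the getter, and the environment variable name are invented for illustration:

// Hypothetical: a second enum-valued knob parsed with parseEnvChoice.
enum class BoundsCheckMode { Off, Strict, EndOfOption };

BoundsCheckMode getBoundsCheckMode() {
  // Every enumerator before EndOfOption needs an entry here, or the
  // TORCH_INTERNAL_ASSERT inside parseEnvChoice fires; that assert is why
  // parsePrintFormat's map lists all three text formats.
  const std::unordered_map<std::string, BoundsCheckMode> available_options = {
      {"off", BoundsCheckMode::Off}, {"strict", BoundsCheckMode::Strict}};
  return parseEnvChoice(
      "PYTORCH_NVFUSER_BOUNDS_CHECK", available_options, BoundsCheckMode::Off);
}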
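
The VAL_LOG / VAL_LOG_EXPLICIT contract is easiest to check in isolation. Below is a self-contained model; FakeVal and main() are scaffolding, and the assumed shape of the real Val::log (added elsewhere in this patch) is an educated guess based on how the macros brace-pack their variadic arguments:

#include <iostream>
#include <string>
#include <vector>

#ifndef NDEBUG
#define VAL_LOG(op_name, ...) log(op_name, {__VA_ARGS__})
#else
#define VAL_LOG(op_name, ...)
#endif

struct FakeVal {
  // Assumed signature: an op name plus a vector of stringified arguments.
  void log(const std::string& op_name, std::vector<std::string> args) {
    std::cout << op_name;
    for (const auto& arg : args) {
      std::cout << " " << arg;
    }
    std::cout << "\n";
  }

  void split(int axis, int factor) {
    (void)axis; // only used inside VAL_LOG, which vanishes under NDEBUG
    (void)factor;
    // Debug build: expands to log("split", {std::to_string(axis), ...}).
    // NDEBUG build: expands to nothing, so the arguments are not evaluated.
    VAL_LOG("split", std::to_string(axis), std::to_string(factor));
  }
};

int main() {
  FakeVal{}.split(0, 128); // debug builds print: split 0 128
  return 0;
}

This is also why call sites can afford relatively expensive arguments such as toString(...): in release builds the expressions disappear entirely rather than being computed and discarded.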