diff --git a/third_party/nvfuser/csrc/executor.cpp b/third_party/nvfuser/csrc/executor.cpp index b5b5289094fd..2ddbf6f025dd 100644 --- a/third_party/nvfuser/csrc/executor.cpp +++ b/third_party/nvfuser/csrc/executor.cpp @@ -236,6 +236,10 @@ void FusionExecutor::compileFusion( } } + if (isDebugDumpEnabled(DebugDumpOption::FusionDebug)) { + fusion->printDebug(); + } + if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { fusion->print(); } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) { diff --git a/third_party/nvfuser/csrc/expr_simplifier.cpp b/third_party/nvfuser/csrc/expr_simplifier.cpp index 53ea426a57c2..0c31a9ea664c 100644 --- a/third_party/nvfuser/csrc/expr_simplifier.cpp +++ b/third_party/nvfuser/csrc/expr_simplifier.cpp @@ -411,7 +411,9 @@ class FlattenedAssocCommOp : public Expr { return other_inputs.empty(); } - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; indent(ss, indent_size) << getOpString() << "("; bool needs_comma = false; @@ -426,7 +428,9 @@ class FlattenedAssocCommOp : public Expr { return ss.str(); } - std::string toInlineString(int = 0) const override { + std::string toInlineString( + int = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; ss << getOpString() << "("; bool needs_comma = false; diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp index 31bb763ac559..300effdf4590 100644 --- a/third_party/nvfuser/csrc/fusion.cpp +++ b/third_party/nvfuser/csrc/fusion.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -344,26 +345,150 @@ void Fusion::validateInputs() { } } -void Fusion::print() { +void Fusion::serialize(std::ostream& out, SerializationFormat fmt) { + FUSER_PERF_SCOPE("Fusion::serialize"); + + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + out << "\n%kernel {\n"; + IrMathPrinter op_exprs(out); + op_exprs.handle(this); + out << "\nTransformPrinter : \n"; + IrTransformPrinter t_exprs(out); + t_exprs.handle(this); + out << "}\n\n"; + break; + } + case SerializationFormat::Debug: { + break; + } + case SerializationFormat::EndOfOption: + break; + } +} + +void Fusion::printDebug(std::ostream& out) { + FUSER_PERF_SCOPE("Fusion::printDebug"); + + out << "Fusion DEBUG INFO {"; + out << "\n inputs_ = {"; + for (auto& it : inputs_) { + out << "\n " << it->toString(0, SerializationFormat::Debug); + } + out << " }\n"; + out << "\n outputs_ = {"; + for (auto& it : outputs_) { + out << "\n " << it->toString(0, SerializationFormat::Debug); + auto a = getOutputAlias(it); + if (a != nullptr) { + out << " ALIASES " << a->toString(0, SerializationFormat::NameOnly); + } + } + out << " }\n"; + out << "\n all_tv_uses_valid_ = " << all_tv_uses_valid_; + out << "\n is_during_update_uses_ = " << is_during_update_uses_; + out << "\n io_alias_ = {"; + for (auto& it : io_alias_) { // NOTE: ordering arbitrary + out << "\n " << it.first->toString(0, SerializationFormat::Debug) + << " => " << it.second->toString(0, SerializationFormat::Debug); + } + out << " }\n"; + out << "\n permuted_input_map_ = {"; + for (auto& it : permuted_input_map_) { // NOTE: ordering arbitrary + out << "\n " << it.first << " => " << it.second; + } + out << " }\n"; + out << "\n permuted_output_map_ = {"; + for (auto& it : permuted_output_map_) { // NOTE: ordering arbitrary + out << "\n " << it.first << " => " << it.second; + } + out << " }\n"; + + auto ind = " "; + out << ind << " expr_name_counter = " << expr_name_counter_; + out << ind << "\n vals_ (" << vals_.size() << ") = ["; + std::vector<std::string> valstrs; + std::vector<std::tuple<int, std::string, std::string>> all_logs; + for (auto& it : vals_) { // NOTE: ordering arbitrary + std::stringstream obss; + obss << it->toString(3, SerializationFormat::Debug); + auto log = it->getLogMessages(); + if (log.size() > 0) { + auto val_name = it->toString(0, SerializationFormat::NameOnly); + for (auto num_msg : + it->getLogMessages()) { // pair log sequence number & message + all_logs.push_back({num_msg.first, val_name, num_msg.second}); + } + } + valstrs.push_back(obss.str()); + } + std::sort(valstrs.begin(), valstrs.end()); + for (auto& it : valstrs) { // sorted + out << ind << "\n " << it; + } + out << ind << " ]\n"; + out << ind << "\n exprs_ (" << exprs_.size() << ") = [\n"; + std::vector<std::string> expstrs; + for (auto& it : exprs_) { // NOTE: ordering arbitrary + expstrs.push_back(it->toString(3, SerializationFormat::NameOnly)); + } + std::sort(expstrs.begin(), expstrs.end()); + for (auto& it : expstrs) { // sorted + out << ind << it; + } + out << ind << " ]\n"; + out << ind << "\n val_type_name_map_ (" << val_type_name_map_.size() + << ") = {"; + for (auto& it : val_type_name_map_) { // NOTE: ordering arbitrary + out << ind << "\n " << (int)it.first << " => " << it.second; + } + out << ind << " }\n"; + std::sort(all_logs.begin(), all_logs.end()); + out << ind << "\n Logged operations:"; +#ifdef NDEBUG + std::cerr << "WARNING: Fusion operations are only logged in Debug builds." + << std::endl; +#endif + // Actual lognum may be large if there are multiple Fusions defined in this + // process. Instead, just print a local counter for the log messages + // appearing in this Fusion's vals_. + int local_lognum = 0; + for (auto entry : all_logs) { + int lognum; + std::string val_name; + std::string msg; + std::tie(lognum, val_name, msg) = entry; + out << ind << "\n " << local_lognum++ << ") " << val_name << " : " + << msg; + } + out << "\n}\n"; +} + +void Fusion::print(std::ostream& out, SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::print"); FusionGuard fg(this); - std::cout << "\n%kernel {\n"; - IrMathPrinter op_exprs(std::cout); + out << "\n%kernel {\n"; + IrMathPrinter op_exprs(out, fmt); op_exprs.handle(this); - std::cout << "\nTransformPrinter : \n"; - IrTransformPrinter t_exprs(std::cout); + out << "\nTransformPrinter : \n"; + IrTransformPrinter t_exprs(out, fmt); t_exprs.handle(this); - std::cout << "}\n\n"; + out << "}\n\n"; }
", "This would require lowering during lowering."); - std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); + out << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); } std::unordered_map> Fusion::bankConflictInfo( @@ -380,38 +505,54 @@ std::unordered_map> Fusion::bankConflictInfo( return result; } -void Fusion::printMath(bool from_outputs_only) { +void Fusion::printMath( + bool from_outputs_only, + std::ostream& out, + SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::printMath"); - FusionGuard fg(this); - auto exprs_for_print = exprs(); - std::cout << "Inputs:" << std::endl; - for (auto inp : inputs()) { - std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl; - } + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion Math"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + auto exprs_for_print = exprs(); + out << "Inputs:" << std::endl; + for (auto inp : inputs()) { + out << " " << inp << ", " << inp->getDataType().value() << std::endl; + } - std::cout << "Outputs:" << std::endl; - for (auto out : outputs()) { - std::cout << " " << out << ", " << out->getDataType().value() << std::endl; - } + out << "Outputs:" << std::endl; + for (auto output : outputs()) { + out << " " << output << ", " << output->getDataType().value() + << std::endl; + } - // If we want everything in the fusion, grab all values without uses to - // traverse from. - if (!from_outputs_only) { - std::vector leaf_vals; - for (auto val : deterministic_vals()) { - if (val->uses().empty()) { - leaf_vals.push_back(val); + // If we want everything in the fusion, grab all values without uses to + // traverse from. + if (!from_outputs_only) { + std::vector leaf_vals; + for (auto val : deterministic_vals()) { + if (val->uses().empty()) { + leaf_vals.push_back(val); + } + } + exprs_for_print = StmtSort::getExprs(this, leaf_vals); } - } - exprs_for_print = StmtSort::getExprs(this, leaf_vals); - } - std::cout << "\n%kernel_math {\n"; - for (auto expr : exprs_for_print) { - std::cout << expr; + out << "\n%kernel_math {\n"; + for (auto expr : exprs_for_print) { + out << expr; + } + out << "}\n\n"; + break; + } + case SerializationFormat::Debug: + break; + case SerializationFormat::EndOfOption: + break; } - std::cout << "}\n\n"; } std::vector Fusion::inputsAndCreated() { @@ -427,12 +568,25 @@ std::vector Fusion::inputsAndCreated() { return result; } -void Fusion::printTransforms() { +void Fusion::printTransforms(std::ostream& out, SerializationFormat fmt) { FUSER_PERF_SCOPE("Fusion::printTransforms"); - FusionGuard fg(this); - IrTransformPrinter t_exprs(std::cout); - t_exprs.handle(this); + switch (fmt) { + case SerializationFormat::NameOnly: + out << "Fusion Transforms"; + break; + case SerializationFormat::Default: { + FusionGuard fg(this); + IrTransformPrinter t_exprs(out); + t_exprs.handle(this); + break; + } + case SerializationFormat::Debug: { + out << "DEBUG OUTPUT:" << std::endl; + } + case SerializationFormat::EndOfOption: + break; + } } void Fusion::registerVal(Val* val) { @@ -660,6 +814,12 @@ bool Fusion::isAliasCompatible(Val* left, Val* right) { } void Fusion::aliasOutputToInput(Val* output, Val* input) { + VAL_LOG_EXPLICIT( + output, + "Fusion::aliasOutputToInput", + output->toString(0, SerializationFormat::NameOnly), + input->toString(0, SerializationFormat::NameOnly), ); + // Because we could cast output when input is cast. 
TORCH_INTERNAL_ASSERT( !output->isFusionOutput(), diff --git a/third_party/nvfuser/csrc/fusion.h b/third_party/nvfuser/csrc/fusion.h index d8cef33fda0d..5cc867c66643 100644 --- a/third_party/nvfuser/csrc/fusion.h +++ b/third_party/nvfuser/csrc/fusion.h @@ -123,18 +123,36 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { //! Assert that all leaves found from outputs are registered as an input void validateInputs(); + //! Serialize in text or binary form using one of many formats + void serialize(std::ostream& out, SerializationFormat fmt); + + //! Deserialize from the given format + void deserialize(std::istream& in, SerializationFormat fmt); + + //! Print detailed debug information about this fusion to the console + void printDebug(std::ostream& out = std::cout); + //! Print this fusion to the console - void print(); + void print( + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Print Arith exprs //! \param from_outputs_only Only print exprs reachable from outputs - void printMath(bool from_outputs_only = true); + void printMath( + bool from_outputs_only = true, + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Print transformations used in fusion (can be very verbose) - void printTransforms(); + void printTransforms( + std::ostream& out = std::cout, + SerializationFormat fmt = SerializationFormat::Default); //! Lower the fusion and print a kernel - void printKernel(DataType index_type = DataType::Int); + void printKernel( + DataType index_type = DataType::Int, + std::ostream& out = std::cout); //! Returns if this fusion is noop, for example, trivially forwarding inputs, //! or all outputs are size-0 tensors, etc. diff --git a/third_party/nvfuser/csrc/ir_base_nodes.cpp b/third_party/nvfuser/csrc/ir_base_nodes.cpp index fdc520109a96..01f8c7a0ad5a 100644 --- a/third_party/nvfuser/csrc/ir_base_nodes.cpp +++ b/third_party/nvfuser/csrc/ir_base_nodes.cpp @@ -60,12 +60,14 @@ bool Statement::lessThan(const Statement* stmt1, const Statement* stmt2) { return stmt1->name() < stmt2->name(); } -std::string Statement::toString(int indent_size) const { +std::string Statement::toString(int indent_size, SerializationFormat fmt) + const { TORCH_INTERNAL_ASSERT( false, "toString for IR node ", typeid(*this).name(), " is not defined"); } -std::string Statement::toInlineString(int indent_size) const { +std::string Statement::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_INTERNAL_ASSERT( false, "toInlineString for IR node ", diff --git a/third_party/nvfuser/csrc/ir_base_nodes.h b/third_party/nvfuser/csrc/ir_base_nodes.h index a3a03ea447be..190a200f9b99 100644 --- a/third_party/nvfuser/csrc/ir_base_nodes.h +++ b/third_party/nvfuser/csrc/ir_base_nodes.h @@ -166,9 +166,13 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { static bool lessThan(const Statement* stmt1, const Statement* stmt2); - virtual std::string toString(int indent_size = 0) const; + virtual std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const; - virtual std::string toInlineString(int indent_size = 0) const; + virtual std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const; virtual Statement* clone(IrCloner* ir_cloner) const; @@ -361,6 +365,37 @@ class TORCH_CUDA_CU_API Val : public Statement { void resolveIndexDtype(); + //! 
Get vector of log messages for this object + auto getLogMessages() const { + return log_messages_; + } + + //! Write a message to this object's log + void log(std::string op_name, std::vector arg_strings) { +#ifdef NDEBUG + static bool warned = false; + if (!warned) { + std::cout + << " WARNING: Logging is slow and should be disabled in Release builds!" + << std::endl; + warned = true; + } +#endif + static int op_num = 0; + std::stringstream ss; + ss << op_name << "("; + ss << this->toString(0, SerializationFormat::NameOnly) << "):"; + bool skip_comma = true; + for (auto a : arg_strings) { + if (!skip_comma) { + ss << ","; + } + ss << " " << a; + skip_comma = false; + } + log_messages_.push_back({op_num++, ss.str()}); + } + NVFUSER_DECLARE_CLONE protected: @@ -399,6 +434,9 @@ class TORCH_CUDA_CU_API Val : public Statement { // Expr evaluator idx; int evaluator_index_ = -1; + + //! Holds log messages + std::vector> log_messages_; }; //! A Val object that stores a plain data. Note that this class is only intended @@ -425,11 +463,11 @@ class TORCH_CUDA_CU_API Attribute : public Val { return false; } - virtual std::string toString(int) const override { + virtual std::string toString(int, SerializationFormat) const override { return Printer::toString(value); } - virtual std::string toInlineString(int) const override { + virtual std::string toInlineString(int, SerializationFormat) const override { return Printer::toString(value); } }; diff --git a/third_party/nvfuser/csrc/ir_interface_nodes.h b/third_party/nvfuser/csrc/ir_interface_nodes.h index bcdbf45571c2..8e1347016267 100644 --- a/third_party/nvfuser/csrc/ir_interface_nodes.h +++ b/third_party/nvfuser/csrc/ir_interface_nodes.h @@ -62,6 +62,7 @@ class TORCH_CUDA_CU_API Scalar : public Val { (c10::is_complex::value && isComplexType(dtype)), "Invalid data type: ", dtype); + VAL_LOG("Scalar::Scalar", "IrBuilderPasskey", typePrefix(dtype)); } explicit Scalar( @@ -78,14 +79,26 @@ class TORCH_CUDA_CU_API Scalar : public Val { (c10::is_complex::value && isComplexType(dtype)), "Invalid data type: ", dtype); + VAL_LOG( + "Scalar::Scalar", + "IrBuilderPasskey", + "UnderlyingType", + typePrefix(dtype)); } Scalar(const Scalar* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} + : Val(src, ir_cloner), maybe_value_(src->maybe_value_) { + VAL_LOG( + "Scalar::Scalar", + src->toString(0, SerializationFormat::NameOnly), + "IrCloner"); + } NVFUSER_DECLARE_CLONE - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { std::stringstream ss; if (isSymbolic()) { ss << ir_utils::varName(this); @@ -125,13 +138,15 @@ class TORCH_CUDA_CU_API Scalar : public Val { return ss.str(); } - std::string toInlineString(int indent_size = 0) const override { + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { if (definition() != nullptr) { std::stringstream ss; ss << "( " << definition()->toInlineString(indent_size) << " )"; return ss.str(); } else { - return toString(indent_size); + return toString(indent_size, fmt); } } @@ -173,6 +188,16 @@ using ComplexDouble = Scalar>; //! computeAt position as needed during traversal, most inlined will increase //! the compute at position to maximum possible through traversal. 
enum class ComputeAtMode { Standard, BestEffort, MostInlined }; +inline std::string compute_at_mode_to_string(ComputeAtMode mode) { + switch (mode) { + case ComputeAtMode::Standard: + return "Standard"; + case ComputeAtMode::BestEffort: + return "BestEffort"; + case ComputeAtMode::MostInlined: + return "MostInlined"; + } +} class TransformPropagator; struct MostInlinedTransformPropagator; @@ -231,9 +256,13 @@ class TORCH_CUDA_CU_API TensorView : public Val { NVFUSER_DECLARE_CLONE - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorDomain* domain() const { return domain_; diff --git a/third_party/nvfuser/csrc/ir_internal_nodes.h b/third_party/nvfuser/csrc/ir_internal_nodes.h index aa07a7836d05..ebc4b1608c4c 100644 --- a/third_party/nvfuser/csrc/ir_internal_nodes.h +++ b/third_party/nvfuser/csrc/ir_internal_nodes.h @@ -42,8 +42,12 @@ class TORCH_CUDA_CU_API FullOp : public Expr { return "FullOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* getFillValue() const { return inputs().back(); @@ -67,8 +71,12 @@ class TORCH_CUDA_CU_API SelectOp : public Expr { return "SelectOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* getSelectAxis() const { return attribute(0)->as(); @@ -97,8 +105,12 @@ class TORCH_CUDA_CU_API IndexSelectOp : public Expr { return "IndexSelectOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* getSelectAxis() const { return attribute(0)->as(); @@ -125,8 +137,12 @@ class TORCH_CUDA_CU_API TorchGatherOp : public Expr { return "TorchGatherOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* lookupTv() const { return input(0)->as(); @@ -163,8 +179,12 @@ class TORCH_CUDA_CU_API ARangeOp : public Expr { return "ARangeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = 
SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; DataType dtype() const { return attribute(0)->as>()->value; @@ -213,8 +233,12 @@ class TORCH_CUDA_CU_API EyeOp : public Expr { return "EyeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; DataType dtype() const { return attribute(0)->as>()->value; @@ -242,8 +266,12 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -279,8 +307,12 @@ class TORCH_CUDA_CU_API BinaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -325,8 +357,12 @@ class TORCH_CUDA_CU_API TernaryOp : public Expr { virtual std::vector evaluate( const std::vector& inputs) const override; - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -391,8 +427,12 @@ class TORCH_CUDA_CU_API RNGOp : public Expr { return "RNGOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; RNGOpType getRNGOpType() const { return attribute(0)->as>()->value.rtype; @@ -448,8 +488,12 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { return "BroadcastOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -495,8 +539,12 @@ class TORCH_CUDA_CU_API SqueezeOp : public Expr { return "SqueezeOp"; } - 
std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -543,8 +591,12 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { return "ReductionOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -589,8 +641,12 @@ class TORCH_CUDA_CU_API GroupedReductionOp : public Expr { return "GroupedReductionOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Number of expressions grouped horizontally. It does not reflect //! iteration grouping. @@ -781,8 +837,12 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return "WelfordOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return outputTriplet().avg(); @@ -876,8 +936,12 @@ class TORCH_CUDA_CU_API GroupedWelfordOp : public Expr { return "GroupedWelfordOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Number of expressions grouped horizontally. It does not reflect //! iteration grouping. 
As horizontal grouping is not supported, @@ -1017,8 +1081,12 @@ class TORCH_CUDA_CU_API MmaOp : public Expr { return "MmaOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1063,8 +1131,12 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr { return "TransposeOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* out() const { return output(0)->as(); @@ -1097,8 +1169,12 @@ class TORCH_CUDA_CU_API ExpandOp : public Expr { return "ExpandOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; TensorView* out() const { return output(0)->as(); @@ -1134,8 +1210,12 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { return "ShiftOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1183,8 +1263,12 @@ class TORCH_CUDA_CU_API GatherOp : public Expr { return "GatherOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1230,8 +1314,12 @@ class TORCH_CUDA_CU_API ViewAsScalar : public Expr { return "ViewAsScalar"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1264,8 +1352,12 @@ class TORCH_CUDA_CU_API ViewOp : public Expr { return "ViewOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const 
override; Val* out() const { return output(0); @@ -1294,8 +1386,12 @@ class TORCH_CUDA_CU_API LoadStoreOp : public Expr { return "LoadStoreOp"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; Val* out() const { return output(0); @@ -1391,9 +1487,13 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool sameAs(const Statement* other) const override; - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; //! Returns a new IterDomain matching properties of this //! @@ -1721,9 +1821,13 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { const std::vector& lhs, const std::vector& rhs); - std::string toString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; const std::vector& domain() const { return domain_; @@ -1921,8 +2025,12 @@ class TORCH_CUDA_CU_API Split : public Expr { return "Split"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* outer() const { return output(0)->as(); @@ -1979,8 +2087,12 @@ class TORCH_CUDA_CU_API Merge : public Expr { return "Merge"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; IterDomain* out() const { return output(0)->as(); @@ -2013,8 +2125,12 @@ class TORCH_CUDA_CU_API Swizzle2D : public Expr { return "Swizzle2D"; } - std::string toString(int indent_size = 0) const override; - std::string toInlineString(int indent_size = 0) const override; + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override; // Output iterdomain pair corresponding // to the original input iterdomain pair. 
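Every Expr override above follows the same mechanical pattern: widen both printing signatures with a defaulted SerializationFormat parameter, thread fmt through to operand printing, and keep rejecting inline printing for tensor-producing ops. A sketch of that pattern for a hypothetical new op — MyOp and my_op are invented for illustration and mirror FullOp::toString earlier in this patch:

std::string MyOp::toString(int indent_size, SerializationFormat fmt) const {
  std::stringstream ss;
  indent(ss, indent_size) << output(0)->toString() << "\n";
  indent_size++;
  indent(ss, indent_size) << " = my_op("
                          << input(0)->toInlineString(indent_size, fmt)
                          << ");\n";
  return ss.str();
}

std::string MyOp::toInlineString(int indent_size, SerializationFormat fmt)
    const {
  // Tensor ops refuse inline printing, matching the overrides above.
  TORCH_CHECK(false, "Tensor op can not be printed inline");
}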
@@ -2109,11 +2225,15 @@ class TORCH_CUDA_CU_API NamedScalar : public Val { bool sameAs(const Statement* other) const override; - std::string toString(int indent_size = 0) const override { + std::string toString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { return name_; } - std::string toInlineString(int indent_size = 0) const override { + std::string toInlineString( + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) const override { return name_; } diff --git a/third_party/nvfuser/csrc/ir_iostream.cpp b/third_party/nvfuser/csrc/ir_iostream.cpp index 643306c96f88..71c73177f728 100644 --- a/third_party/nvfuser/csrc/ir_iostream.cpp +++ b/third_party/nvfuser/csrc/ir_iostream.cpp @@ -36,7 +36,7 @@ void IrPrinter::handle(Fusion* fusion) { FUSER_PERF_SCOPE("IrPrinter"); resetIndent(); for (const Expr* expr : fusion->exprs()) { - os_ << expr->toString(); + os_ << expr->toString(indent_size_, fmt_); } } diff --git a/third_party/nvfuser/csrc/ir_iostream.h b/third_party/nvfuser/csrc/ir_iostream.h index d8ca3647e0ab..fcb7a2f15df1 100644 --- a/third_party/nvfuser/csrc/ir_iostream.h +++ b/third_party/nvfuser/csrc/ir_iostream.h @@ -3,6 +3,7 @@ #include #include +#include #include @@ -38,8 +39,11 @@ inline std::ostream& indent(std::ostream& os, int indent_size) { //! class TORCH_CUDA_CU_API IrPrinter { public: - explicit IrPrinter(std::ostream& os, int indent_size = 0) - : os_(os), indent_size_(indent_size) {} + explicit IrPrinter( + std::ostream& os, + int indent_size = 0, + SerializationFormat fmt = SerializationFormat::Default) + : os_(os), indent_size_(indent_size), fmt_(fmt) {} virtual ~IrPrinter() {} void resetIndent() { @@ -76,6 +80,7 @@ class TORCH_CUDA_CU_API IrPrinter { std::ostream& os_; bool print_inline_ = false; int indent_size_ = 0; + SerializationFormat fmt_ = SerializationFormat::Default; }; TORCH_CUDA_CU_API std::ostream& operator<<( diff --git a/third_party/nvfuser/csrc/ir_nodes.cpp b/third_party/nvfuser/csrc/ir_nodes.cpp index 4e4b1f5b7e8a..814eb9a71618 100644 --- a/third_party/nvfuser/csrc/ir_nodes.cpp +++ b/third_party/nvfuser/csrc/ir_nodes.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -105,7 +106,7 @@ FullOp::FullOp(IrBuilderPasskey passkey, Val* out, Val* fill_value) addOutput(out); } -std::string FullOp::toString(int indent_size) const { +std::string FullOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -117,13 +118,14 @@ std::string FullOp::toString(int indent_size) const { if (i > 0) { ss << ", "; } - ss << input(i)->toInlineString(indent_size); + ss << input(i)->toInlineString(indent_size, fmt); } ss << ");\n"; return ss.str(); } -std::string FullOp::toInlineString(int indent_size) const { +std::string FullOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -143,7 +145,7 @@ SelectOp::SelectOp( addAttribute(index); } -std::string SelectOp::toString(int indent_size) const { +std::string SelectOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -153,7 +155,8 @@ std::string SelectOp::toString(int indent_size) const { return ss.str(); } -std::string SelectOp::toInlineString(int indent_size) const { +std::string SelectOp::toInlineString(int indent_size, SerializationFormat fmt) + 
const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -174,7 +177,8 @@ IndexSelectOp::IndexSelectOp( addAttribute(IrBuilder::create>(passkey.ir_container_, dim)); } -std::string IndexSelectOp::toString(int indent_size) const { +std::string IndexSelectOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -188,7 +192,9 @@ std::string IndexSelectOp::toString(int indent_size) const { return ss.str(); } -std::string IndexSelectOp::toInlineString(int indent_size) const { +std::string IndexSelectOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -209,7 +215,8 @@ TorchGatherOp::TorchGatherOp( addAttribute(IrBuilder::create>(passkey.ir_container_, dim)); } -std::string TorchGatherOp::toString(int indent_size) const { +std::string TorchGatherOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -223,7 +230,9 @@ std::string TorchGatherOp::toString(int indent_size) const { return ss.str(); } -std::string TorchGatherOp::toInlineString(int indent_size) const { +std::string TorchGatherOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -245,7 +254,7 @@ ARangeOp::ARangeOp( IrBuilder::create>(passkey.ir_container_, dtype)); } -std::string ARangeOp::toString(int indent_size) const { +std::string ARangeOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString(); ss << "\n"; @@ -256,7 +265,8 @@ std::string ARangeOp::toString(int indent_size) const { return ss.str(); } -std::string ARangeOp::toInlineString(int indent_size) const { +std::string ARangeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -276,7 +286,7 @@ EyeOp::EyeOp(IrBuilderPasskey passkey, Val* out, DataType dtype) IrBuilder::create>(passkey.ir_container_, dtype)); } -std::string EyeOp::toString(int indent_size) const { +std::string EyeOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << output(0)->toString() << "\n"; indent_size++; @@ -285,7 +295,8 @@ std::string EyeOp::toString(int indent_size) const { return ss.str(); } -std::string EyeOp::toInlineString(int indent_size) const { +std::string EyeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -363,7 +374,7 @@ void UnaryOp::printHelper(std::stringstream& ss, std::string input) const { } } -std::string UnaryOp::toString(int indent_size) const { +std::string UnaryOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size) << out()->toString(); @@ -378,10 +389,11 @@ std::string UnaryOp::toString(int indent_size) const { return ss.str(); } -std::string UnaryOp::toInlineString(int indent_size) const { +std::string UnaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; - printHelper(ss, in()->toInlineString()); + printHelper(ss, in()->toInlineString(indent_size, fmt)); return ss.str(); } @@ -507,7 +519,7 @@ void BinaryOp::printHelper( } } -std::string 
BinaryOp::toString(int indent_size) const { +std::string BinaryOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size) << out(); @@ -525,11 +537,15 @@ std::string BinaryOp::toString(int indent_size) const { return ss.str(); } -std::string BinaryOp::toInlineString(int indent_size) const { +std::string BinaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; printHelper( - ss, indent_size, lhs()->toInlineString(), rhs()->toInlineString()); + ss, + indent_size, + lhs()->toInlineString(0, fmt), + rhs()->toInlineString(0, fmt)); return ss.str(); } @@ -591,7 +607,8 @@ void TernaryOp::printHelper( ss << ", " << in3 << ")"; } -std::string TernaryOp::toString(int indent_size) const { +std::string TernaryOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; bool istvop = ir_utils::isTvOp(this); indent(ss, indent_size); @@ -611,15 +628,16 @@ std::string TernaryOp::toString(int indent_size) const { return ss.str(); } -std::string TernaryOp::toInlineString(int indent_size) const { +std::string TernaryOp::toInlineString(int indent_size, SerializationFormat fmt) + const { checkInlineable(this); std::stringstream ss; printHelper( ss, indent_size, - in1()->toInlineString(), - in2()->toInlineString(), - in3()->toInlineString()); + in1()->toInlineString(0, fmt), + in2()->toInlineString(0, fmt), + in3()->toInlineString(0, fmt)); return ss.str(); } @@ -650,7 +668,7 @@ RNGOp::RNGOp( addAttribute(philox_index); } -std::string RNGOp::toString(int indent_size) const { +std::string RNGOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size); ss << output(0)->toString() << "\n"; @@ -676,7 +694,8 @@ std::string RNGOp::toString(int indent_size) const { return ss.str(); } -std::string RNGOp::toInlineString(int indent_size) const { +std::string RNGOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -748,14 +767,17 @@ BroadcastOp::BroadcastOp( "The dimensions of output tensor and does not match with is_broadcast_dims and input tensor"); } -std::string BroadcastOp::toString(int indent_size) const { +std::string BroadcastOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << "\n"; indent(ss, indent_size) << " = broadcast( " << in()->toString() << " )\n"; return ss.str(); } -std::string BroadcastOp::toInlineString(int indent_size) const { +std::string BroadcastOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -817,14 +839,16 @@ SqueezeOp::SqueezeOp( "The dimensions of output tensor and does not match with is_squeeze_dims and input tensor"); } -std::string SqueezeOp::toString(int indent_size) const { +std::string SqueezeOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << "\n"; indent(ss, indent_size) << " = squeeze( " << in()->toString() << " )\n"; return ss.str(); } -std::string SqueezeOp::toInlineString(int indent_size) const { +std::string SqueezeOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -869,7 +893,8 @@ ReductionOp::ReductionOp( 
IrBuilder::create>(passkey.ir_container_, is_allreduce)); } -std::string ReductionOp::toString(int indent_size) const { +std::string ReductionOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out() << "\n"; indent(ss, indent_size) << " = reduction( " << in()->toString() @@ -880,7 +905,9 @@ std::string ReductionOp::toString(int indent_size) const { return ss.str(); } -std::string ReductionOp::toInlineString(int indent_size) const { +std::string ReductionOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -912,7 +939,9 @@ GroupedReductionOp::GroupedReductionOp( } } -std::string GroupedReductionOp::toString(int indent_size) const { +std::string GroupedReductionOp::toString( + int indent_size, + SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << "GroupedReductionOp(\n"; ++indent_size; @@ -927,7 +956,9 @@ std::string GroupedReductionOp::toString(int indent_size) const { return ss.str(); } -std::string GroupedReductionOp::toInlineString(int indent_size) const { +std::string GroupedReductionOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1098,7 +1129,8 @@ std::vector WelfordOp::getInitVals() const { return init_vals; } -std::string WelfordOp::toString(int indent_size) const { +std::string WelfordOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << outAvg()->toString() << "(Avg),\n" << outVar()->toString() << "(Var),\n" @@ -1119,7 +1151,8 @@ std::string WelfordOp::toString(int indent_size) const { return ss.str(); } -std::string WelfordOp::toInlineString(int indent_size) const { +std::string WelfordOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1237,7 +1270,8 @@ GroupedWelfordOp::GroupedWelfordOp( } } -std::string GroupedWelfordOp::toString(int indent_size) const { +std::string GroupedWelfordOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << "GroupedWelford(\n"; ++indent_size; @@ -1262,7 +1296,9 @@ std::string GroupedWelfordOp::toString(int indent_size) const { return ss.str(); } -std::string GroupedWelfordOp::toInlineString(int indent_size) const { +std::string GroupedWelfordOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1328,7 +1364,7 @@ MmaOp::MmaOp( attribute(1)->as>()->value = options; } -std::string MmaOp::toString(int indent_size) const { +std::string MmaOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = mma(" << inA()->toString() << "," << inB()->toString(); @@ -1336,7 +1372,8 @@ std::string MmaOp::toString(int indent_size) const { return ss.str(); } -std::string MmaOp::toInlineString(int indent_size) const { +std::string MmaOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1391,14 +1428,17 @@ TransposeOp::TransposeOp( passkey.ir_container_, std::move(new2old))); } -std::string TransposeOp::toString(int indent_size) const { +std::string TransposeOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) 
<< out()->toString() << " = transpose( " << in()->toString() << " )\n"; return ss.str(); } -std::string TransposeOp::toInlineString(int indent_size) const { +std::string TransposeOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1430,7 +1470,7 @@ ExpandOp::ExpandOp( } } -std::string ExpandOp::toString(int indent_size) const { +std::string ExpandOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = expand( " << in() << ", {"; @@ -1444,7 +1484,8 @@ std::string ExpandOp::toString(int indent_size) const { return ss.str(); } -std::string ExpandOp::toInlineString(int indent_size) const { +std::string ExpandOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1490,7 +1531,7 @@ ShiftOp::ShiftOp( passkey.ir_container_, std::move(pad_width))); } -std::string ShiftOp::toString(int indent_size) const { +std::string ShiftOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = shift( " << in()->toString() << ", {" << offsets() << "}, {" @@ -1498,7 +1539,8 @@ std::string ShiftOp::toString(int indent_size) const { return ss.str(); } -std::string ShiftOp::toInlineString(int indent_size) const { +std::string ShiftOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1545,7 +1587,7 @@ GatherOp::GatherOp( passkey.ir_container_, std::move(pad_width))); } -std::string GatherOp::toString(int indent_size) const { +std::string GatherOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = gather( " << in()->toString() << ", {"; @@ -1570,7 +1612,8 @@ std::string GatherOp::toString(int indent_size) const { return ss.str(); } -std::string GatherOp::toInlineString(int indent_size) const { +std::string GatherOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1598,7 +1641,8 @@ ViewAsScalar::ViewAsScalar( addAttribute(index); } -std::string ViewAsScalar::toString(int indent_size) const { +std::string ViewAsScalar::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = view_as_scalar( " << in()->toString() << ", " << vector_id()->toString() @@ -1606,7 +1650,9 @@ std::string ViewAsScalar::toString(int indent_size) const { return ss.str(); } -std::string ViewAsScalar::toInlineString(int indent_size) const { +std::string ViewAsScalar::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1617,14 +1663,15 @@ ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) { addInput(in); } -std::string ViewOp::toString(int indent_size) const { +std::string ViewOp::toString(int indent_size, SerializationFormat fmt) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = view( " << in()->toString() << " )\n"; return ss.str(); } -std::string ViewOp::toInlineString(int indent_size) const { +std::string ViewOp::toInlineString(int indent_size, SerializationFormat fmt) + const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1642,14 +1689,17 @@ 
LoadStoreOp::LoadStoreOp( passkey.ir_container_, op_type)); } -std::string LoadStoreOp::toString(int indent_size) const { +std::string LoadStoreOp::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = " << opType() << "( " << in()->toString() << " )\n"; return ss.str(); } -std::string LoadStoreOp::toInlineString(int indent_size) const { +std::string LoadStoreOp::toInlineString( + int indent_size, + SerializationFormat fmt) const { TORCH_CHECK(false, "Tensor op can not be printed inline"); } @@ -1849,11 +1899,15 @@ bool IterDomain::sameAs(const Statement* other) const { return is_same; } -std::string IterDomain::toString(int indent_size) const { +std::string IterDomain::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; ss << getIterType(); ss << getParallelType(); ss << name(); + if (fmt == SerializationFormat::NameOnly) { + return ss.str(); + } ss << "{"; if (!start()->isZeroInt()) { ss << start()->toInlineString() << " : "; @@ -1872,11 +1926,28 @@ std::string IterDomain::toString(int indent_size) const { if (hasPaddingToMultipleOfWarp()) { ss << "_p"; } + + if (fmt == SerializationFormat::Debug) { + ss << "s" << start_ << "e" << extent_; + if (is_simple_) { + ss << "_simple_"; + } + if (is_mma_swizzled_) { + ss << "_mmaswiz_"; + } + if (expanded_extent_ != nullptr) { + ss << " expanded=" << expanded_extent_->toString(0, fmt); + } + if (stop_offset_ != nullptr) { + ss << " stop_offset_=" << stop_offset_->toString(0, fmt); + } + } return ss.str(); } -std::string IterDomain::toInlineString(int indent_size) const { - return toString(indent_size); +std::string IterDomain::toInlineString(int indent_size, SerializationFormat fmt) + const { + return toString(indent_size, fmt); } // Returns a new IterDomain matching properties of this except for @@ -2062,6 +2133,12 @@ std::pair IterDomain::split( Val* factor, bool inner_split, bool trim_out_of_bounds) { + VAL_LOG_EXPLICIT( + in, + "IterDomain::split", + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); auto start_offset = trim_out_of_bounds ? in->start() : nullptr; auto stop_offset = trim_out_of_bounds ? 
in->stopOffset() : nullptr; return IterDomain::split(in, factor, inner_split, start_offset, stop_offset); @@ -2156,6 +2233,8 @@ void IterDomain::parallelize(ParallelType t) { "Parallel type other than serial, tidx, vectorize not allowed for mma swizzled ids"); } + VAL_LOG("IterDomain::parallelize", stringifyThread(t), ); + parallel_type_ = t; } @@ -2373,25 +2452,62 @@ bool TensorDomain::sameAs( return true; } -std::string TensorDomain::toString(int indent_size) const { +std::string TensorDomain::toString(int indent_size, SerializationFormat fmt) + const { std::stringstream ss; + + if (fmt == SerializationFormat::Debug) { + ss << "TD" << name(); + } + if (nDims() == 0) { ss << "[ 0 ]"; return ss.str(); } ss << "[ "; for (const auto i : c10::irange(nDims())) { - ss << axis(i)->toString(); + ss << axis(i)->toInlineString(0, fmt); if (i != nDims() - 1) { ss << ", "; } } ss << " ]"; + + if (fmt == SerializationFormat::Debug) { + ss << " root_domain_:"; + for (auto& it : root_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " domain_:"; + for (auto& it : domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " no_bcast_domain_:"; + for (auto& it : no_bcast_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " no_reduction_domain_:"; + for (auto& it : no_reduction_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " rfactor_domain_:"; + for (auto& it : rfactor_domain_) { + ss << " " << it->toString(0, fmt); + } + ss << " contiguity_:"; + for (auto it : contiguity_) { + ss << " " << it; + } + ss << " has_reduction_:" << has_reduction_; + } + return ss.str(); } -std::string TensorDomain::toInlineString(int indent_size) const { - return toString(indent_size); +std::string TensorDomain::toInlineString( + int indent_size, + SerializationFormat fmt) const { + return toString(indent_size, fmt); } void TensorDomain::setContiguity(const std::vector& contig) { @@ -2524,6 +2640,13 @@ void TensorDomain::split( !id->isMmaSwizzled(), "Further transformation on warp mapped id's not allowed."); + VAL_LOG( + "TensorDomain::split", + std::to_string(axis_), + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); + auto split_ids = IterDomain::split(id, factor, inner_split, trim_out_of_bounds); domain_.erase(domain_.begin() + axis_); @@ -2550,6 +2673,9 @@ void TensorDomain::merge(int axis_o, int axis_i) { axis_o != axis_i, "Invalid merge detected, axes provided are the same axis."); + VAL_LOG( + "TensorDomain::merge", std::to_string(axis_o), std::to_string(axis_i), ); + if (axis_o > axis_i) { auto tmp = axis_i; axis_i = axis_o; @@ -2576,6 +2702,13 @@ void TensorDomain::reorder(const std::unordered_map& old2new_) { TORCH_INTERNAL_ASSERT( !(nDims() == 0 && old2new_.size() > 0), "Tried to reorder a 0-dim domain"); +#ifndef NDEBUG + std::stringstream ss; + for (auto on : old2new_) { + ss << " " << on.first << "->" << on.second; + } + VAL_LOG("TensorDomain::reorder", ss.str(), ); +#endif domain_ = orderedAs(domain_, old2new_); resetDomains(); } @@ -2708,6 +2841,11 @@ TensorDomain* TensorDomain::view(const AnalyzeViewResult& view_analysis) { } TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) { + VAL_LOG( + "TensorDomain::flatten", + std::to_string(start_dim), + std::to_string(end_dim), ); + auto inp_domain = noReductions(getMaybeRFactorDomain()); if (start_dim < 0) { @@ -2814,7 +2952,7 @@ Split::Split( addAttribute(stop_offset); } -std::string Split::toString(int indent_size) const { +std::string 
+std::string Split::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   ss << (innerSplit() ? "Split: " : "Outer split: ");
   ss << in()->toString();
@@ -2834,7 +2972,8 @@ std::string Split::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Split::toInlineString(int indent_size) const {
+std::string Split::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Split can not be printed inline");
 }
 
@@ -2865,7 +3004,7 @@ Merge::Merge(
   addInput(inner);
 }
 
-std::string Merge::toString(int indent_size) const {
+std::string Merge::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   ss << "Merge: ";
   ss << outer()->toString();
@@ -2877,7 +3016,8 @@ std::string Merge::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Merge::toInlineString(int indent_size) const {
+std::string Merge::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -2902,7 +3042,8 @@ Swizzle2D::Swizzle2D(
       passkey.ir_container_, swizzle_mode));
 }
 
-std::string Swizzle2D::toString(int indent_size) const {
+std::string Swizzle2D::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << swizzleType() << "(2D): ";
   ss << inX()->toString();
@@ -2916,7 +3057,8 @@ std::string Swizzle2D::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Swizzle2D::toInlineString(int indent_size) const {
+std::string Swizzle2D::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
diff --git a/third_party/nvfuser/csrc/ir_printer.h b/third_party/nvfuser/csrc/ir_printer.h
index 8579730568b9..b4b5aff4f895 100644
--- a/third_party/nvfuser/csrc/ir_printer.h
+++ b/third_party/nvfuser/csrc/ir_printer.h
@@ -28,7 +28,10 @@ namespace cuda {
 //!
 class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
  public:
-  IrMathPrinter(std::ostream& os) : IrPrinter(os) {}
+  IrMathPrinter(
+      std::ostream& os,
+      SerializationFormat fmt = SerializationFormat::Default)
+      : IrPrinter(os, 0, fmt) {}
 
   using IrPrinter::handle;
 
@@ -43,7 +46,10 @@ class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
 //!
 class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
  public:
-  IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}
+  IrTransformPrinter(
+      std::ostream& os,
+      SerializationFormat fmt = SerializationFormat::Default)
+      : IrPrinter(os, 0, fmt) {}
 
   using IrPrinter::handle;
 
diff --git a/third_party/nvfuser/csrc/kernel_ir.cpp b/third_party/nvfuser/csrc/kernel_ir.cpp
index 1983156c4dd5..6b124c39430b 100644
--- a/third_party/nvfuser/csrc/kernel_ir.cpp
+++ b/third_party/nvfuser/csrc/kernel_ir.cpp
@@ -61,14 +61,16 @@ Predicate::Predicate(IrBuilderPasskey passkey, Bool* value)
   TORCH_INTERNAL_ASSERT(value != nullptr);
 }
 
-std::string Predicate::toString(int indent_size) const {
+std::string Predicate::toString(int indent_size, SerializationFormat fmt)
+    const {
   if (predicate_type() == PredicateType::Manual) {
-    return value()->toString();
+    return value()->toString(0, fmt);
   }
   return predicate_type2string(predicate_type());
 }
 
-std::string Predicate::toInlineString(int indent_size) const {
+std::string Predicate::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   if (predicate_type() == PredicateType::Manual) {
     return value()->toInlineString();
   }
@@ -90,7 +92,8 @@ TensorIndex::TensorIndex(
       "Cannot index with a value other than an int.");
 }
 
-std::string TensorIndex::toString(int indent_size) const {
+std::string TensorIndex::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << ir_utils::varName(this);
   switch (view()->getMemoryType()) {
@@ -107,14 +110,16 @@ std::string TensorIndex::toString(int indent_size) const {
       TORCH_INTERNAL_ASSERT(false, "Unknown tensor memory type.");
   }
   ss << "[";
-  ss << index()->toInlineString(indent_size);
+  ss << index()->toInlineString(indent_size, fmt);
   ss << "]";
   ss << " view( " << ir_utils::varName(view()) << " )";
   return ss.str();
 }
 
-std::string TensorIndex::toInlineString(int indent_size) const {
-  return toString(indent_size);
+std::string TensorIndex::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
+  return toString(indent_size, fmt);
 }
 
 Allocate::Allocate(
@@ -191,7 +196,7 @@ Allocate::Allocate(
       "IR type only valid for Kernel container.");
 }
 
-std::string Allocate::toString(int indent_size) const {
+std::string Allocate::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << buffer()->toString();
   ss << " = ALLOCATE("
@@ -207,7 +212,8 @@ std::string Allocate::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string Allocate::toInlineString(int indent_size) const {
+std::string Allocate::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -221,14 +227,16 @@ BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync) : Expr(passkey) {
       IrBuilder::create<Attribute<bool>>(passkey.ir_container_, war_sync));
 }
 
-std::string BlockSync::toString(int indent_size) const {
+std::string BlockSync::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "BLOCKSYNC(war_hazard="
                           << boolLiteral(isWarHazardSync()) << ")\n";
   return ss.str();
 }
 
-std::string BlockSync::toInlineString(int indent_size) const {
+std::string BlockSync::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -244,14 +252,15 @@ GridSync::GridSync(
   addAttribute(sync_buffer);
 }
 
-std::string GridSync::toString(int indent_size) const {
+std::string GridSync::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "GRIDSYNC(" << syncDims().toString() << ", "
                           << syncBuffer()->toString() << ")\n";
   return ss.str();
 }
 
-std::string GridSync::toInlineString(int indent_size) const {
+std::string GridSync::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -266,13 +275,16 @@ CpAsyncWait::CpAsyncWait(IrBuilderPasskey passkey, unsigned int keep_stages)
       passkey.ir_container_, keep_stages));
 }
 
-std::string CpAsyncWait::toString(int indent_size) const {
+std::string CpAsyncWait::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "CPASYNC_WAIT(" << keepStages() << ")\n";
   return ss.str();
 }
 
-std::string CpAsyncWait::toInlineString(int indent_size) const {
+std::string CpAsyncWait::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -284,13 +296,16 @@ CpAsyncCommit::CpAsyncCommit(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string CpAsyncCommit::toString(int indent_size) const {
+std::string CpAsyncCommit::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "CPASYNC_WAIT()\n";
   return ss.str();
 }
 
-std::string CpAsyncCommit::toInlineString(int indent_size) const {
+std::string CpAsyncCommit::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -302,13 +317,16 @@ InitMagicZero::InitMagicZero(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string InitMagicZero::toString(int indent_size) const {
+std::string InitMagicZero::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "NVFUSER_DEFINE_MAGIC_ZERO\n";
   return ss.str();
 }
 
-std::string InitMagicZero::toInlineString(int indent_size) const {
+std::string InitMagicZero::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -320,22 +338,25 @@ UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey) : Expr(passkey) {
       "IR type only valid for Kernel container.");
 }
 
-std::string UpdateMagicZero::toString(int indent_size) const {
+std::string UpdateMagicZero::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "NVFUSER_UPDATE_MAGIC_ZERO\n";
   return ss.str();
 }
 
-std::string UpdateMagicZero::toInlineString(int indent_size) const {
+std::string UpdateMagicZero::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(UpdateMagicZero)
 
-std::string Scope::toString(int indent_size) const {
+std::string Scope::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   for (auto expr : exprs()) {
-    ss << expr->toString(indent_size);
+    ss << expr->toString(indent_size, fmt);
   }
   return ss.str();
 }
 
@@ -479,7 +500,7 @@ ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other)
       "IR type only valid for Kernel container.");
 }
 
-std::string ForLoop::toString(int indent_size) const {
+std::string ForLoop::toString(int indent_size, SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "FOR " << index()->toString() << " in "
                           << iter_domain()->toString() << ":\n"
@@ -487,7 +508,8 @@ std::string ForLoop::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string ForLoop::toInlineString(int indent_size) const {
+std::string ForLoop::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -672,7 +694,8 @@ IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond)
       IrBuilder::create<Attribute<Scope>>(passkey.ir_container_, this));
 }
 
-std::string IfThenElse::toString(int indent_size) const {
+std::string IfThenElse::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << "IF " << predicate()->toString() << ":\n"
                           << thenBody().toString(indent_size + 1);
@@ -683,7 +706,8 @@ std::string IfThenElse::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string IfThenElse::toInlineString(int indent_size) const {
+std::string IfThenElse::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -716,7 +740,8 @@ GridReduction::GridReduction(
       IrBuilder::create<Attribute<ParallelTypeBitmap>>(passkey.ir_container_));
 }
 
-std::string GridReduction::toString(int indent_size) const {
+std::string GridReduction::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = reduction( "
                           << in()->toString()
@@ -749,7 +774,9 @@ std::string GridReduction::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridReduction::toInlineString(int indent_size) const {
+std::string GridReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -792,7 +819,9 @@ GroupedGridReduction::GroupedGridReduction(
   }
 }
 
-std::string GroupedGridReduction::toString(int indent_size) const {
+std::string GroupedGridReduction::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "GroupedGridReduction(\n";
   ++indent_size;
@@ -828,7 +857,9 @@ std::string GroupedGridReduction::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GroupedGridReduction::toInlineString(int indent_size) const {
+std::string GroupedGridReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -848,7 +879,8 @@ GridBroadcast::GridBroadcast(
   addAttribute(sync_buffer);
 }
 
-std::string GridBroadcast::toString(int indent_size) const {
+std::string GridBroadcast::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   const auto* broadcast_op = this->broadcast_op();
   indent(ss, indent_size) << broadcast_op->out()->toString() << " = "
@@ -861,7 +893,9 @@ std::string GridBroadcast::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridBroadcast::toInlineString(int indent_size) const {
+std::string GridBroadcast::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -891,7 +925,8 @@ GridWelford::GridWelford(
       IrBuilder::create<Attribute<ParallelTypeBitmap>>(passkey.ir_container_));
 }
 
-std::string GridWelford::toString(int indent_size) const {
+std::string GridWelford::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   const auto* welford_op = this->welford_op();
   indent(ss, indent_size) << welford_op->outAvg()->toString() << " (Avg),\n";
@@ -952,7 +987,9 @@ std::string GridWelford::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GridWelford::toInlineString(int indent_size) const {
+std::string GridWelford::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -1043,7 +1080,9 @@ int GroupedGridWelford::getSmemBufferSize(int bdimx, int bdimy, int bdimz)
   return buf_size_for_avg_var * 2 + buf_size_for_N;
 }
 
-std::string GroupedGridWelford::toString(int indent_size) const {
+std::string GroupedGridWelford::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "GroupedGridWelford(\n";
   ++indent_size;
@@ -1095,7 +1134,9 @@ std::string GroupedGridWelford::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GroupedGridWelford::toInlineString(int indent_size) const {
+std::string GroupedGridWelford::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
@@ -1127,14 +1168,18 @@ AllocateFusedReduction::AllocateFusedReduction(
   addAttribute(grid_expr);
 }
 
-std::string AllocateFusedReduction::toString(int indent_size) const {
+std::string AllocateFusedReduction::toString(
+    int indent_size,
+    SerializationFormat fmt) const {
   std::stringstream ss;
   indent(ss, indent_size) << "AllocateFusedReduction(reduction buffer="
                           << out()->toString() << ")\n";
   return ss.str();
 }
 
-std::string AllocateFusedReduction::toInlineString(int indent_size) const {
+std::string AllocateFusedReduction::toInlineString(
+    int indent_size,
+    SerializationFormat fmt) const {
   TORCH_CHECK(false, "Tensor op can not be printed inline");
 }
 
diff --git a/third_party/nvfuser/csrc/kernel_ir.h b/third_party/nvfuser/csrc/kernel_ir.h
index f71553d75447..56846059767f 100644
--- a/third_party/nvfuser/csrc/kernel_ir.h
+++ b/third_party/nvfuser/csrc/kernel_ir.h
@@ -60,9 +60,13 @@ class TORCH_CUDA_CU_API Predicate final : public Val {
 
   explicit Predicate(IrBuilderPasskey passkey, Bool* value);
 
-  std::string toString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   PredicateType predicate_type() const {
     return ptype_;
@@ -140,9 +144,13 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val {
     return const_cast<TensorView*>(view_); // NOLINT
   }
 
-  std::string toString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
  private:
   const TensorView* view_ = nullptr;
@@ -184,8 +192,12 @@ class TORCH_CUDA_CU_API Allocate final : public Expr {
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Val* buffer() const {
     return attributeVal(0);
@@ -237,8 +249,12 @@ class TORCH_CUDA_CU_API BlockSync final : public Expr {
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   // TODO: war_sync_ is only used for testing/validation purposes.
   bool isWarHazardSync() const {
@@ -263,8 +279,12 @@ class TORCH_CUDA_CU_API GridSync final : public Expr {
     return "GridSync";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   ParallelTypeBitmap syncDims() const {
     return attribute(0)->as<Attribute<ParallelTypeBitmap>>()->value;
@@ -288,8 +308,12 @@ class TORCH_CUDA_CU_API CpAsyncWait final : public Expr {
     return "CpAsyncWait";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   //! Returns the remaining number of stages that are not synchronized
   //! after this op.
@@ -313,8 +337,12 @@ class TORCH_CUDA_CU_API CpAsyncCommit final : public Expr {
     return "CpAsyncCommit";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero
@@ -331,8 +359,12 @@ class TORCH_CUDA_CU_API InitMagicZero final : public Expr {
     return "InitMagicZero";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero
@@ -349,8 +381,12 @@ class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr {
     return "UpdateMagicZero";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 };
 
 // TODO(kir): promote to IR node
@@ -358,7 +394,9 @@ class TORCH_CUDA_CU_API Scope {
  public:
   explicit Scope(Expr* owner) : owner_(owner) {}
 
-  std::string toString(int indent_size = 0) const;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const;
 
   const std::vector<Expr*>& exprs() const {
     return exprs_;
   }
@@ -473,8 +511,12 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr {
     return "ForLoop";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Val* index() const {
     return input(0);
@@ -562,8 +604,12 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr {
     return "IfThenElse";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Scope& thenBody() {
     return attribute(0)->as<Attribute<Scope>>()->value;
@@ -616,8 +662,12 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
     return "GridReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   Allocate* reduction_buffer() const {
     return attribute(num_reduction_op_attr)->as<Allocate>();
@@ -688,8 +738,12 @@ class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp {
     return "GroupedGridReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   std::vector<Allocate*> reduction_buffers() const {
     auto offset = numGroupedReductionOpAttr() + 5;
@@ -771,8 +825,12 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr {
     return "GridBroadcast";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   BroadcastOp* broadcast_op() const {
     return attribute(0)->as<BroadcastOp>();
@@ -816,8 +874,12 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
     return "GridWelford";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   WelfordOp* welford_op() const {
     return attribute(0)->as<WelfordOp>();
@@ -894,8 +956,12 @@ class TORCH_CUDA_CU_API GroupedGridWelford final : public GroupedWelfordOp {
     return "GroupedGridWelford";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   std::array<std::vector<Allocate*>, 3> reduction_buffers() const {
     auto offset = numGroupedWelfordOpAttr() + 5;
@@ -1036,8 +1102,12 @@ class TORCH_CUDA_CU_API AllocateFusedReduction final : public Expr {
     return "AllocateFusedReduction";
   }
 
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
+  std::string toString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
+  std::string toInlineString(
+      int indent_size = 0,
+      SerializationFormat fmt = SerializationFormat::Default) const override;
 
   //! GridReduction, GridWelford, GroupedGridReduction or GroupedGridWelford
   Expr* gridExpr() const {
diff --git a/third_party/nvfuser/csrc/scheduler/utils.cpp b/third_party/nvfuser/csrc/scheduler/utils.cpp
index d0ddbe8a7922..4ae0fd02afca 100644
--- a/third_party/nvfuser/csrc/scheduler/utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/utils.cpp
@@ -233,6 +233,38 @@ void parallelizeAllLike(
     std::vector<TensorView*> selected_tvs,
     const std::unordered_set<ParallelType>& selected_parallel_types,
     bool propagate_padding) {
+#ifndef NDEBUG
+  {
+    std::stringstream ss;
+    ss << "{";
+    bool comma = false;
+    for (auto t : selected_tvs) {
+      if (comma)
+        ss << ", ";
+      ss << t->toString(0, SerializationFormat::NameOnly);
+      comma = true;
+    }
+    ss << " }";
+    std::stringstream ssp;
+    ssp << "{";
+    comma = false;
+    for (auto t : selected_parallel_types) {
+      if (comma)
+        ssp << ", ";
+      ssp << stringifyThread(t);
+      comma = true;
+    }
+    ssp << " }";
+    VAL_LOG_EXPLICIT(
+        reference_tv,
+        "scheduler_utils::parallelizeAllLike",
+        std::to_string(pos),
+        ss.str(),
+        ssp.str(),
+        std::to_string(propagate_padding));
+  }
+#endif
+
   FusionGuard fg(reference_tv->fusion());
 
   if (pos < 0) {
diff --git a/third_party/nvfuser/csrc/tensor_view.cpp b/third_party/nvfuser/csrc/tensor_view.cpp
index 8c736dc3f681..40c2a08f3737 100644
--- a/third_party/nvfuser/csrc/tensor_view.cpp
+++ b/third_party/nvfuser/csrc/tensor_view.cpp
@@ -36,7 +36,14 @@ TensorView::TensorView(
     MemoryType mtype)
     : Val(passkey, ValType::TensorView, dtype),
       domain_(domain),
-      memory_type_(mtype) {}
+      memory_type_(mtype) {
+  VAL_LOG(
+      "TensorView::TensorView",
+      "IrBuilderPasskey",
+      domain->toString(0, SerializationFormat::NameOnly),
+      typePrefix(dtype),
+      "MemoryType");
+}
 
 TensorView::TensorView(
     IrBuilderPasskey passkey,
@@ -44,6 +51,7 @@ TensorView::TensorView(
     : Val(passkey,
           ValType::TensorView,
           aten_opt_type_map(tensor_type->scalarType())) {
+  VAL_LOG("TensorView::TensorView", "IrBuilderPasskey", "TensorType");
   TORCH_INTERNAL_ASSERT(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
@@ -135,6 +143,7 @@ TensorView::TensorView(
     IrBuilderPasskey passkey,
     const std::shared_ptr<Value>& jit_value)
     : TensorView(passkey, jit_value->type()->cast<TensorType>()) {
+  VAL_LOG("TensorView::TensorView", "IrBuilderPasskey", "Value");
   TORCH_INTERNAL_ASSERT(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
@@ -142,7 +151,8 @@ TensorView::TensorView(
 
 NVFUSER_DEFINE_CLONE(TensorView)
 
-std::string TensorView::toString(int indent_size) const {
+std::string TensorView::toString(int indent_size, SerializationFormat fmt)
+    const {
   std::stringstream ss;
   ss << ir_utils::varName(this);
   switch (getMemoryType()) {
@@ -158,7 +168,10 @@ std::string TensorView::toString(int indent_size) const {
     default:
       TORCH_INTERNAL_ASSERT(false, "Unknown tensor memory type.");
   }
-  ss << domain()->toString(indent_size);
+  if (fmt == SerializationFormat::NameOnly) {
+    return ss.str();
+  }
+  ss << domain()->toString(indent_size, fmt);
 
   if (getComputeAtPosition() > 0) {
     ss << " ca_pos( ";
@@ -191,11 +204,20 @@ std::string TensorView::toString(int indent_size) const {
     ss << getMaybeMaxProducerPosition();
     ss << " )";
   }
+  if (fmt == SerializationFormat::Debug) {
+    // memory_type_ is already encoded in the prefix printed above
+    ss << " db?" << is_double_buffered_;
+    ss << " cb?" << is_circular_buffered_;
+    ss << " cbs=" << circular_buffer_stage_;
+    ss << " cs=" << cpu_scalar_;
+    ss << " swiz?" << has_swizzle_op_;
+  }
   return ss.str();
 }
 
-std::string TensorView::toInlineString(int indent_size) const {
-  return toString(indent_size);
+std::string TensorView::toInlineString(int indent_size, SerializationFormat fmt)
+    const {
+  return toString(indent_size, fmt);
 }
 
 void TensorView::convertRfactorToRootDomain() {
@@ -349,6 +371,11 @@ void TensorView::inlineAt(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
 
+  VAL_LOG(
+      "TensorView::inlineAt",
+      std::to_string(pos),
+      std::to_string(best_effort));
+
   std::unique_ptr<MaxPosCalculator> calc_owner;
   if (calc == nullptr) {
     calc_owner = std::make_unique<MaxPosCalculator>();
@@ -487,6 +514,13 @@ TensorView* TensorView::computeAt(
   // Make sure this and consumer are not the same tensor, that's illegal
   TORCH_CHECK(!sameAs(consumer), "Cannot call this->computeAt(this, ...)");
 
+  VAL_LOG(
+      "TensorView::computeAt",
+      consumer->toString(0, SerializationFormat::NameOnly),
+      std::to_string(position)
+      // ComputeAtMode is intentionally not logged
+      );
+
   // We support negative axes, so increment it by consumer->nDims() + 1 and make
   // sure the result is within consumer->nDims() + 1. being at consumer->nDims()
   // means producer will be computed inline with consumer, hence the +1.
@@ -514,6 +548,11 @@ void TensorView::computeWith(int pos, bool best_effort) {
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
 
+  VAL_LOG(
+      "TensorView::computeWith",
+      std::to_string(pos),
+      std::to_string(best_effort));
+
   if (isFusionInput()) {
     return;
   }
@@ -731,6 +770,13 @@ TensorView* TensorView::split(
Tensor: ", toString()); + VAL_LOG( + "TensorView::split", + std::to_string(axis_), + factor->toString(0, SerializationFormat::NameOnly), + std::to_string(inner_split), + std::to_string(trim_out_of_bounds), ); + domain()->split(axis_, factor, inner_split, trim_out_of_bounds); return this; } @@ -829,6 +875,9 @@ TensorView* TensorView::reorder(const std::unordered_map& old2new_) { getMaybeMaxProducerPosition()); } + // we'll leave the logging of individual axis reorders to the next line + VAL_LOG("TensorView::reorder"); + domain()->reorder(old2new_); return this; } @@ -952,6 +1001,14 @@ TensorView* TensorView::rFactor(const std::vector& axes) { !definition()->isA(), "For GroupedReductionOp, use TensorView::rFactor(const std::vector& axes, const std::vector& tvs)"); + { + std::stringstream ss; + for (auto ax : axes) { + ss << " " << ax; + } + VAL_LOG("TensorView::rFactor", ss.str()); + } + // Split tensor view into 2 parts auto domain_pair = domain()->rFactor(axes); diff --git a/third_party/nvfuser/csrc/utils.cpp b/third_party/nvfuser/csrc/utils.cpp index 5eaef09fb4b7..cb84c00285da 100644 --- a/third_party/nvfuser/csrc/utils.cpp +++ b/third_party/nvfuser/csrc/utils.cpp @@ -98,6 +98,7 @@ auto parseEnvOptions( auto parseDebugDumpOptions() { const std::unordered_map available_options = { + {"fusion_debug", DebugDumpOption::FusionDebug}, {"fusion_ir", DebugDumpOption::FusionIr}, {"fusion_ir_math", DebugDumpOption::FusionIrMath}, {"fusion_ir_presched", DebugDumpOption::FusionIrPresched}, @@ -184,8 +185,66 @@ const auto& getEnableOptions() { return options; } +//! Parse environment variable that represents one choice from an enum +//! OptionEnum must be an enum like SerializationFormat +template +OptionEnum parseEnvChoice( + const char* option_env_name, + const std::unordered_map& available_options, + const OptionEnum defaultOption) { + // Make sure available_options includes all of the enum values + TORCH_INTERNAL_ASSERT( + available_options.size() == static_cast(OptionEnum::EndOfOption), + "Invalid available option map"); + + auto option = defaultOption; + + if (const char* option_string = std::getenv(option_env_name)) { + auto option_it = available_options.find(std::string(option_string)); + if (option_it == available_options.end()) { + // get vector of valid option strings for error message + std::vector option_values; + std::transform( + available_options.begin(), + available_options.end(), + std::back_inserter(option_values), + [](const auto& kv) { return kv.first; }); + std::sort(option_values.begin(), option_values.end()); + TORCH_CHECK( + false, + "Parsing ", + option_env_name, + " failed. Invalid option: '", + option_string, + "'\nAvailable options (case-sensitive): ", + toDelimitedString(option_values)); + } else { + option = option_it->second; + } + } + + return option; +} + } // namespace +//! 
+auto parsePrintFormat() {
+  const std::unordered_map<std::string, SerializationFormat>
+      available_options = {
+          {"default", SerializationFormat::Default},
+          {"debug", SerializationFormat::Debug},
+          // name_only completes the map: parseEnvChoice asserts that every
+          // enumerator before EndOfOption has an entry.
+          {"name_only", SerializationFormat::NameOnly}};
+
+  return parseEnvChoice(
+      "PYTORCH_NVFUSER_PRINT_FORMAT",
+      available_options,
+      SerializationFormat::Default);
+}
+
+SerializationFormat getPrintFormat() {
+  static const auto format = parsePrintFormat();
+  return format;
+}
+
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
 void debugPrint(const c10::TensorTypePtr& type) {
   std::stringstream sizes_s;
diff --git a/third_party/nvfuser/csrc/utils.h b/third_party/nvfuser/csrc/utils.h
index 0949ce39ad50..a146fd72b7fa 100644
--- a/third_party/nvfuser/csrc/utils.h
+++ b/third_party/nvfuser/csrc/utils.h
@@ -34,6 +34,8 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs);
 //! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
 //!
 enum class DebugDumpOption {
+  FusionDebug, //!< Dump the entire contents of the Fusion object before
+               //!< lowering
   FusionIr, //!< Dump the Fusion IR before lowering
   FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
   FusionIrPresched, //!< Dump the Fusion IR before it is scheduled.
@@ -118,6 +120,32 @@ TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
 TORCH_CUDA_CU_API const std::vector<std::string>& getEnableOptionArguments(
     EnableOption option);
 
+TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
+TORCH_CUDA_CU_API const std::vector<std::string>& getDebugDumpArguments(
+    DebugDumpOption option);
+
+//! Types of serialization formats
+//!
+//! When dumping IR to screen, the text modes can be set through the
+//! `PYTORCH_NVFUSER_PRINT_FORMAT` environment variable
+//!
+//! Note this list includes some formats that are machine-readable and some
+//! that are not, enabling the same entry points to handle pretty-printing and
+//! serde.
+//!
+enum class SerializationFormat {
+  Default, //! [TEXT] Default text format for printing to screen
+  Debug, //! [TEXT] Dump full IR recursively, including all member variables
+         //! and memory locations
+  NameOnly, //! [TEXT] Only print the name of each object
+  // FlatBuffers, //! [BINARY] Dump full IR using FlatBuffers
+  // JSON, //! [TEXT] Dump full IR recursively in JSON format. Uses FlatBuffers
+  // text format.
+  EndOfOption //! Placeholder for counting the number of elements
+};
+
+TORCH_CUDA_CU_API SerializationFormat getPrintFormat();
+
 // Check if fallback path should be used which will dispatch to eagermode if any
 // errors are encountered. Helpful for debugging.
 bool useFallback();
@@ -341,6 +369,16 @@ std::string toDelimitedString(
   return toDelimitedString(vec.begin(), vec.end(), delim);
 }
 
+#ifndef NDEBUG
+//! These macros just pack the variadic arguments into a vector to pass to the
+//! log function
+#define VAL_LOG_EXPLICIT(obj, op_name, ...) obj->log(op_name, {__VA_ARGS__})
+#define VAL_LOG(op_name, ...) log(op_name, {__VA_ARGS__})
+#else
+#define VAL_LOG_EXPLICIT(obj, op_name, ...)
+#define VAL_LOG(op_name, ...)
+#endif
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
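
Usage notes. The three sketches below are reviewer aids, not part of the patch.

With the pieces above, `PYTORCH_NVFUSER_DUMP=fusion_debug` selects `DebugDumpOption::FusionDebug` (a full pre-lowering dump of the Fusion), and `PYTORCH_NVFUSER_PRINT_FORMAT=debug` switches the `toString` overrides to the member-level `Debug` format via `getPrintFormat()`. A minimal sketch of driving both programmatically; only `Fusion`, `SerializationFormat`, `getPrintFormat()`, `print()`, and `printDebug()` come from this patch, while the helper itself and its use of `std::cout` are assumptions:

#include <iostream>

using namespace torch::jit::fuser::cuda;

// Sketch: route a fusion dump according to PYTORCH_NVFUSER_PRINT_FORMAT.
void dumpFusionForInspection(Fusion* fusion) {
  // getPrintFormat() parses the environment variable on first use and caches
  // the result in a function-local static, so the format is process-wide.
  const SerializationFormat fmt = getPrintFormat();
  if (fmt == SerializationFormat::Debug) {
    // Member-by-member dump, including raw pointers and the per-Val op log.
    fusion->printDebug(std::cout);
  } else {
    // Math + transform printout in the requested text format.
    fusion->print(std::cout, fmt);
  }
}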
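
parseEnvChoice is written generically but lives in the anonymous namespace of utils.cpp, so any second user would sit in the same file. A hedged sketch of such a user; the enum, the getter, and the environment variable name are invented for illustration:

// Hypothetical: a second enum-valued knob parsed with parseEnvChoice.
enum class BoundsCheckMode { Off, Strict, EndOfOption };

BoundsCheckMode getBoundsCheckMode() {
  // Every enumerator before EndOfOption needs an entry here, or the
  // TORCH_INTERNAL_ASSERT inside parseEnvChoice fires; that assert is why
  // parsePrintFormat's map lists all three text formats.
  const std::unordered_map<std::string, BoundsCheckMode> available_options = {
      {"off", BoundsCheckMode::Off}, {"strict", BoundsCheckMode::Strict}};
  return parseEnvChoice(
      "PYTORCH_NVFUSER_BOUNDS_CHECK", available_options, BoundsCheckMode::Off);
}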
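
The VAL_LOG / VAL_LOG_EXPLICIT contract is easiest to check in isolation. Below is a self-contained model; FakeVal and main() are scaffolding, and the assumed shape of the real Val::log (added elsewhere in this patch) is an educated guess based on how the macros brace-pack their variadic arguments:

#include <iostream>
#include <string>
#include <vector>

#ifndef NDEBUG
#define VAL_LOG(op_name, ...) log(op_name, {__VA_ARGS__})
#else
#define VAL_LOG(op_name, ...)
#endif

struct FakeVal {
  // Assumed signature: an op name plus a vector of stringified arguments.
  void log(const std::string& op_name, std::vector<std::string> args) {
    std::cout << op_name;
    for (const auto& arg : args) {
      std::cout << " " << arg;
    }
    std::cout << "\n";
  }

  void split(int axis, int factor) {
    (void)axis; // only used inside VAL_LOG, which vanishes under NDEBUG
    (void)factor;
    // Debug build: expands to log("split", {std::to_string(axis), ...}).
    // NDEBUG build: expands to nothing, so the arguments are not evaluated.
    VAL_LOG("split", std::to_string(axis), std::to_string(factor));
  }
};

int main() {
  FakeVal{}.split(0, 128); // debug builds print: split 0 128
  return 0;
}

This is also why call sites can afford relatively expensive arguments such as toString(...): in release builds the expressions disappear entirely rather than being computed and discarded.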