[NNPA] Revise compiler options for quantization (#3043)
* Introduce two new options -nnpa-quant-dynamic and -nnpa-quant-op-types, and remove the old option --nnpa-quantization.

Signed-off-by: Tung D. Le <[email protected]>

tungld authored Jan 16, 2025
1 parent 7b0dd65 commit 5a1e295
Showing 17 changed files with 198 additions and 101 deletions.
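
For context (an illustrative usage note, not part of the commit itself): with an NNPA-enabled build of the onnx-mlir driver, the new flags would be used along the lines of `onnx-mlir --maccel=NNPA --nnpa-quant-dynamic model.onnx` to let the compiler pick the quantization scheme automatically, or `onnx-mlir --maccel=NNPA --nnpa-quant-dynamic=symActivation,symWeight --nnpa-quant-op-types=MatMul model.onnx` to request symmetric quantization for both activations and weights while restricting quantization to MatMul. Here `model.onnx` is a placeholder and the exact invocation may differ per setup; the removed `--nnpa-quantization` flag is no longer accepted.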
7 changes: 7 additions & 0 deletions docs/AddCustomAccelerators.md
@@ -92,6 +92,13 @@ virtual void registerDialects(mlir::DialectRegistry &registry) const = 0;
/// command line options.
virtual void registerPasses(int optLevel) const = 0;

//===--------------------------------------------------------------------===//
// Hooks for both onnx-mlir and onnx-mlir-opt drivers
//===--------------------------------------------------------------------===//

/// Configure passes for the accelerator.
virtual void configurePasses() const = 0;

//===--------------------------------------------------------------------===//
// Hooks for onnx-to-krnl pass
//===--------------------------------------------------------------------===//
7 changes: 7 additions & 0 deletions src/Accelerators/Accelerator.hpp
@@ -108,6 +108,13 @@ class Accelerator {
/// command line options.
virtual void registerPasses(int optLevel) const = 0;

//===--------------------------------------------------------------------===//
// Hooks for both onnx-mlir and onnx-mlir-opt drivers
//===--------------------------------------------------------------------===//

/// Configure passes for the accelerator.
virtual void configurePasses() const = 0;

//===--------------------------------------------------------------------===//
// Hooks for onnx-to-krnl pass
//===--------------------------------------------------------------------===//
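
As a minimal sketch (not code from this commit) of what the new hook asks of a downstream accelerator: the class name `MyAccelerator` is hypothetical, the constructor and remaining pure-virtual hooks are elided, the namespace is assumed to be `onnx_mlir::accel`, and the idea that the real NNPA override resolves its flags via `configurePassesNNPA()` is inferred from the driver change further down in this diff.

#include "src/Accelerators/Accelerator.hpp"

class MyAccelerator final : public onnx_mlir::accel::Accelerator {
public:
  // ... constructor and the other required overrides elided ...

  /// Reconcile command-line options once, before either the onnx-mlir or the
  /// onnx-mlir-opt driver assembles a pass pipeline.
  void configurePasses() const final {
    // For NNPA, this is where configurePassesNNPA() (declared in
    // NNPACompilerUtils.hpp) is assumed to be called, so that quantization
    // and saturation flags are resolved before any pipeline is built.
  }
};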
52 changes: 39 additions & 13 deletions src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
@@ -17,6 +17,10 @@

namespace onnx_mlir {

// Use external storage for the options so that they are globally accessible
std::vector<NNPAQuantOptions> nnpaQuantDynamic; // common for both
std::vector<std::string> nnpaQuantOpTypes; // common for both

llvm::cl::opt<NNPAEmissionTargetType> nnpaEmissionTarget(
llvm::cl::desc("[Optional] Choose NNPA-related target to emit "
"(once selected it will cancel the other targets):"),
@@ -101,6 +105,41 @@ llvm::cl::opt<bool> nnpaEnableSaturation("nnpa-saturation",
"Default is false."),
llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));

llvm::cl::list<NNPAQuantOptions, std::vector<NNPAQuantOptions>>
nnpaQuantDynamicOpt("nnpa-quant-dynamic",
llvm::cl::desc(
"Enable dynamic quantization of the input model. If enabled, it "
"only quantizes from fp32 to i8. If an ONNX operation is already "
"in i8, no quantization is applied to that operation. Optionally, "
"a comma-separated list of quantization options can be specified "
"as its value, e.g. -nnpa-quant-dynamic=symActivation,symWeight."),
llvm::cl::values(clEnumVal(symWeight, "Symmetric quant for weights."),
clEnumVal(asymWeight, "Asymmetric quant for weights."),
clEnumVal(symActivation, "Symmetric quant for activations."),
clEnumVal(asymActivation, "Asymmetric quant for activations."),
// Use an empty string for the case where `--nnpa-quant-dynamic` is
// specified on the command line WITHOUT value, which is different
// from the case where `--nnpa-quant-dynamic` is NOT specified on
// the command line.
clEnumValN(autoQuantOpt, "",
"Compiler automatically finds the best options. Once this "
"option (an empty string) is in the list, the other options "
"are ignored. This is the default option when "
"`-nnpa-quant-dynamic` is specified without any value.")),
llvm::cl::location(nnpaQuantDynamic), llvm::cl::ValueOptional,
llvm::cl::CommaSeparated, llvm::cl::cat(OnnxMlirCommonOptions));

llvm::cl::list<std::string, std::vector<std::string>> nnpaQuantOpTypesOpt(
"nnpa-quant-op-types",
llvm::cl::desc(
"A comma-separated list of types of operations that are quantized. "
"E.g. 'MatMul,Conv'. Strings for types are the same as ONNX operator "
"names in https://onnx.ai/onnx/operators/. Currently, only MatMul is "
"supported. Without specifying this option, the compiler will "
"determine the operation types by itself."),
llvm::cl::location(nnpaQuantOpTypes), llvm::cl::ValueOptional,
llvm::cl::CommaSeparated, llvm::cl::cat(OnnxMlirCommonOptions));

llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPU("nnpa-cpu-dql",
llvm::cl::desc("Use dynamic quantized linear on CPU. Default is false"),
llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));
@@ -111,17 +150,4 @@ llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPUForScaleOffset(
" scale and offset on CPU. Default is false"),
llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));

llvm::cl::opt<NNPAQuantType> nnpaQuantization("nnpa-quantization",
llvm::cl::desc("Enable quantization with a specific type. Only "
"MatMul whose weight is a constant is supported."),
llvm::cl::values(
clEnumVal(DynSymI8,
"Dynamic Quantization to signed integer 8. Asymmetric "
"quant for activations and symmetric quant for weights."),
clEnumVal(SymSymI8,
"Dynamic Quantization to signed integer 8. Symmetric "
"quant for activations and symmetric quant for weights."),
clEnumVal(QNONE, "No quantization (default).")),
llvm::cl::init(QNONE), llvm::cl::cat(OnnxMlirOptions));

} // namespace onnx_mlir
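
To see the LLVM CommandLine pattern used above in isolation (external storage bound with `llvm::cl::location`, plus an empty-named enum value so that passing the flag without a value is distinguishable from omitting it), here is a self-contained sketch that builds against LLVM on its own; the option name `quant-demo` and the enum names are invented for the example.

#include "llvm/Support/CommandLine.h"
#include <vector>

enum DemoQuantOption { symW, asymW, symA, asymA, autoOpt };

// External storage: any translation unit can inspect this vector after
// llvm::cl::ParseCommandLineOptions has run, mirroring nnpaQuantDynamic above.
std::vector<DemoQuantOption> demoQuantStorage;

static llvm::cl::list<DemoQuantOption, std::vector<DemoQuantOption>>
    demoQuantOpt("quant-demo",
        llvm::cl::desc("Demo of -nnpa-quant-dynamic style parsing."),
        llvm::cl::values(clEnumVal(symW, "Symmetric quant for weights."),
            clEnumVal(asymW, "Asymmetric quant for weights."),
            clEnumVal(symA, "Symmetric quant for activations."),
            clEnumVal(asymA, "Asymmetric quant for activations."),
            clEnumValN(autoOpt, "", "Flag given without a value: auto mode.")),
        llvm::cl::location(demoQuantStorage), llvm::cl::ValueOptional,
        llvm::cl::CommaSeparated);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  // "-quant-demo" alone yields {autoOpt}; "-quant-demo=symW,asymA" yields
  // those two values; omitting the flag leaves demoQuantStorage empty.
  return demoQuantStorage.empty() ? 1 : 0;
}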
15 changes: 8 additions & 7 deletions src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
@@ -57,12 +57,12 @@ typedef enum {

// Quantization type
typedef enum {
DynSymI8, /* Dynamic quantization to signed integer 8. Asymmetric quant for
activations and symmetric quant for weights.*/
SymSymI8, /* Dynamic quantization to signed integer 8. Symmetric quant for
activations and symmetric quant for weights.*/
QNONE, /* Only qualifying ops that are faster on NNPA. */
} NNPAQuantType;
symWeight,
asymWeight,
symActivation,
asymActivation,
autoQuantOpt,
} NNPAQuantOptions;

extern llvm::cl::OptionCategory OnnxMlirOptions;
extern llvm::cl::OptionCategory OnnxMlirCommonOptions;
@@ -79,7 +79,8 @@ extern llvm::cl::opt<std::string> nnpaSaveDevicePlacementFile;
extern llvm::cl::opt<bool> nnpaEnableSaturation;
extern llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPU;
extern llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPUForScaleOffset;
extern llvm::cl::opt<NNPAQuantType> nnpaQuantization;
extern std::vector<NNPAQuantOptions> nnpaQuantDynamic;
extern std::vector<std::string> nnpaQuantOpTypes;

} // namespace onnx_mlir
#endif
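
A short sketch of assumed usage (not part of this commit) showing how another translation unit consumes the externally stored option values declared above; the include path follows this repository's layout and may need adjusting.

#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"

// True when -nnpa-quant-dynamic was given on the command line, with or
// without values; an empty vector means the flag was absent entirely.
static bool quantizationRequested() {
  return !onnx_mlir::nnpaQuantDynamic.empty();
}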
51 changes: 48 additions & 3 deletions src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
@@ -49,11 +49,56 @@ using namespace onnx_mlir;
namespace onnx_mlir {

void configurePassesNNPA() {
configureOnnxToZHighLoweringPass(optReport == OptReport::NNPAUnsupportedOps);
// z16 does not support hardware saturation.
// So, force the use of compiler-generated sticks.
if (nnpaEnableSaturation && isLessEqualNNPALevel(NNPALevel::M14))
nnpaEnableCompilerStickUnstick = true;

// Configure ONNXToZHighLoweringPass.
bool isDynQuant = !nnpaQuantDynamic.empty();
// Default/auto mode: symmetric for weights and asymmetric for activations.
bool isActivationSym = false;
bool isWeightSym = true;
std::vector<std::string> quantOpTypes;
if (isDynQuant) {
// Set options for activations and weights if they are given.
// When auto mode is specified, the other specified options are ignored.
if (!llvm::is_contained(nnpaQuantDynamic, NNPAQuantOptions::autoQuantOpt)) {
for (unsigned i = 0; i < nnpaQuantDynamic.size(); ++i) {
switch (nnpaQuantDynamic[i]) {
case NNPAQuantOptions::symWeight:
isWeightSym = true;
break;
case NNPAQuantOptions::asymWeight:
isWeightSym = false;
break;
case NNPAQuantOptions::symActivation:
isActivationSym = true;
break;
case NNPAQuantOptions::asymActivation:
isActivationSym = false;
break;
default:
llvm_unreachable("Unsupported quantization options");
break;
}
}
}
if (!isWeightSym) {
// TODO: Support asymmetric quantization for weights.
llvm::outs()
<< "Asymmetric quantization for weights is not yet supported. "
"Turning off quantization.\n";
isDynQuant = false;
}
if (nnpaQuantOpTypes.empty()) {
quantOpTypes.emplace_back("MatMul");
} else {
quantOpTypes = nnpaQuantOpTypes;
}
}
configureONNXToZHighLoweringPass(optReport == OptReport::NNPAUnsupportedOps,
isDynQuant, isActivationSym, isWeightSym, quantOpTypes);
}

void addONNXToZHighPasses(mlir::PassManager &pm) {
@@ -85,7 +130,8 @@ void addONNXToZHighPasses(mlir::PassManager &pm) {
pm.addNestedPass<func::FuncOp>(
onnx_mlir::createInstrumentPass(instrumentOps, instrumentActions));

pm.addPass(onnx_mlir::createONNXToZHighPass(nnpaQuantization));
// Lowering ONNX to ZHigh.
pm.addPass(onnx_mlir::createONNXToZHighPass());
pm.addNestedPass<func::FuncOp>(onnx_mlir::createShapeInferencePass());

// There are more opportunities for const propagation once all zhigh ops were
@@ -191,7 +237,6 @@ void addPassesNNPA(mlir::OwningOpRef<mlir::ModuleOp> &module,

// Override pass configurations.
configurePasses();
configurePassesNNPA();

// LLVM_DEBUG(llvm::dbgs() << "Adding NNPA passes" << std::endl;);
if (emissionTarget >= EmitONNXIR) {
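
To make the new option handling concrete, a few worked cases based on the logic above: `--nnpa-quant-dynamic` given without a value (auto mode) keeps the defaults of symmetric weights and asymmetric activations and, absent `--nnpa-quant-op-types`, quantizes only MatMul; `--nnpa-quant-dynamic=symActivation,symWeight` makes activations symmetric as well; `--nnpa-quant-dynamic=asymWeight` currently prints the warning and turns quantization back off, since asymmetric weights are not yet supported; and omitting `--nnpa-quant-dynamic` leaves `nnpaQuantDynamic` empty, so the lowering pass is configured with quantization disabled.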
@@ -161,7 +161,7 @@ void DevicePlacementPass::runOnOperation() {

// Disable reporting on NNPA unsupported ops in this pass even if
// `-opt-report=NNPAUnsupportedOps` is specified.
OnnxToZHighLoweringConfiguration::reportOnNNPAUnsupportedOps = 0;
ONNXToZHighLoweringConfiguration::reportOnNNPAUnsupportedOps = 0;

// Run the unknown dimension analysis to help check equality of unknown
// dimensions at compile time.
@@ -200,13 +200,13 @@ void DevicePlacementPass::runOnOperation() {
// Call ONNXToZHigh pass for lowering multiple ONNX ops at once to ZHigh.
// E.g. `onnx.ReLu (onnx.Conv)` to zhigh.Conv.
RewritePatternSet Patterns2(context);
getONNXToZHighMultipleOpPatterns(Patterns2, nnpaQuantization);
getONNXToZHighMultipleOpPatterns(Patterns2);
(void)applyAnalysisConversion(module, target, std::move(Patterns2),
ConversionConfig{.legalizableOps = &legalizedOps2});

// Call ONNXToZHigh pass for lowering a single ONNX op to ZHigh.
RewritePatternSet Patterns3(context);
getONNXToZHighOneOpPatterns(Patterns3, nnpaQuantization);
getONNXToZHighOneOpPatterns(Patterns3);
getONNXToZHighOneOpDynamicallyLegal(&target, &dimAnalysis);
(void)applyAnalysisConversion(module, target, std::move(Patterns3),
ConversionConfig{.legalizableOps = &legalizedOps3});
@@ -27,7 +27,7 @@ using namespace onnx_mlir;

/// Report NNPA unsupported case.
bool onnxToZHighUnsupportedReport(Operation *op, const std::string &message) {
if (OnnxToZHighLoweringConfiguration::reportOnNNPAUnsupportedOps &&
if (ONNXToZHighLoweringConfiguration::reportOnNNPAUnsupportedOps &&
!message.empty()) {
StringAttr opName = op->getName().getIdentifier();
std::string nodeNameStr = getNodeNameInPresenceOfOpt(op);