
Commit 510932e

support dlti

committed
1 parent 5bc5e37 commit 510932e

File tree

include/gc/Analysis/MatmulConfigAnalysis.h
lib/gc/Analysis/MatmulConfigAnalysis.cpp
lib/gc/Transforms/DeepTileContractionNamedOp.cpp
lib/gc/Transforms/TilingUtil.hpp
test/gc/Transform/deepTileContractionNamedOp.mlir

5 files changed: +99 -31 lines changed

include/gc/Analysis/MatmulConfigAnalysis.h

+43-25
@@ -10,64 +10,82 @@
 #define MLIR_ANALYSIS_MATMULCONFIGANALYSIS_H
 
 #include "gc/Dialect/Linalgx/LinalgxOps.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include <cstring>
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 
 namespace mlir {
 namespace gc {
 
 using namespace mlir;
 
-// A mock for the taget information
-// TODO: replace it with upstream hardware description model
 struct SystemDesc {
-
-  static int getPositiveIntFromStr(char *str, int defaultValue = 1) {
-    if (!str || strlen(str) == 0 || str[0] > '9' || str[0] < '0') {
-      return defaultValue;
-    }
-    auto val = std::stoi(str);
-    return val > 0 ? val : defaultValue;
-  }
-
   // get runtime OMP_NUM_THREADS
   uint32_t getNumThreads() {
-    char *numThreads = getenv("OMP_NUM_THREADS");
-    return getPositiveIntFromStr(numThreads, 1);
+    std::optional<Attribute> numThreads = layout.getDevicePropertyValue(
+        Builder(ctx).getStringAttr("CPU" /* device ID*/),
+        Builder(ctx).getStringAttr("num_threads"));
+    if (numThreads && isa<IntegerAttr>(*numThreads)) {
+      return dyn_cast<IntegerAttr>(*numThreads).getInt();
+    }
+    return 1;
   }
   // get cache size by cacheLevel
   size_t getCacheSize(uint8_t cacheLevel) {
     if (cacheLevel == 1) {
-      char *cacheSize = getenv("L1_CACHE_SIZE");
-      return getPositiveIntFromStr(cacheSize, 0);
+      std::optional<Attribute> cacheSize = layout.getDevicePropertyValue(
+          Builder(ctx).getStringAttr("CPU" /* device ID*/),
+          Builder(ctx).getStringAttr("L1_cache_size_in_bytes"));
+      if (cacheSize && isa<IntegerAttr>(*cacheSize)) {
+        return dyn_cast<IntegerAttr>(*cacheSize).getInt();
+      }
     } else if (cacheLevel == 2) {
-      char *cacheSize = getenv("L2_CACHE_SIZE");
-      return getPositiveIntFromStr(cacheSize, 0);
+      std::optional<Attribute> cacheSize = layout.getDevicePropertyValue(
+          Builder(ctx).getStringAttr("CPU" /* device ID*/),
+          Builder(ctx).getStringAttr("L2_cache_size_in_bytes"));
+      if (cacheSize && isa<IntegerAttr>(*cacheSize)) {
+        return dyn_cast<IntegerAttr>(*cacheSize).getInt();
+      }
     } else if (cacheLevel == 3) {
-      char *cacheSize = getenv("L3_CACHE_SIZE");
-      return getPositiveIntFromStr(cacheSize, 0);
+      std::optional<Attribute> cacheSize = layout.getDevicePropertyValue(
+          Builder(ctx).getStringAttr("CPU" /* device ID*/),
+          Builder(ctx).getStringAttr("L3_cache_size_in_bytes"));
+      if (cacheSize && isa<IntegerAttr>(*cacheSize)) {
+        return dyn_cast<IntegerAttr>(*cacheSize).getInt();
+      }
     }
     return 0;
   }
 
   // get the maximum vector length in bits
   size_t getMaxVectorLength() {
-    char *maxVectorLanes = getenv("MAX_VECTOR_LENGTH");
-    return getPositiveIntFromStr(maxVectorLanes, 512);
+    std::optional<Attribute> maxVectorLength = layout.getDevicePropertyValue(
+        Builder(ctx).getStringAttr("CPU" /* device ID*/),
+        Builder(ctx).getStringAttr("max_vector_width"));
+    if (maxVectorLength && isa<IntegerAttr>(*maxVectorLength)) {
+      return dyn_cast<IntegerAttr>(*maxVectorLength).getInt();
+    }
+    return 512;
   }
+
+  SystemDesc(ModuleOp m) : layout(m), ctx(m->getContext()) {}
+
+private:
+  DataLayout layout;
+  MLIRContext *ctx;
 };
 
 // The configuration for matmul tiling
 // TODO: support batch matmul
 struct MatmulConfig {
   // The number of threads distributed to M, N, K
   uint32_t MThreads, NThreads, KThreads;
-  // The innermost block size for M, N, K which will be directly converted to
-  // brgemm.
-  uint32_t innerMostMBlock, innerMostNBlock, innerMostKBlock;
   // The outer block size for M, N, K which will be used to decide the loop tile
   // size in single thread
   uint32_t MBlock, NBlock, KBlock;
+  // The innermost block size for M, N, K which will be directly converted to
+  // brgemm.
+  uint32_t innerMostMBlock, innerMostNBlock, innerMostKBlock;
 };
 
 enum DimType { Batch, M, N, K };
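
For orientation, here is a minimal usage sketch of the reworked SystemDesc; it is not part of the commit, and the surrounding function is hypothetical. The struct is now constructed from the enclosing ModuleOp and answers each query from the module's dlti.target_system_spec through DataLayout, falling back to the defaults shown above (1 thread, 0 bytes, 512 bits) when an entry is missing or is not an IntegerAttr.

// Hypothetical usage sketch (not from this commit); assumes the header above
// is included and that llvm/Support/raw_ostream.h is available.
static void dumpCpuDesc(mlir::ModuleOp module) {
  mlir::gc::SystemDesc sysDesc(module);          // resolves DLTI entries via DataLayout
  uint32_t threads = sysDesc.getNumThreads();    // "num_threads", default 1
  size_t l2Bytes = sysDesc.getCacheSize(2);      // "L2_cache_size_in_bytes", default 0
  size_t vecBits = sysDesc.getMaxVectorLength(); // "max_vector_width", default 512
  llvm::outs() << threads << " threads, L2 = " << l2Bytes << " bytes, "
               << vecBits << "-bit vectors\n";
}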

lib/gc/Analysis/MatmulConfigAnalysis.cpp

+6-5
@@ -88,6 +88,7 @@ double vectorRegEfficiencyCost(linalg::LinalgOp &linalgOp,
   size_t dtypeSize = DataLayout().getTypeSizeInBits(
       ShapeAdaptor(linalgOp.getDpsInputs()[1].getType()).getElementType());
   size_t maxVectorLength = sysDesc.getMaxVectorLength() / dtypeSize;
+  // TODO: take matrix register like amx into account
   double cost = (maxVectorLength - config.innerMostMBlock % maxVectorLength) %
                     maxVectorLength * 1.0 / config.innerMostMBlock +
                 (maxVectorLength - config.innerMostKBlock % maxVectorLength) %
@@ -270,8 +271,8 @@ prepareConfigCandidates(Operation *root, SystemDesc &sysDesc,
           continue;
         }
         MatmulConfig config{
-            MBlock, NBlock, KBlock,
             MThreads, NThreads, KThreads,
+            MBlock, NBlock, KBlock,
             innerMostMBlock, innerMostNBlock, innerMostKBlock};
         configs.push_back(config);
       }
@@ -311,13 +312,13 @@ bool readConfigFromAttrs(MatmulConfig &config, ArrayRef<NamedAttribute> attrs) {
     } else if (attr.getName() == "MThreads") {
      config.MThreads = cast<IntegerAttr>(attr.getValue()).getInt();
       cfgItemCnt++;
-    } else if (attr.getName() == "innerMostMBlock") {
+    } else if (attr.getName() == "innermostMBlock") {
       config.innerMostMBlock = cast<IntegerAttr>(attr.getValue()).getInt();
       cfgItemCnt++;
-    } else if (attr.getName() == "innerMostNBlock") {
+    } else if (attr.getName() == "innermostNBlock") {
       config.innerMostNBlock = cast<IntegerAttr>(attr.getValue()).getInt();
       cfgItemCnt++;
-    } else if (attr.getName() == "innerMostKBlock") {
+    } else if (attr.getName() == "innermostKBlock") {
       config.innerMostKBlock = cast<IntegerAttr>(attr.getValue()).getInt();
       cfgItemCnt++;
     }
@@ -338,7 +339,7 @@ bool readConfigFromAttrs(MatmulConfig &config, ArrayRef<NamedAttribute> attrs) {
 // previous matmul
 MatmulConfigAnalysis::MatmulConfigAnalysis(Operation *root) {
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(root)) {
-    SystemDesc sysDesc;
+    SystemDesc sysDesc(root->getParentOfType<ModuleOp>());
     SmallVector<SmallVector<DimType>> oprandDimType =
         *getOprandDimType(linalgOp);
     // get the origin M,N,K size
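
Note that the attribute keys the override path now expects are spelled "innermostMBlock", "innermostNBlock", and "innermostKBlock" (lowercase "most"), even though the C++ fields keep the innerMost casing. Below is a hypothetical annotation sketch, assuming the analysis reads the config from the matmul op's attribute dictionary; the helper name and tile values are illustrative only and not part of this commit.

// Hypothetical sketch (not from this commit): pinning a config with the
// renamed keys so readConfigFromAttrs picks them up.
static void annotateMatmul(mlir::linalg::LinalgOp op, mlir::OpBuilder &b) {
  op->setAttr("MThreads", b.getI32IntegerAttr(56));
  op->setAttr("innermostMBlock", b.getI32IntegerAttr(32)); // was "innerMostMBlock"
  op->setAttr("innermostNBlock", b.getI32IntegerAttr(32)); // was "innerMostNBlock"
  op->setAttr("innermostKBlock", b.getI32IntegerAttr(32)); // was "innerMostKBlock"
}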

lib/gc/Transforms/DeepTileContractionNamedOp.cpp

+1-1
@@ -243,7 +243,7 @@ static Operation *findParentFillOp(Value val) {
          llvm::find(skipOpList, currentOp->getName().getStringRef()) !=
              skipOpList.end() &&
          !isa<linalg::FillOp>(currentOp)) {
-    currentOp = currentOp->getResult(0).getDefiningOp();
+    currentOp = currentOp->getOperand(0).getDefiningOp();
   }
   if (currentOp && isa<linalg::FillOp>(currentOp)) {
     return currentOp;
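
A short aside on why this is the fix: for any operation, getResult(0).getDefiningOp() is that operation itself, so the old loop could never advance past an op in skipOpList, whereas getOperand(0).getDefiningOp() steps to the producer of the op's first input. The standalone snippet below illustrates the corrected walking pattern; it is a hypothetical sketch, not code from the patch.

// Hypothetical illustration (not from this commit) of walking up a use-def
// chain toward a producer, as findParentFillOp now does.
static mlir::Operation *stepToProducer(mlir::Operation *op) {
  if (!op || op->getNumOperands() == 0)
    return nullptr;
  // Value::getDefiningOp() is null when the operand is a block argument.
  return op->getOperand(0).getDefiningOp();
}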

lib/gc/Transforms/TilingUtil.hpp

+2
@@ -16,6 +16,8 @@
 namespace mlir {
 namespace linalgX {
 
+// An enahncement for the upstream pass to support tiling reduction for MKmk
+// like cases(with multiple reduction iterators).
 FailureOr<linalg::ForallReductionTilingResult> tileReductionUsingForall(
     RewriterBase &b, PartialReductionOpInterface op,
     ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,

test/gc/Transform/deepTileContractionNamedOp.mlir

+47
@@ -108,3 +108,50 @@ func.func @matmul_2Dx4D_bf16(%arg0: tensor<4096x4096xbf16>, %arg1: tensor<128x12
   return %2 : tensor<4096x4096xbf16>
 }
 
+// -----
+
+module attributes {
+  dlti.target_system_spec = #dlti.target_system_spec<
+    "CPU": #dlti.target_device_spec<
+      #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : i32>,
+      #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : i32>,
+      #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : i32>,
+      #dlti.dl_entry<"num_threads", 56 : i32>,
+      #dlti.dl_entry<"max_vector_width", 512 : i32>>
+>} {
+  /// CHECK-LABEL: @matmul_2Dx4D_bf16_with_dlti
+  func.func @matmul_2Dx4D_bf16_with_dlti(%arg0: tensor<4096x4096xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<4096x4096xbf16> {
+    %cst_0 = arith.constant 0.000000e+00 : bf16
+    %0 = tensor.empty() : tensor<4096x4096xbf16>
+    %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    // CHECK: scf.forall
+    // CHECK: tensor.extract_slice
+    // CHECK: scf.forall
+    // CHECK: tensor.extract_slice
+    // CHECK: scf.forall
+    // CHECK: tensor.extract_slice
+    // CHECK: scf.for
+    // CHECK: tensor.extract_slice
+    // CHECK: scf.for
+    // CHECK: scf.for
+    // CHECK: tensor.extract_slice
+    // CHECK: tensor.extract_slice
+    // CHECK: scf.for
+    // CHECK: tensor.extract_slice
+    // CHECK: tensor.extract_slice
+    // CHECK: linalg.transpose
+    // CHECK: scf.if
+    // CHECK: linalg.fill
+    // CHECK: linalgx.batch_reduce_matmul_vnni
+    // CHECK: else
+    // CHECK: linalgx.batch_reduce_matmul_vnni
+    // CHECK: scf.forall.in_parallel
+    // CHECK: scf.forall.in_parallel
+    // CHECK: scf.forall.in_parallel
+    // CHECK: linalg.reduce
+    // CHECK: linalg.copy
+    %2 = linalgx.mm2d_vnni ins(%arg0, %arg1 : tensor<4096x4096xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    return %2 : tensor<4096x4096xbf16>
+  }
+
+}

0 commit comments