draft update

ZhangYan · ZhangYan · commit c6a674ddd15f · 2024-07-08T08:43:13.000-07:00
diff --git a/include/gc/Analysis/MatmulConfigAnalysis.h b/include/gc/Analysis/MatmulConfigAnalysis.h
@@ -28,11 +28,29 @@ struct SystemDesc {
   // get runtime OMP_NUM_THREADS
   uint32_t getNumThreads() {
     char *numThreads = getenv("OMP_NUM_THREADS");
-    if (numThreads) {
+    if (!threads_limited && numThreads) { // env value is ignored once limited
       return std::stoi(numThreads);
     }
+    return curThreads; // 1 by default, or the value set by limitOnSingleNode()
+  }
+
+  // Use NUMA_THREADS (if a positive int) as the thread limit; numa_node unused
+  void limitOnSingleNode(uint32_t numa_node) {
+    const char *numaThreads = getenv("NUMA_THREADS");
+    if (const int limit = numaThreads ? std::stoi(numaThreads) : 0; limit > 0) {
+      curThreads = static_cast<uint32_t>(limit); // <= 0 is skipped: 0 threads
+      threads_limited = true; // would also zero the divisor in getNumNodes()
+    }
+  }
+
+  uint32_t getNumNodes() { // OMP_NUM_THREADS / curThreads when limited, else 1
+    char *numThreads = getenv("OMP_NUM_THREADS");
+    if (threads_limited && numThreads && curThreads > 0) { // never divide by 0
+      return std::stoi(numThreads) / curThreads; // NOTE(review): 0 if OMP < limit
+    }
     return 1;
   }
+
   // get cache size by cacheLevel
   size_t getCacheSize(uint8_t cacheLevel) {
     if (cacheLevel == 1) {
@@ -57,6 +75,10 @@ struct SystemDesc {
   SmallVector<size_t> getContractionOperationMaxVectorLength() {
     return {512UL, 512UL};
   }
+
+private:
+  uint32_t curThreads = 1; // returned by getNumThreads() when limited or env unset
+  bool threads_limited = false; // set once limitOnSingleNode() reads NUMA_THREADS
 };
 
 struct MatmulConfig {
diff --git a/lib/gc/Analysis/MatmulConfigAnalysis.cpp b/lib/gc/Analysis/MatmulConfigAnalysis.cpp
@@ -345,6 +345,12 @@ previous matmul
 MatmulConfigAnalysis::MatmulConfigAnalysis(Operation *root) {
   SystemDesc sysDesc;
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(root)) {
+    // Check if the operation has an attribute named 'splited'
+    auto splitedAttr = linalgOp->getAttrOfType<IntegerAttr>("splited");
+    if (splitedAttr) {
+      sysDesc.limitOnSingleNode(splitedAttr.getInt());
+      llvm::outs() << "splited mm, and should be allocated on numa node 0.\n";
+    }
     auto oprandDimType = *getOprandDimType(linalgOp);
     // get the origin M,N,K size
     auto MDimTypeIdx = extractDimTypeIdx(oprandDimType[0], DimType::M);
diff --git a/lib/gc/Transforms/DeepTileContractionNamedOp.cpp b/lib/gc/Transforms/DeepTileContractionNamedOp.cpp
@@ -326,6 +326,7 @@ static void setStaticSizeForExtractSliceOp(RewriterBase &rewriter,
                                            Operation *op, bool isExtract,
                                            SmallVector<int64_t> size,
                                            int shrinDimNum = 0) {
+  llvm::outs() << "^^^^^^^^^^^^^^setStaticSizeForExtractSliceOp^^^^^^^^^^\n";
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPoint(op);
   if (auto extractSlice = dyn_cast<tensor::ExtractSliceOp>(op)) {
@@ -335,6 +336,23 @@ static void setStaticSizeForExtractSliceOp(RewriterBase &rewriter,
     for (auto i = 0UL; i < mixedSizes.size(); i++) {
       mixedSizes[i] = getAsIndexOpFoldResult(rewriter.getContext(), size[i]);
     }
+    llvm::outs() << "mixedOffsets: ";
+    for (auto t : mixedOffsets) {
+      llvm::outs() << t << ", ";
+    }
+    llvm::outs() << "\n";
+
+    llvm::outs() << "mixedSizes: ";
+    for (auto t : mixedSizes) {
+      llvm::outs() << t << ", ";
+    }
+    llvm::outs() << "\n";
+
+    llvm::outs() << "mixedStrides: ";
+    for (auto t : mixedStrides) {
+      llvm::outs() << t << ", ";
+    }
+    llvm::outs() << "\n";
     if (shrinDimNum > 0) {
       rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
           extractSlice,
@@ -348,6 +366,7 @@ static void setStaticSizeForExtractSliceOp(RewriterBase &rewriter,
           mixedStrides);
     }
   }
+  llvm::outs() << "^^^^^^^^^^^^^^setStaticSizeForExtractSliceOp^^^^^^^^^^\n";
 }
 
 static void setStaticSizeForInsertSliceOp(RewriterBase &rewriter, Operation *op,
@@ -398,6 +417,7 @@ struct OuterLoopGenerationResult {
 static FailureOr<OuterLoopGenerationResult>
 generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
                   const OuterLoopGenerationOption &option) {
+  llvm::outs() << "======================================\n";
   // TODO: handle the return value
   OuterLoopGenerationResult result;
   auto nestedTileSizes = option.nestedTileSizes;
@@ -471,40 +491,82 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
         else
           tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), tile);
       }
+
+      llvm::outs() << "tileSizes: ";
+      for (auto t : tileSizes) {
+        llvm::outs() << t << ", ";
+      }
+      llvm::outs() << "\n";
+
+      llvm::outs() << "threads: ";
+      for (auto t : threads) {
+        llvm::outs() << t << ", ";
+      }
+      llvm::outs() << "\n";
+
       SmallVector<Range> loopRanges =
           cast<TilingInterface>(currentOp.getOperation()).getIterationDomain(b);
       OpBuilder::InsertionGuard guard(b);
       b.setInsertionPoint(currentOp);
       if (auto partialInterface =
               dyn_cast<PartialReductionOpInterface>(currentOp.getOperation())) {
+        llvm::outs() << "PartialReductionOpInterface\n";
         for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
           if (isConstantIntValue(tile, 0)) {
             tileSizes[idx] = loopRanges[idx].size;
           }
         }
-
+        llvm::outs() << "updated tileSizes: ";
+        for (auto t : tileSizes) {
+          llvm::outs() << t << ", ";
+        }
+        llvm::outs() << "\n";
         SmallVector<OpFoldResult> newParallelDims;
         for (auto i = 0UL; i < reductionDims.size(); i++) {
           newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
         }
-        auto tilingResult = linalgX::tileAllUsingForall(
-            b, cast<PartialReductionOpInterface>(currentOp.getOperation()), {},
-            tileSizes, newParallelDims, std::nullopt);
-        if (failed(tilingResult) &&
-            tilingResult->parallelTiledOps.size() == 1UL)
-          return failure();
-        currentOp =
-            dyn_cast<linalg::LinalgOp>(tilingResult->parallelTiledOps.back());
-        if (!tilingResult->mergeOps.empty()) {
-          for (const auto &fn : option.finalReduceCallBacks) {
-            auto result = fn(b, currentOp.getLoc(), *tilingResult);
-            if (succeeded(result)) {
-              currentOp = *result;
+        if (currentTileSize.front() != 16 || true) {
+          auto tilingResult = linalgX::tileAllUsingForall(
+              b, cast<PartialReductionOpInterface>(currentOp.getOperation()),
+              {}, tileSizes, newParallelDims, std::nullopt);
+          if (failed(tilingResult) &&
+              tilingResult->parallelTiledOps.size() == 1UL)
+            return failure();
+          currentOp =
+              dyn_cast<linalg::LinalgOp>(tilingResult->parallelTiledOps.back());
+          if (!tilingResult->mergeOps.empty()) {
+            llvm::outs() << "has merge ops\n";
+            for (const auto &fn : option.finalReduceCallBacks) {
+              auto result = fn(b, currentOp.getLoc(), *tilingResult);
+              if (succeeded(result)) {
+                currentOp = *result;
+              }
             }
           }
+        } else {
+          llvm::outs() << "handle special cases\n";
+          OpBuilder::InsertionGuard g(b);
+
+          Location loc = currentOp.getLoc();
+          SmallVector<Value> dest;
+          if (failed(tensor::getOrCreateDestinations(b, loc, currentOp, dest)))
+            return b.notifyMatchFailure(currentOp,
+                                        "failed to get destination tensors");
+          arith::ConstantIndexOp lb = b.create<arith::ConstantIndexOp>(loc, 0);
+          arith::ConstantIndexOp ub = b.create<arith::ConstantIndexOp>(loc, 2);
+          arith::ConstantIndexOp step =
+              b.create<arith::ConstantIndexOp>(loc, 1);
+
+          Operation *forallOp = b.create<scf::ForallOp>(
+              loc, ArrayRef<OpFoldResult>(lb->getResult(0)),
+              ArrayRef<OpFoldResult>(ub->getResult(0)),
+              ArrayRef<OpFoldResult>(step->getResult(0)), dest, std::nullopt);
+          currentOp = dyn_cast<linalg::LinalgOp>(forallOp);
         }
+
       } else if (auto tilingInterface =
                      cast<TilingInterface>(currentOp.getOperation())) {
+        llvm::outs() << "TilingInterface\n";
         auto tilingResult = linalg::tileToForallOpUsingTileSizes(
             b, tilingInterface, tileSizes, std::nullopt);
         if (failed(tilingResult))
@@ -515,6 +577,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
     }
   }
   result.tiledOps.emplace_back(currentOp);
+  llvm::outs() << "======================================\n";
   return result;
 }
 
@@ -595,6 +658,11 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
     auto NOuterBlockSize = NDimPos.size() > 1
                                ? (cfg.NBlock - 1) / cfg.innerMostNBlock + 1
                                : cfg.NBlock;
+    // Outermost Numa loop
+    option.nestedTileSizes.emplace_back(
+        SmallVector<size_t>{uint32_t(MFirstDim / 2)});
+    option.loopType.emplace_back(OuterLoopGenerationOption::LoopType::ForallOp);
+    option.loopDim.emplace_back(SmallVector<size_t>{MDimPos[0]});
     // Outer
     option.nestedTileSizes.emplace_back(SmallVector<size_t>{
         MParallelBlockSize, NParallelBlockSize, KParallelBlockSize});
diff --git a/lib/gc/Transforms/Tiling.cpp b/lib/gc/Transforms/Tiling.cpp