Commit ba356ab

Merge branch 'main' into xiaohui/vectorization
2 parents d49715c + 664024a commit ba356ab

File tree

18 files changed: +1314 -132 lines changed

cmake/imex-version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-6c2e414a953b9a118bce6adac21cf9d42630e674
+8209807be6148d81fda6f439a01b77696986dd3e

cmake/llvm-version-imex.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-f06563a5c0d239a6b98f74db522681613254ad08
+3191587666aa3d1e53966bc8876614c7197fac4f

cmake/llvm-version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-f06563a5c0d239a6b98f74db522681613254ad08
+3191587666aa3d1e53966bc8876614c7197fac4f
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+//===-- GpuOclRuntime.h - GPU OpenCL runtime --------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_GPUOCLRUNTIME_H
+#define GC_GPUOCLRUNTIME_H
+
+namespace mlir::gc::gpu {
+constexpr char GPU_OCL_MALLOC[] = "gcGpuOclMalloc";
+constexpr char GPU_OCL_DEALLOC[] = "gcGpuOclDealloc";
+constexpr char GPU_OCL_MEMCPY[] = "gcGpuOclMemcpy";
+constexpr char GPU_OCL_KERNEL_CREATE[] = "gcGpuOclKernelCreate";
+constexpr char GPU_OCL_KERNEL_DESTROY[] = "gcGpuOclKernelDestroy";
+constexpr char GPU_OCL_KERNEL_LAUNCH[] = "gcGpuOclKernelLaunch";
+constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
+} // namespace mlir::gc::gpu
+
+#ifndef GC_GPU_OCL_CONST_ONLY
+
+// TBD
+
+#else
+#undef GC_GPU_OCL_CONST_ONLY
+#endif
+#endif
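
A minimal consumer-side sketch (not part of this commit): the block guarded by GC_GPU_OCL_CONST_ONLY is still marked TBD, so a compiler-side user that only needs the runtime symbol names can define that macro before including the header. The include path and the small main() below are illustrative assumptions.

#define GC_GPU_OCL_CONST_ONLY
#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h" // illustrative path

#include <cstdio>

int main() {
  // The constants are plain C strings naming the OpenCL runtime entry points
  // that the GPU ops are lowered to.
  std::printf("%s\n", mlir::gc::gpu::GPU_OCL_MALLOC);        // gcGpuOclMalloc
  std::printf("%s\n", mlir::gc::gpu::GPU_OCL_KERNEL_LAUNCH); // gcGpuOclKernelLaunch
  return 0;
}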

include/gc/Transforms/Passes.h

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,6 @@ namespace func {
 class FuncOp;
 } // namespace func
 
-
 namespace LLVM {
 class LLVMDialect;
 }
@@ -116,7 +115,8 @@ void populateFrontendPasses(mlir::OpPassManager &);
 void populateCPUPipeline(mlir::OpPassManager &);
 
 #ifdef GC_USE_IMEX
-void populateGPUPipeline(mlir::OpPassManager &);
+struct GPUPipelineOption;
+void populateGPUPipeline(mlir::OpPassManager &, const GPUPipelineOption &);
 #endif
 
 #define GEN_PASS_DECL
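
A hedged caller-side sketch of the signature change (not from this commit): code that used to call populateGPUPipeline(pm) now also has to supply a GPUPipelineOption. The mlir::gc namespace is assumed from the rest of the commit, and only the forward declaration above is relied on, so the option is taken by const reference.

#include "gc/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

#ifdef GC_USE_IMEX
// Forwarding a const reference is enough with the forward declaration in
// Passes.h; the struct's actual fields are defined elsewhere in the
// GC_USE_IMEX build.
void buildGpuPipeline(mlir::OpPassManager &pm,
                      const mlir::gc::GPUPipelineOption &options) {
  mlir::gc::populateGPUPipeline(pm, options);
}
#endif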

include/gc/Transforms/Passes.td

Lines changed: 14 additions & 0 deletions
@@ -93,6 +93,20 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
            "DPAS register block sizes MxNxK">,
   ];
 }
+
+def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
+  let summary = "Add a context argument.";
+  let description = [{
+    Add a new memref argument to the function that can be used to pass some context.
+  }];
+}
+
+def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
+  let summary = "Convert the GPU operations to GpuOclRuntime calls.";
+  let description = [{
+    Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
+  }];
+}
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
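
A hedged C++ sketch of wiring the two new passes into a pipeline (not from this commit). The createAddContextArg()/createGpuToGpuOcl() factories follow MLIR's usual tablegen-generated naming and the mlir::gc namespace used elsewhere in the commit; treat them as assumptions.

#include "gc/Transforms/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

void buildGpuOclLowering(mlir::PassManager &pm) {
  // add-ctx-arg is a func::FuncOp pass, so it is nested under the module.
  pm.nest<mlir::func::FuncOp>().addPass(mlir::gc::createAddContextArg());
  // gpu-to-gpuocl is a ModuleOp pass that rewrites gpu alloc/dealloc/memcpy/
  // launch into calls to the GpuOclRuntime entry points.
  pm.addPass(mlir::gc::createGpuToGpuOcl());
}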

lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp

Lines changed: 47 additions & 33 deletions
@@ -53,15 +53,24 @@ using read_lock_guard_t = std::shared_lock<std::shared_mutex>;
 using write_lock_guard_t = std::unique_lock<std::shared_mutex>;
 static std::shared_mutex g_brgemm_lock;
 
-static std::vector<brgemm_desc_t> g_brgemm_desc_list;
-static std::vector<brgemm_kernel_t *> g_brgemm_kernel_list;
-static std::vector<std::unique_ptr<char[]>> g_brgemm_palette;
+struct brgemm_cache_info_t {
+  brgemm_desc_t desc;
+  brgemm_kernel_t *kernel;
+  std::shared_ptr<char[]> palette;
+};
+
+static std::vector<brgemm_cache_info_t> g_cache;
 
 // TODO(haixin): use syscall to determine page size?
 static constexpr size_t SCRATCH_SIZE = 2 * 4096;
 // TODO(haixin): need to use custom thread management for scratch in the future?
 static thread_local char scratch[SCRATCH_SIZE] = {0};
 
+static std::unordered_map<int64_t, brgemm_cache_info_t> &get_tl_cache() {
+  thread_local std::unordered_map<int64_t, brgemm_cache_info_t> tl_cache;
+  return tl_cache;
+}
+
 extern "C" {
 
 int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
@@ -93,33 +102,33 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
   brgemm_desc_set_attr(&desc, dnnl_attrs);
 
   // TODO(haixin): Reuse identical palettes across kernels
-  char *palette_buffer = nullptr;
+  std::shared_ptr<char[]> palette_buffer;
   if (desc.is_tmm) {
-    palette_buffer = new char[PALETTE_SIZE];
-    dnnl::impl::status_t status = brgemm_init_tiles(desc, palette_buffer);
+    palette_buffer.reset(new char[PALETTE_SIZE]);
+    dnnl::impl::status_t status = brgemm_init_tiles(desc, palette_buffer.get());
     assert(status == dnnl::impl::status::success &&
            "Failed to initialize palette for BRGEMM");
   }
 
   write_lock_guard_t g(g_brgemm_lock);
-  g_brgemm_desc_list.push_back(desc);
-  g_brgemm_kernel_list.push_back(kernel);
-  g_brgemm_palette.emplace_back(palette_buffer);
-
-  return g_brgemm_desc_list.size() - 1;
+  g_cache.push_back(brgemm_cache_info_t{desc, kernel, palette_buffer});
+  return g_cache.size() - 1;
 }
 
 void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
-  char *palette_buffer = nullptr;
-  {
+  assert(kernel_idx >= 0 && "Invalid kernel handler");
+  auto &tl_cache = get_tl_cache();
+  auto it = tl_cache.find(kernel_idx);
+  if (it == tl_cache.end()) {
     read_lock_guard_t g(g_brgemm_lock);
-    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_brgemm_desc_list.size() &&
-           "Invalid kernel handler");
-    brgemm_desc_t &desc = g_brgemm_desc_list[kernel_idx];
-    if (!desc.is_tmm) {
-      return;
-    }
-    palette_buffer = g_brgemm_palette[kernel_idx].get();
+    assert(kernel_idx < (int64_t)g_cache.size() && "Invalid kernel handler");
+    it = tl_cache.insert({kernel_idx, g_cache[kernel_idx]}).first;
+  }
+  brgemm_desc_t &desc = it->second.desc;
+  char *palette_buffer = it->second.palette.get();
+
+  if (!desc.is_tmm) {
+    return;
   }
 
   assert(palette_buffer != nullptr && "Invalid palette for BRGEMM kernel");
@@ -137,24 +146,29 @@ void dnnl_brgemm_tilerelease() {
 void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
                          void *B, uint64_t B_offset, void *C, uint64_t C_offset,
                          int num) {
-  brgemm_kernel_t *kernel = nullptr;
-  size_t A_offset_in_bytes;
-  size_t B_offset_in_bytes;
-  size_t C_offset_in_bytes;
-  {
+  auto &tl_cache = get_tl_cache();
+  if (tl_cache.find(kernel_idx) == tl_cache.end()) {
     read_lock_guard_t g(g_brgemm_lock);
-    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_brgemm_desc_list.size() &&
+    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_cache.size() &&
            "Invalid kernel handler");
-
-    brgemm_desc_t &desc = g_brgemm_desc_list[kernel_idx];
-    kernel = g_brgemm_kernel_list[kernel_idx];
-
-    A_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_a) * A_offset;
-    B_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_b) * B_offset;
-    C_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_c) * C_offset;
+    auto updated_cache =
+        tl_cache.insert(std::make_pair(kernel_idx, g_cache[kernel_idx]));
+    assert(updated_cache.second && "insert into thread local cache");
   }
+  auto it = tl_cache.find(kernel_idx);
+  brgemm_kernel_t *kernel = it->second.kernel;
+  brgemm_desc_t *desc_ptr = &it->second.desc;
 
   assert(kernel && "Invalid brgemm kernel pointer");
+  assert(desc_ptr && "Invalid brgemm descriptor pointer");
+
+  size_t A_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_a) * A_offset;
+  size_t B_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_b) * B_offset;
+  size_t C_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_c) * C_offset;
+
   char *A_arith = (char *)A;
   char *B_arith = (char *)B;
   char *C_arith = (char *)C;
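
The change above replaces three parallel global vectors with a single g_cache plus a thread-local lookaside map, so steady-state tileconfig/execute calls stop taking g_brgemm_lock once a handle has been seen on a thread. A condensed, self-contained sketch of the same pattern, with illustrative names rather than the library's:

#include <cassert>
#include <cstdint>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

struct Entry {
  int payload;
};

static std::shared_mutex g_lock;
static std::vector<Entry> g_registry;

// Hot path: repeated lookups of the same handle hit the thread_local map
// and never touch the shared lock again.
static const Entry &lookup(int64_t idx) {
  thread_local std::unordered_map<int64_t, Entry> tl_cache;
  auto it = tl_cache.find(idx);
  if (it == tl_cache.end()) {
    std::shared_lock<std::shared_mutex> guard(g_lock); // readers share the lock
    assert(idx >= 0 && idx < (int64_t)g_registry.size() && "invalid handle");
    it = tl_cache.insert({idx, g_registry[idx]}).first; // copy into this thread
  }
  return it->second;
}

// Cold path: registration takes the lock exclusively, like dnnl_brgemm_dispatch.
static int64_t registerEntry(Entry e) {
  std::unique_lock<std::shared_mutex> guard(g_lock);
  g_registry.push_back(e);
  return (int64_t)g_registry.size() - 1;
}
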
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+//===-- AddContextArg.cpp - Add context argument ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+
+namespace mlir::gc {
+#define GEN_PASS_DECL_ADDCONTEXTARG
+#define GEN_PASS_DEF_ADDCONTEXTARG
+#include "gc/Transforms/Passes.h.inc"
+} // namespace mlir::gc
+
+using namespace mlir;
+
+namespace {
+struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
+  void runOnOperation() override {
+    auto func = getOperation();
+    if (func.isExternal()) {
+      return;
+    }
+
+    auto funcType = func.getFunctionType();
+    auto argTypes = llvm::to_vector<8>(funcType.getInputs());
+    auto resultTypes = llvm::to_vector<1>(funcType.getResults());
+    auto ctx = func->getContext();
+    auto newArgType = MemRefType::get({}, IntegerType::get(ctx, 8));
+    argTypes.emplace_back(newArgType);
+    auto newFuncType = FunctionType::get(ctx, argTypes, resultTypes);
+    func.setType(newFuncType);
+    func.getBody().front().addArgument(newArgType, func.getLoc());
+
+    // Find all function calls and append the last argument of the current
+    // function to the call.
+    auto module = func->getParentOfType<ModuleOp>();
+    func.walk([&](func::CallOp call) {
+      // If the callee is defined in the current module, the context argument
+      // is added to its signature as well, so we need to append the context
+      // argument to the call too.
+      if (auto callee = module.lookupSymbol<func::FuncOp>(call.getCallee());
+          !callee || callee.isExternal()) {
+        return;
+      }
+      auto args = llvm::to_vector<8>(call.getOperands());
+      args.emplace_back(func.getArgument(func.getNumArguments() - 1));
+      call->setOperands(args);
+    });
+  }
+};
+} // namespace

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 gc_add_mlir_library(GcGpuPasses
+  AddContextArg.cpp
+  GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp
 