diff --git a/docs/howto_add_new_accelerator.md b/docs/howto_add_new_accelerator.md
new file mode 100644
index 00000000..26d75304
--- /dev/null
+++ b/docs/howto_add_new_accelerator.md
@@ -0,0 +1,320 @@
+# How to add new backend in occa-transpiler 
+In this short tutorial, you will learn how to add your own new backend. The example will be shown in OpenCL.
+
+## Prepartion
+Before you start adding a new backend to the project, you need to add a new backend to the target_backends.cpp file at the address lib/core/target_backends.cpp. The following changes need to be made:
+```cpp
+
+tl::expected<TargetBackend, std::string> backendFromString(const std::string& type) {
+    static const std::map<std::string, TargetBackend> BACKENDS_MAP = {
+        ...
+        {"opencl", TargetBackend::OPENCL},
+        ...
+    };
+
+    auto it = BACKENDS_MAP.find(util::toLower(type));
+    if (it != BACKENDS_MAP.end()) {
+        return it->second;
+    }
+    return tl::unexpected("unknown backend is requested");
+}
+
+std::string backendToString(TargetBackend backend) {
+    switch (backend) {
+        ...
+        case TargetBackend::OPENCL:
+            return std::string{"opencl"};
+        ...
+    }
+    return {};
+}
+
+```
+And according to the backend we add, we need to add it to either isHostCategory or isDeviceCategory. Since OpenCl refers to the second case, we make the following changes:
+```cpp
+bool isDeviceCategory(TargetBackend backend) {
+    switch (backend) {
+        ...
+        case TargetBackend::OPENCL:
+            return true;
+        default:
+            return false;
+    }
+}
+```
+
+## Implementaion of backend 
+### Adding new files
+At the following address: lib/attributes/backend/, create a folder with the name of the backend and add the following files:
+- atomic.cpp
+- barrier.cpp
+- exclusive.cpp
+- global_constant.cpp
+- global_function.cpp
+- inner.cpp
+- kernel.cpp
+- outer.cpp
+- restrict.cpp
+- shared.cpp
+- tile.cpp
+- translation_unit.cpp
+
+And additional ones if needed:
+- common.h
+- common.cpp
+
+### Connection of new files
+After creating the folder and creating the appropriate ones, add the stem files to CMakeLists.txt located at the following address: lib/CMakeLists.txt 
+```txt
+    # OPENCL
+    attributes/backend/opencl/kernel.cpp
+    attributes/backend/opencl/translation_unit.cpp
+    attributes/backend/opencl/global_constant.cpp
+    attributes/backend/opencl/global_function.cpp
+    attributes/backend/opencl/outer.cpp
+    attributes/backend/opencl/inner.cpp
+    attributes/backend/opencl/tile.cpp
+    attributes/backend/opencl/shared.cpp
+    attributes/backend/opencl/restrict.cpp
+    attributes/backend/opencl/atomic.cpp
+    attributes/backend/opencl/barrier.cpp
+    attributes/backend/opencl/exclusive.cpp
+    attributes/backend/opencl/common.cpp
+    attributes/backend/opencl/common.h
+```
+
+### Stucture of files
+Other backends can be used as templates for writing files. For example, here is a ready-made implementation of @atomic for opencl in atomic.cpp:
+```cpp
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleAtomicAttribute(SessionStage& stage, const Stmt& stmt, const Attr& attr) {
+    SPDLOG_DEBUG("Handle [@atomic] attribute (stmt)");
+
+    removeAttribute(stage, attr);
+    return {};
+}
+
+__attribute__((constructor)) void registerAttrBackend() {
+    auto ok = registerBackendHandler(
+        TargetBackend::OPENCL, ATOMIC_ATTR_NAME, handleAtomicAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", ATOMIC_ATTR_NAME);
+    }
+}
+} 
+```
+
+The main thing is to change the TargetBackend and SPDLOG_ERROR to the appropriate one for the backend when writing implementations or using other files as templates.
+
+
+## Test run of the program 
+To check the implementation of the backend, you need to run it on an example. To do this, we create a file in which we will add the test code for testing, so we created test.cpp in the example folder.  
+
+To run the program and check the code from the file, run the following command:
+
+``` bash
+./build/bin/occa-tool transpile --normalize -b opencl -i $FullPath$/occa-transpiler/example/test.cpp -o $FullPath$/occa-transpiler/example/test-out.cpp --sema with-sema 
+```
+
+## Tests for backend 
+This chapter will show you the necessary steps to create the appropriate tests, generate them, and run them.
+
+### Adding of new tests
+Creating tests to check the backend consists of two stages. The first step is to create the tests themselves, and the second is to create the corresponding json configuration files.
+
+Let's consider the first stage. To add tests, go to tests/functional/data/transpiler/backends/ and create an opencl folder. In this folder, add folders for tests, and add example files to the folders themselves. Let's consider an example of creating a test for @barrier, to do this, create the nobarrier folder and create the nobarrier_builtin file.cpp 
+```cpp
+@kernel void hello_kern() {
+    for (int i = 0; i < 10; ++i; @outer) {
+        @shared int shm[10];
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        @nobarrier for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+    }
+}
+
+@kernel void priority_issue() {
+    @outer for (int i = 0; i < 32; ++i) {
+        @shared float shm[32];
+        @nobarrier for (int j = 0; j < 32; ++j; @inner) {
+                shm[i] = i;
+        }
+        @inner for (int j = 0; j < 32; ++j) {
+                @atomic shm[i * j] += 32;
+        }
+    }
+}
+```
+
+Let's move on to the second step. Now you need to go to tests/functional/configs/test_suite_transpiler/backends/ and create the opencl folder. Create the file nobarrier.json
+
+```json
+[
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/nobarrier/nobarrier_builtin.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/nobarrier/nobarrier_builtin_ref.cpp"
+  }
+]
+
+```
+
+### Adding python script for test regeneration
+ДFor the created tests, you need to create the appropriate reference files. To do this, you can use the python script located at the following address: script/regenerate_tests_ref.py, but it needs to be slightly modified by adding the newly created backend. The following changes are made:
+```python
+    SERIAL = 0
+    OPENMP = 1
+    CUDA = 2
+    HIP = 3
+    DPCPP = 4
+    OPENCL = 5
+    LAUNCHER = 6
+```
+Add new options to the following functions:
+```python
+ def from_str(s: str) -> "Backend":
+    s = s.lower()
+    ...
+    if s == "opencl":
+        return Backend.OPENCL
+    ...
+
+ def to_str(self) -> str:
+    ...
+    if self == Backend.OPENCL:
+            return "opencl"
+    ...
+```
+And change the selection options:
+```python
+    parser.add_argument(
+        "--backend", "-b", type=str, required=True, help="serial/openmp/cuda/hip/dpcppp/opencl"
+    )
+```
+
+### Creation of new test
+After creating the tests and modifying the script, let's generate the reference files using the following command:
+``` bash
+python3 ./script/regenerate_test_ref.py -o ./build/bin/occa-tool -d test/functional/data/transpiler/backends/opencl/ -b opencl 
+```
+
+### Test run
+To run all tests, use the following command:
+``` bash
+./occa-transpiler-tests --suite configs/test_suite_transpiler/backends/opencl/ --data_root data/
+```
+## Tips
+
+### Hooks
+Hooks are a mechanism that allows you to register functions (usually known as handlers or event handlers) that will be called when certain AST nodes are encountered during code analysis. This allows you to intervene in the AST analysis and processing process to perform various tasks, in our case code generation.
+
+The general structure of the hook is as follows:
+```cpp
+__attribute__((constructor)) void functionName() {
+    auto ok =
+        registerBackendHandler(TargetBackend::<beckend>, <hook target>, <functionToCall>);
+
+    if (!ok) {
+        SPDLOG_ERROR("[<beckend>] Failed to register {} attribute handler", <hook target>);
+    }
+}
+```
+The program has the following options for hook targets:
+- KERNEL_ATTR_NAME
+- OUTER_ATTR_NAME
+- TILE_ATTR_NAME
+- SHARED_ATTR_NAME
+- RESTRICT_ATTR_NAME
+- BARRIER_ATTR_NAME
+- NO_BARRIER_ATTR_NAME
+- EXCLUSIVE_ATTR_NAME
+- ATOMIC_ATTR_NAME
+
+Let's look at an example of using a hook on the example of implementing @barrier for OpenCL: 
+
+```cpp
+__attribute__((constructor)) void registerAttrBackend() {
+    auto ok =
+        registerBackendHandler(TargetBackend::OPENCL, BARRIER_ATTR_NAME, handleBarrierAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", BARRIER_ATTR_NAME);
+    }
+}
+```
+As you can see, the following two positions responsible for global function and global constant are missing from the provided hook targets. In this case, the hook structure is as follows: 
+
+```cpp
+__attribute__((constructor)) void functionName() {
+    auto ok =
+        registerBackendHandler(TargetBackend::<beckend>, <functionToCall>);
+
+    if (!ok) {
+        SPDLOG_ERROR("[<beckend>] Failed to register {} attribute handler");
+    }
+}
+```
+This case can be seen in the example of the global const implementation for OpenCl:
+
+```cpp
+__attribute__((constructor)) void registeCUDAGlobalConstantHandler() {
+    auto ok = registerImplicitHandler(TargetBackend::OPENCL, handleGlobalConstant);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register implicit handler for global constant");
+    }
+}
+```
+
+### Rewriter
+This is one of the main tools used in the backend implementation. The general structure can be found [here](./documentation.pdf). 
+
+When using it, keep in mind that inserting it when performing a replace or delete operation can lead to errors. Therefore, you need to be sure of the cursor position when using insert or replace operations.
+
+## Tips for debug
+To improve the code debugging process, you can “disable” the format function located at the following address: lib/core/utils/format.cpp. This function processes the text before outputting it into a more human-readable format, but in turn complicates the debugging process. To do this, use the format:
+```cpp
+std::string format(std::string_view code) {
+    const std::vector<Range> ranges(1, Range(0, code.size()));
+    auto style = format::getLLVMStyle();
+    style.MaxEmptyLinesToKeep = 1;
+    style.SeparateDefinitionBlocks = format::FormatStyle::SeparateDefinitionStyle::SDS_Always;
+
+    Replacements replaces = format::reformat(style, code, ranges);
+    auto changedCode = applyAllReplacements(code, replaces);
+    if (!changedCode) {
+        SPDLOG_ERROR("{}", toString(changedCode.takeError()));
+        return {};
+    }
+    return changedCode.get();
+}
+```
+function must be edited to this format:
+```cpp
+std::string format(std::string_view code) {
+    return std::string(code);
+}
+```
+
diff --git a/include/oklt/core/target_backends.h b/include/oklt/core/target_backends.h
index bff3463a..4559f60c 100644
--- a/include/oklt/core/target_backends.h
+++ b/include/oklt/core/target_backends.h
@@ -14,6 +14,7 @@ enum struct TargetBackend : unsigned char {
     CUDA,    ///< CUDA backend.
     HIP,     ///< HIP backend.
     DPCPP,   ///< DPCPP backend.
+    OPENCL,  ///< OPENCL backend.
 
     _LAUNCHER,  ///< Launcher backend.
 };
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 182f1e03..f8834f94 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -105,6 +105,22 @@ set (OCCA_TRANSPILER_SOURCES
     attributes/backend/dpcpp/common.cpp
     attributes/backend/dpcpp/common.h
 
+    # OPENCL
+    attributes/backend/opencl/kernel.cpp
+    attributes/backend/opencl/translation_unit.cpp
+    attributes/backend/opencl/global_constant.cpp
+    attributes/backend/opencl/global_function.cpp
+    attributes/backend/opencl/outer.cpp
+    attributes/backend/opencl/inner.cpp
+    attributes/backend/opencl/tile.cpp
+    attributes/backend/opencl/shared.cpp
+    attributes/backend/opencl/restrict.cpp
+    attributes/backend/opencl/atomic.cpp
+    attributes/backend/opencl/barrier.cpp
+    attributes/backend/opencl/exclusive.cpp
+    attributes/backend/opencl/common.cpp
+    attributes/backend/opencl/common.h
+
     # Serial subset
     attributes/utils/serial_subset/empty.cpp
     attributes/utils/serial_subset/kernel.cpp
diff --git a/lib/attributes/backend/dpcpp/translation_unit.cpp b/lib/attributes/backend/dpcpp/translation_unit.cpp
index cc963da4..b50fdd30 100644
--- a/lib/attributes/backend/dpcpp/translation_unit.cpp
+++ b/lib/attributes/backend/dpcpp/translation_unit.cpp
@@ -11,7 +11,7 @@ const std::string_view SYCL_INCLUDE = "<CL/sycl.hpp>";
 const std::string_view SYCL_NS = "sycl";
 
 HandleResult handleTranslationUnitDpcpp(SessionStage& s, const clang::TranslationUnitDecl& decl) {
-    return oklt::handleTranslationUnit(s, decl, {SYCL_INCLUDE}, {SYCL_NS});
+    return oklt::handleTranslationUnit(s, decl, {SYCL_INCLUDE}, {}, {SYCL_NS});
 }
 
 __attribute__((constructor)) void registerTranslationUnitAttrBackend() {
diff --git a/lib/attributes/backend/opencl/atomic.cpp b/lib/attributes/backend/opencl/atomic.cpp
new file mode 100644
index 00000000..79193a55
--- /dev/null
+++ b/lib/attributes/backend/opencl/atomic.cpp
@@ -0,0 +1,30 @@
+#include "attributes/attribute_names.h"
+#include "attributes/utils/cuda_subset/handle.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/attributes.h"
+#include "core/utils/range_to_string.h"
+#include "pipeline/core/error_codes.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleAtomicAttribute(SessionStage& stage, const Stmt& stmt, const Attr& attr) {
+    SPDLOG_DEBUG("Handle [@atomic] attribute (stmt)");
+
+    removeAttribute(stage, attr);
+    return {};
+}
+
+__attribute__((constructor)) void registerAttrBackend() {
+    auto ok = registerBackendHandler(
+        TargetBackend::OPENCL, ATOMIC_ATTR_NAME, handleAtomicAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", ATOMIC_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/barrier.cpp b/lib/attributes/backend/opencl/barrier.cpp
new file mode 100644
index 00000000..f543fb4b
--- /dev/null
+++ b/lib/attributes/backend/opencl/barrier.cpp
@@ -0,0 +1,35 @@
+#include "attributes/attribute_names.h"
+#include "attributes/backend/opencl/common.h"
+#include "attributes/utils/cuda_subset/handle.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/utils/attributes.h"
+
+#include <clang/AST/Attr.h>
+#include <clang/AST/Stmt.h>
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+const std::string BARRIER_STR = "barrier(CLK_LOCAL_MEM_FENCE);\n";
+
+HandleResult handleBarrierAttribute(SessionStage& s,
+                                    const clang::Stmt& stmt,
+                                    const clang::Attr& a) {
+    SPDLOG_DEBUG("Handle [@barrier] attribute");
+
+    SourceRange range(getAttrFullSourceRange(a).getBegin(), stmt.getEndLoc());
+    s.getRewriter().ReplaceText(range, BARRIER_STR);
+    return {};
+}
+
+__attribute__((constructor)) void registerAttrBackend() {
+    auto ok =
+        registerBackendHandler(TargetBackend::OPENCL, BARRIER_ATTR_NAME, handleBarrierAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", BARRIER_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/common.cpp b/lib/attributes/backend/opencl/common.cpp
new file mode 100644
index 00000000..5c0f53ad
--- /dev/null
+++ b/lib/attributes/backend/opencl/common.cpp
@@ -0,0 +1,59 @@
+#include "attributes/backend/dpcpp/common.h"
+#include "util/string_utils.hpp"
+#include "core/sema/okl_sema_ctx.h"
+#include "core/utils/range_to_string.h"
+
+#include <clang/Rewrite/Core/Rewriter.h>
+
+#include <spdlog/spdlog.h>
+
+namespace oklt::opencl {
+using namespace clang;
+
+std::string axisToStr(const Axis& axis) {
+    static std::map<Axis, std::string> mapping{{Axis::X, "0"}, {Axis::Y, "1"}, {Axis::Z, "2"}};
+    return mapping[axis];
+}
+
+std::string getIdxVariable(const AttributedLoop& loop) {
+    auto strAxis = axisToStr(loop.axis);
+    switch (loop.type) {
+        case (LoopType::Inner):
+            return util::fmt("get_local_id({})", strAxis).value();
+        case (LoopType::Outer):
+            return util::fmt("get_group_id({})", strAxis).value();
+        default:  // Incorrect case
+            return "";
+    }
+}
+std::string buildInnerOuterLoopIdxLine(const OklLoopInfo& forLoop,
+                                       const AttributedLoop& loop,
+                                       int& openedScopeCounter,
+                                       oklt::Rewriter& rewriter) {
+    static_cast<void>(openedScopeCounter);
+    auto idx = getIdxVariable(loop);
+    auto op = forLoop.IsInc() ? "+" : "-";
+
+    std::string res;
+    if (forLoop.isUnary()) {
+        res = std::move(util::fmt("{} {} = ({}) {} {};\n",
+                                  forLoop.var.typeName,
+                                  forLoop.var.name,
+                                  getLatestSourceText(forLoop.range.start, rewriter),
+                                  op,
+                                  idx)
+                            .value());
+    } else {
+        res = std::move(util::fmt("{} {} = ({}) {} (({}) * {});\n",
+                                  forLoop.var.typeName,
+                                  forLoop.var.name,
+                                  getLatestSourceText(forLoop.range.start, rewriter),
+                                  op,
+                                  getLatestSourceText(forLoop.inc.val, rewriter),
+                                  idx)
+                            .value());
+    }
+    return res;
+}
+
+}
diff --git a/lib/attributes/backend/opencl/common.h b/lib/attributes/backend/opencl/common.h
new file mode 100644
index 00000000..91ec75e1
--- /dev/null
+++ b/lib/attributes/backend/opencl/common.h
@@ -0,0 +1,24 @@
+#include "core/rewriter/rewriter_proxy.h"
+#include "attributes/frontend/params/loop.h"
+
+#include <string>
+
+
+namespace clang {
+class Rewriter;
+}
+
+namespace oklt {
+struct OklLoopInfo;
+}
+
+namespace oklt::opencl {
+std::string axisToStr(const Axis& axis);
+std::string getIdxVariable(const AttributedLoop& loop);
+std::string buildInnerOuterLoopIdxLine(const OklLoopInfo& forLoop,
+                                       const AttributedLoop& loop,
+                                       int& openedScopeCounter,
+                                       oklt::Rewriter& rewriter);
+
+const std::string SYNC_THREADS_BARRIER = "barrier(CLK_LOCAL_MEM_FENCE)";
+}
diff --git a/lib/attributes/backend/opencl/exclusive.cpp b/lib/attributes/backend/opencl/exclusive.cpp
new file mode 100644
index 00000000..c3c8d270
--- /dev/null
+++ b/lib/attributes/backend/opencl/exclusive.cpp
@@ -0,0 +1,76 @@
+#include "attributes/attribute_names.h"
+#include "attributes/utils/cuda_subset/handle.h"
+#include "attributes/utils/default_handlers.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/sema/okl_sema_ctx.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/attributes.h"
+
+#include <clang/AST/Attr.h>
+#include <clang/AST/DeclBase.h>
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleExclusiveDeclAttribute(SessionStage& s , const Decl& decl, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@exclusive] attribute (decl)");
+
+    removeAttribute(s, a);
+    return {};
+}
+
+HandleResult handleExclusiveVarDeclAttribute(SessionStage& s , const VarDecl& decl, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@exclusive] attribute (decl)");
+
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo();
+    if (!loopInfo) {
+        return tl::make_unexpected(
+            Error{{}, "@exclusive: failed to fetch loop meta data from sema"});
+    }
+
+    auto compStmt = dyn_cast_or_null<CompoundStmt>(loopInfo->stmt.getBody());
+    if (!compStmt || !loopInfo->is(LoopType::Outer)) {
+        return tl::make_unexpected(
+            Error{{}, "Must define [@exclusive] variables between [@outer] and [@inner] loops"});
+    }
+
+    auto child = loopInfo->getFirstAttributedChild();
+    if (!child || !child->is(LoopType::Inner)) {
+        return tl::make_unexpected(
+            Error{{}, "Must define [@exclusive] variables between [@outer] and [@inner] loops"});
+    }
+
+    removeAttribute(s, a);
+    return {};
+}
+
+HandleResult handleExclusiveExprAttribute(SessionStage& s , const DeclRefExpr& expr,const Attr& a) {
+    SPDLOG_DEBUG("Handle [@exclusive] attribute (stmt)");
+
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo();
+    if (!loopInfo) {
+        return tl::make_unexpected(
+            Error{{}, "@exclusive: failed to fetch loop meta data from sema"});
+    }
+
+    removeAttribute(s, a);
+    return {};
+}
+
+__attribute__((constructor)) void registerAttrBackend() {
+    auto ok = registerBackendHandler(
+        TargetBackend::OPENCL, EXCLUSIVE_ATTR_NAME, handleExclusiveExprAttribute);
+    ok &= registerBackendHandler(
+        TargetBackend::OPENCL, EXCLUSIVE_ATTR_NAME, handleExclusiveDeclAttribute);
+    ok &= registerBackendHandler(
+        TargetBackend::OPENCL, EXCLUSIVE_ATTR_NAME, handleExclusiveVarDeclAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", EXCLUSIVE_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/global_constant.cpp b/lib/attributes/backend/opencl/global_constant.cpp
new file mode 100644
index 00000000..49934ffc
--- /dev/null
+++ b/lib/attributes/backend/opencl/global_constant.cpp
@@ -0,0 +1,22 @@
+#include "attributes/utils/replace_attribute.h"
+#include "core/handler_manager/implicid_handler.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleGlobalConstant(oklt::SessionStage& s, const clang::VarDecl& decl) {
+    const std::string OPENCL_CONST_QUALIFIER = "__constant";
+    return oklt::handleGlobalConstant(s, decl, OPENCL_CONST_QUALIFIER);
+}
+
+__attribute__((constructor)) void registeCUDAGlobalConstantHandler() {
+    auto ok = registerImplicitHandler(TargetBackend::OPENCL, handleGlobalConstant);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register implicit handler for global constant");
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/global_function.cpp b/lib/attributes/backend/opencl/global_function.cpp
new file mode 100644
index 00000000..72f268c0
--- /dev/null
+++ b/lib/attributes/backend/opencl/global_function.cpp
@@ -0,0 +1,54 @@
+#include "attributes/utils/replace_attribute.h"
+#include "attributes/attribute_names.h"
+#include "core/handler_manager/implicid_handler.h"
+#include "core/transpiler_session/header_info.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/var_decl.h"
+
+#include <clang/AST/AST.h>
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleGlobalFunction(oklt::SessionStage& s, const clang::FunctionDecl& decl) {
+    if (decl.getLocation().isInvalid() || decl.isInlineBuiltinDeclaration() || !decl.hasBody()) {
+        return {};
+    }
+
+    if (decl.hasAttrs()) {
+        for (auto* attr : decl.getAttrs()) {
+            if (attr->getNormalizedFullName() == KERNEL_ATTR_NAME) {
+                SPDLOG_DEBUG(
+                    "Global function handler skipped function {}, since it has @kernel attribute",
+                    decl.getNameAsString());
+                return {};
+            }
+        }
+    }
+
+    auto& r = s.getRewriter();  
+    auto loc = decl.getFunctionTypeLoc();
+    auto funcr = SourceRange(decl.getBeginLoc(), loc.getRParenLoc());
+    auto str = r.getRewrittenText(funcr);
+    
+    str += ";\n";
+
+    SPDLOG_DEBUG("Handle global function '{}' at {}",
+                 decl.getNameAsString(),
+                 decl.getLocation().printToString(s.getCompiler().getSourceManager()));
+
+    r.InsertTextBefore(decl.getSourceRange().getBegin(), str);
+
+    return {};
+}
+
+__attribute__((constructor)) void registerTranslationUnitAttrBackend() {
+    auto ok = registerImplicitHandler(TargetBackend::OPENCL, handleGlobalFunction);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register implicit handler for global function");
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/inner.cpp b/lib/attributes/backend/opencl/inner.cpp
new file mode 100644
index 00000000..fd574e81
--- /dev/null
+++ b/lib/attributes/backend/opencl/inner.cpp
@@ -0,0 +1,56 @@
+#include "attributes/attribute_names.h"
+#include "attributes/backend/opencl/common.h"
+#include "attributes/frontend/params/loop.h"
+#include "attributes/utils/code_gen.h"
+#include "attributes/utils/kernel_utils.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/sema/okl_sema_ctx.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleInnerAttribute(SessionStage& s,
+                                  const clang::ForStmt& forStmt,
+                                  const clang::Attr& a,
+                                  const AttributedLoop* params) {
+    SPDLOG_DEBUG("Handle [@inner] attribute");
+    handleChildAttr(s, forStmt, NO_BARRIER_ATTR_NAME);
+
+    if (!params) {
+        return tl::make_unexpected(Error{std::error_code(), "@inner params nullptr"});
+    }
+
+    auto& astCtx = s.getCompiler().getASTContext();
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo(forStmt);
+    if (!loopInfo) {
+        return tl::make_unexpected(Error{{}, "@inner: failed to fetch loop meta data from sema"});
+    }
+
+    auto updatedParams = *params;
+    // Auto Axis in loopInfo are replaced with specific. TODO: maybe somehow update params earlier?
+    updatedParams.axis = loopInfo->axis.front();
+
+    int openedScopeCounter = 0;
+    auto prefixCode = opencl::buildInnerOuterLoopIdxLine(
+        *loopInfo, updatedParams, openedScopeCounter, s.getRewriter());
+    auto suffixCode = buildCloseScopes(openedScopeCounter);
+    std::string afterRBraceCode = "";
+    if (loopInfo->shouldSync()) {
+        afterRBraceCode += opencl::SYNC_THREADS_BARRIER + ";\n";
+    }
+
+    return replaceAttributedLoop(s, forStmt, a, suffixCode, afterRBraceCode, prefixCode, true);
+}
+
+__attribute__((constructor)) void registerOpenclInnerAttrBackend() {
+    auto ok = registerBackendHandler(TargetBackend::OPENCL, INNER_ATTR_NAME, handleInnerAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", INNER_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/kernel.cpp b/lib/attributes/backend/opencl/kernel.cpp
new file mode 100644
index 00000000..b50339e8
--- /dev/null
+++ b/lib/attributes/backend/opencl/kernel.cpp
@@ -0,0 +1,135 @@
+#include <oklt/core/kernel_metadata.h>
+#include "util/string_utils.hpp"
+
+#include "attributes/attribute_names.h"
+#include "attributes/utils/kernel_utils.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/rewriter/rewriter_proxy.h"
+#include "core/sema/okl_sema_ctx.h"
+#include "core/sema/okl_sema_info.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/attributes.h"
+#include "core/utils/type_converter.h"
+
+#include "pipeline/core/error_codes.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+const std::string EXTERN_ATTRIBUTE = "__kernel ";
+const std::string EXTERN_ATTRIBUTE_ADD = "__attribute__ ";
+const std::string INNER_SIZES_FMT = "((reqd_work_group_size({},{},{})))";
+
+std::string getFunctionName(const FunctionDecl& func, size_t n) {
+    return util::fmt("_occa_{}_{}", func.getNameAsString(), n).value();
+}
+
+std::string getFunctionAttributesStr([[maybe_unused]] const FunctionDecl& func, OklLoopInfo* info) {
+    std::stringstream out;
+    out << EXTERN_ATTRIBUTE;
+
+    if (info) {
+        auto sizes = info->getInnerSizes();
+        if (!sizes.hasNullOpts()) {
+            out << EXTERN_ATTRIBUTE_ADD;
+            out << util::fmt(INNER_SIZES_FMT, *sizes[0], *sizes[1], *sizes[2]).value();
+        }
+    }
+
+    out << "\n";
+    return out.str();
+}
+
+std::string getFunctionParamStr(const FunctionDecl& func, KernelInfo& kernelInfo, oklt::Rewriter& r) {
+    kernelInfo.args.clear();
+    kernelInfo.args.reserve(func.getNumParams());
+
+    for (auto param : func.parameters()) {
+        if (!param) {
+            continue;
+        }
+        
+        kernelInfo.args.emplace_back(toOklArgInfo(*param).value());
+        auto t = param->getType();
+        if (t->isPointerType()) {
+            r.InsertTextBefore(param->getBeginLoc(), "__global ");
+        }
+    }
+    
+    auto typeLoc = func.getFunctionTypeLoc();
+    return r.getRewrittenText(typeLoc.getParensRange());
+}
+
+HandleResult handleKernelAttribute(SessionStage& s,
+                                   const clang::FunctionDecl& func,
+                                   const clang::Attr& a) {
+    SPDLOG_DEBUG("Handle [@kernel] attribute for function '{}'", func.getNameAsString());
+
+    auto& rewriter = s.getRewriter();
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+
+    if (!sema.getParsingKernelInfo()) {
+        return tl::make_unexpected(Error{OkltPipelineErrorCode::INTERNAL_ERROR_KERNEL_INFO_NULL,
+                                         "handleKernelAttribute"});
+    }
+
+    auto kernelInfo = *sema.getParsingKernelInfo();
+    auto& kernels = sema.getProgramMetaData().kernels;
+
+    auto oklKernelInfo = KernelInfo{.name = func.getNameAsString()};
+    auto typeStr = rewriter.getRewrittenText(func.getReturnTypeSourceRange());
+    auto paramStr = getFunctionParamStr(func, oklKernelInfo, rewriter);
+
+    if (auto verified = verifyLoops(s, kernelInfo); !verified) {
+        return tl::make_unexpected(std::move(verified.error()));
+    }
+
+    size_t n = 0;
+    auto startPos = getAttrFullSourceRange(a).getBegin();
+    for (auto* child : kernelInfo.topLevelOuterLoops) {
+        if (!child) {
+            continue;
+        }
+
+        kernels.push_back(oklKernelInfo);
+        auto& meta = kernels.back();
+        meta.name = getFunctionName(func, n);
+
+        handleChildAttr(s, child->stmt, MAX_INNER_DIMS_NAME);
+        
+        std::stringstream out;
+        if (n != 0) {
+            out << "}\n\n";
+        }
+
+        out << getFunctionAttributesStr(func, child);
+        out << typeStr << " " << getFunctionName(func, n) << paramStr << ";\n";
+        out << "\n";
+
+        out << getFunctionAttributesStr(func, child);
+        out << typeStr << " " << getFunctionName(func, n) << paramStr << " {\n";
+        auto endPos = getAttrFullSourceRange(*child->attr).getBegin();
+        rewriter.ReplaceText(SourceRange{startPos, endPos}, out.str());
+
+        auto body = dyn_cast_or_null<CompoundStmt>(child->stmt.getBody());
+        startPos = (body ? body->getEndLoc() : child->stmt.getRParenLoc()).getLocWithOffset(1);
+
+        ++n;
+    }
+
+    rewriter.ReplaceText(SourceRange{startPos, func.getEndLoc()}, "\n}\n");
+
+    return {};
+}
+
+__attribute__((constructor)) void registerKernelHandler() {
+    auto ok = registerBackendHandler(TargetBackend::OPENCL, KERNEL_ATTR_NAME, handleKernelAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", KERNEL_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/outer.cpp b/lib/attributes/backend/opencl/outer.cpp
new file mode 100644
index 00000000..7d2f53d9
--- /dev/null
+++ b/lib/attributes/backend/opencl/outer.cpp
@@ -0,0 +1,49 @@
+#include "attributes/attribute_names.h"
+#include "attributes/backend/opencl/common.h"
+#include "attributes/frontend/params/loop.h"
+#include "attributes/utils/code_gen.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/sema/okl_sema_ctx.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+HandleResult handleOuterAttribute(SessionStage& s,
+                                  const clang::ForStmt& forStmt,
+                                  const clang::Attr& a,
+                                  const AttributedLoop* params) {
+    SPDLOG_DEBUG("Handle [@outer] attribute");
+
+    if (!params) {
+        return tl::make_unexpected(Error{std::error_code(), "@outer params nullptr"});
+    }
+
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo(forStmt);
+    if (!loopInfo) {
+        return tl::make_unexpected(Error{{}, "@outer: failed to fetch loop meta data from sema"});
+    }
+
+    auto updatedParams = *params;
+    // Auto Axis in loopInfo are replaced with specific. TODO: maybe somehow update params earlier?
+    updatedParams.axis = loopInfo->axis.front();
+
+    int openedScopeCounter = 0;
+    auto prefixCode = opencl::buildInnerOuterLoopIdxLine(
+        *loopInfo, updatedParams, openedScopeCounter, s.getRewriter());
+    auto suffixCode = buildCloseScopes(openedScopeCounter);
+
+    return replaceAttributedLoop(s, forStmt, a, suffixCode, prefixCode, true);
+}
+
+__attribute__((constructor)) void registerOpenclOuterAttrBackend() {
+    auto ok = registerBackendHandler(TargetBackend::OPENCL, OUTER_ATTR_NAME, handleOuterAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", OUTER_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/restrict.cpp b/lib/attributes/backend/opencl/restrict.cpp
new file mode 100644
index 00000000..21426b04
--- /dev/null
+++ b/lib/attributes/backend/opencl/restrict.cpp
@@ -0,0 +1,40 @@
+#include "attributes/attribute_names.h"
+#include "attributes/utils/default_handlers.h"
+#include "attributes/utils/cuda_subset/handle.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/attributes.h"
+#include "core/utils/range_to_string.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+const std::string RESTRICT_MODIFIER = "restrict ";
+
+HandleResult handleRestrictAttribute(SessionStage& s, const Decl& decl, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@restrict] attribute");
+
+    removeAttribute(s, a);
+    if (isa<ParmVarDecl>(decl)){
+        if (isa<VarDecl, FieldDecl, FunctionDecl>(decl)) {
+            s.getRewriter().InsertTextBefore(decl.getLocation(), RESTRICT_MODIFIER);
+        }
+    }
+
+    return {};
+}
+
+__attribute__((constructor)) void registerOPENCLRestrictHandler() {
+    auto ok = registerBackendHandler(
+        TargetBackend::OPENCL, RESTRICT_ATTR_NAME, handleRestrictAttribute);
+
+    ok &= registerBackendHandler(TargetBackend::OPENCL, RESTRICT_ATTR_NAME, emptyHandleStmtAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", RESTRICT_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/shared.cpp b/lib/attributes/backend/opencl/shared.cpp
new file mode 100644
index 00000000..ad0aee3a
--- /dev/null
+++ b/lib/attributes/backend/opencl/shared.cpp
@@ -0,0 +1,77 @@
+#include "attributes/attribute_names.h"
+#include "attributes/utils/default_handlers.h"
+#include "attributes/utils/utils.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/sema/okl_sema_ctx.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/attributes.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+const std::string SHARED_MODIFIER = "__local";
+
+HandleResult handleSharedDeclAttribute(SessionStage& s, const Decl& var, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@shared] attribute");
+
+    return removeAttribute(s, a);
+}
+
+HandleResult handleSharedTypeAttribute(SessionStage& s, const TypedefDecl& decl, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@shared] attribute");
+
+    removeAttribute(s, a);
+
+    auto loc = decl.getTypeSourceInfo()->getTypeLoc().getBeginLoc();
+    s.getRewriter().InsertTextBefore(loc, SHARED_MODIFIER + " ");
+
+    return {};
+}
+
+HandleResult handleSharedVarAttribute(SessionStage& s, const VarDecl& d, const Attr& a) {
+    SPDLOG_DEBUG("Handle [@shared] attribute");
+
+    removeAttribute(s, a);
+
+    std::string replacedAttribute = SHARED_MODIFIER + " ";
+
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo();
+    if (loopInfo && loopInfo->isRegular()) {
+        loopInfo = loopInfo->getAttributedParent();
+    }
+    if (loopInfo && loopInfo->has(LoopType::Inner)) {
+        return tl::make_unexpected(
+            Error{{}, "Cannot define [@shared] variables inside an [@inner] loop"});
+    }
+    auto child = loopInfo ? loopInfo->getFirstAttributedChild() : nullptr;
+    bool isInnerChild = child && child->has(LoopType::Inner);
+
+    // This diagnostic is applied only to variable declaration
+    if (!loopInfo || !loopInfo->has(LoopType::Outer) || !isInnerChild) {
+        return tl::make_unexpected(
+            Error{{}, "Must define [@shared] variables between [@outer] and [@inner] loops"});
+    }
+
+    s.getRewriter().InsertTextBefore(d.getTypeSpecStartLoc(), replacedAttribute);
+
+    return defaultHandleSharedDeclAttribute(s, d, a);
+}
+
+__attribute__((constructor)) void registerOPENCLSharedAttrBackend() {
+    auto ok =
+        registerBackendHandler(TargetBackend::OPENCL, SHARED_ATTR_NAME, handleSharedDeclAttribute);
+    ok &= registerBackendHandler(TargetBackend::OPENCL, SHARED_ATTR_NAME, handleSharedVarAttribute);
+
+    // Empty Stmt handler since @shared variable is of attributed type, it is called on DeclRefExpr
+    ok &= registerBackendHandler(
+        TargetBackend::OPENCL, SHARED_ATTR_NAME, defaultHandleSharedStmtAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", SHARED_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/tile.cpp b/lib/attributes/backend/opencl/tile.cpp
new file mode 100644
index 00000000..b9eed6e4
--- /dev/null
+++ b/lib/attributes/backend/opencl/tile.cpp
@@ -0,0 +1,267 @@
+#include <oklt/core/kernel_metadata.h>
+#include "util/string_utils.hpp"
+
+#include "attributes/attribute_names.h"
+#include "attributes/backend/opencl/common.h"
+#include "attributes/frontend/params/tile.h"
+#include "attributes/utils/code_gen.h"
+#include "attributes/utils/kernel_utils.h"
+#include "core/handler_manager/backend_handler.h"
+#include "core/sema/okl_sema_ctx.h"
+#include "core/transpiler_session/session_stage.h"
+#include "core/utils/range_to_string.h"
+
+#include <clang/Rewrite/Core/Rewriter.h>
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+std::string getTiledVariableName(const OklLoopInfo& forLoop) {
+    return "_occa_tiled_" + forLoop.var.name;
+}
+
+std::string buildIinnerOuterLoopIdxLineFirst(const OklLoopInfo& forLoop,
+                                             const AttributedLoop& loop,
+                                             const TileParams* params,
+                                             int& openedScopeCounter,
+                                             oklt::Rewriter& rewriter) {
+    auto tiledVar = getTiledVariableName(forLoop);
+    auto idx = opencl::getIdxVariable(loop);
+    auto op = forLoop.IsInc() ? "+" : "-";
+
+    std::string res;
+    if (forLoop.isUnary()) {
+        res = std::move(util::fmt("{} {} = ({}) {} (({}) * {});\n",
+                                  forLoop.var.typeName,
+                                  tiledVar,
+                                  getLatestSourceText(forLoop.range.start, rewriter),
+                                  op,
+                                  params->tileSize,
+                                  idx)
+                            .value());
+    } else {
+        res = std::move(util::fmt("{} {} = ({}) {} ((({}) * {}) * {});\n",
+                                  forLoop.var.typeName,
+                                  tiledVar,
+                                  getLatestSourceText(forLoop.range.start, rewriter),
+                                  op,
+                                  params->tileSize,
+                                  getLatestSourceText(forLoop.inc.val, rewriter),
+                                  idx)
+                            .value());
+    }
+
+    ++openedScopeCounter;
+    return " {\n" + res;
+}
+
+std::string buildInnerOuterLoopIdxLineSecond(const OklLoopInfo& forLoop,
+                                             const AttributedLoop& loop,
+                                             const TileParams* params,
+                                             int& openedScopeCounter,
+                                             oklt::Rewriter& rewriter) {
+    static_cast<void>(params);
+    auto tiledVar = getTiledVariableName(forLoop);
+    auto idx = opencl::getIdxVariable(loop);
+    auto op = forLoop.IsInc() ? "+" : "-";
+
+    std::string res;
+    if (forLoop.isUnary()) {
+        res = std::move(
+            util::fmt(
+                "{} {} = {} {} {};", forLoop.var.typeName, forLoop.var.name, tiledVar, op, idx)
+                .value());
+    } else {
+        res = std::move(util::fmt("{} {} = {} {} (({}) * {});\n",
+                                  forLoop.var.typeName,
+                                  forLoop.var.name,
+                                  tiledVar,
+                                  op,
+                                  getLatestSourceText(forLoop.inc.val, rewriter),
+                                  idx)
+                            .value());
+    }
+
+    ++openedScopeCounter;
+    return " {\n" + res;  // Open new scope
+}
+
+std::string buildRegularLoopIdxLineFirst(const OklLoopInfo& forLoop,
+                                         const AttributedLoop& regularLoop,
+                                         const TileParams* params,
+                                         int& openedScopeCounter,
+                                         oklt::Rewriter& rewriter) {
+    auto tiledVar = getTiledVariableName(forLoop);
+    auto assignUpdate = forLoop.IsInc() ? "+=" : "-=";
+    auto cmpOpStr = getCondCompStr(forLoop.condition.op);
+
+    auto res = util::fmt("for ({} {} = {}; {} {} {}; {} {} ({}))",
+                         forLoop.var.typeName,
+                         tiledVar,
+                         getLatestSourceText(forLoop.range.start, rewriter),
+                         tiledVar,
+                         cmpOpStr,
+                         getLatestSourceText(forLoop.range.end, rewriter),
+                         tiledVar,
+                         assignUpdate,
+                         params->tileSize)
+                   .value();  // shouldn't fail
+
+    // Open new scope (Note: after line unlike @outer and @inner)
+    ++openedScopeCounter;
+    return res + " {\n";
+}
+
+std::string buildRegularLoopIdxLineSecond(const OklLoopInfo& forLoop,
+                                          const AttributedLoop& regularLoop,
+                                          const TileParams* params,
+                                          int& openedScopeCounter,
+                                          oklt::Rewriter& rewriter) {
+    auto tiledVar = getTiledVariableName(forLoop);
+    auto op = forLoop.IsInc() ? "+" : "-";
+    auto cmp = forLoop.IsInc() ? "<" : ">";
+
+    std::string res;
+    if (forLoop.isUnary()) {
+        auto unaryStr = getUnaryStr(forLoop.inc.op.uo, forLoop.var.name);  // ++i/i++/--i/i--
+        res = util::fmt("for ({} {} = {}; {} {} ({} {} ({})); {})",
+                        forLoop.var.typeName,
+                        forLoop.var.name,
+                        tiledVar,
+                        forLoop.var.name,
+                        cmp,
+                        tiledVar,
+                        op,
+                        params->tileSize,
+                        unaryStr)
+                  .value();
+    } else {
+        auto assignUpdate = forLoop.IsInc() ? "+=" : "-=";
+        res = util::fmt("for ({} {} = {}; {} {} ({} {} ({})); {} {} {})",
+                        forLoop.var.typeName,
+                        forLoop.var.name,
+                        tiledVar,
+                        forLoop.var.name,
+                        cmp,
+                        tiledVar,
+                        op,
+                        params->tileSize,
+                        forLoop.var.name,
+                        assignUpdate,
+                        getLatestSourceText(forLoop.inc.val, rewriter))
+                  .value();
+    }
+
+    auto& stmt = forLoop.stmt;
+    if (params->check || !llvm::isa<clang::CompoundStmt>(stmt.getBody())) {
+        ++openedScopeCounter;
+        res += " {\n";
+    }
+
+    return res;
+}
+
+std::string buildLoopIdxLine(const OklLoopInfo& forLoop,
+                             const TileParams* params,
+                             const LoopOrder& ord,
+                             int& openedScopeCounter,
+                             oklt::Rewriter& rewriter) {
+    static std::map<
+        std::tuple<LoopType, LoopOrder>,
+        std::function<std::string(
+            const OklLoopInfo&, const AttributedLoop&, const TileParams*, int&, oklt::Rewriter&)>>
+        mapping{
+            {{LoopType::Inner, LoopOrder::First}, buildIinnerOuterLoopIdxLineFirst},
+            {{LoopType::Outer, LoopOrder::First}, buildIinnerOuterLoopIdxLineFirst},
+            {{LoopType::Regular, LoopOrder::First}, buildRegularLoopIdxLineFirst},
+            {{LoopType::Inner, LoopOrder::Second}, buildInnerOuterLoopIdxLineSecond},
+            {{LoopType::Outer, LoopOrder::Second}, buildInnerOuterLoopIdxLineSecond},
+            {{LoopType::Regular, LoopOrder::Second}, buildRegularLoopIdxLineSecond},
+        };
+    auto& loop = ord == LoopOrder::First ? params->firstLoop : params->secondLoop;
+    return mapping[{loop.type, ord}](forLoop, loop, params, openedScopeCounter, rewriter);
+}
+
+std::string buildCheckLine(const OklLoopInfo& forLoop,
+                           const TileParams* tileParams,
+                           int& openedScopeCounter,
+                           oklt::Rewriter& rewriter) {
+    if (!tileParams->check) {
+        return "";
+    }
+
+    auto cmpStr = getCondCompStr(forLoop.condition.op);
+
+    auto res = util::fmt("if ({} {} {})",
+                         forLoop.var.name,
+                         cmpStr,
+                         getLatestSourceText(forLoop.range.end, rewriter))
+                   .value();
+
+    auto& stmt = forLoop.stmt;
+    if (!llvm::isa<clang::CompoundStmt>(stmt.getBody())) {
+        ++openedScopeCounter;
+        res += " {\n";
+    }
+
+    return res;
+}
+
+std::string buildPreffixTiledCode(const OklLoopInfo& forLoop,
+                                  const TileParams* tileParams,
+                                  int& openedScopeCounter,
+                                  oklt::Rewriter& rewriter) {
+    std::string res;
+    res += buildLoopIdxLine(forLoop, tileParams, LoopOrder::First, openedScopeCounter, rewriter);
+    res += buildLoopIdxLine(forLoop, tileParams, LoopOrder::Second, openedScopeCounter, rewriter);
+    res += buildCheckLine(forLoop, tileParams, openedScopeCounter, rewriter);
+    return res;
+}
+
+HandleResult handleTileAttribute(SessionStage& s,
+                                 const clang::ForStmt& forStmt,
+                                 const clang::Attr& a,
+                                 const TileParams* params) {
+    SPDLOG_DEBUG("Handle [@tile] attribute");
+
+    if (!params) {
+        return tl::make_unexpected(Error{std::error_code(), "@tile params nullptr"});
+    }
+
+    auto& astCtx = s.getCompiler().getASTContext();
+    auto& sema = s.tryEmplaceUserCtx<OklSemaCtx>();
+    auto loopInfo = sema.getLoopInfo(forStmt);
+    if (!loopInfo) {
+        return tl::make_unexpected(Error{{}, "@tile: failed to fetch loop meta data from sema"});
+    }
+
+    auto updatedParams = *params;
+    // Auto Axis in loopInfo are replaced with specific. TODO: maybe somehow update params earlier?
+    updatedParams.firstLoop.axis = loopInfo->axis[0];
+    updatedParams.secondLoop.axis = loopInfo->axis[1];
+
+    int openedScopeCounter = 0;
+    auto prefixCode =
+        buildPreffixTiledCode(*loopInfo, &updatedParams, openedScopeCounter, s.getRewriter());
+    auto suffixCode = buildCloseScopes(openedScopeCounter);
+    std::string afterRBraceCode = "";
+    if (loopInfo->shouldSync()) {
+        afterRBraceCode += opencl::SYNC_THREADS_BARRIER + ";";
+    }
+
+    handleChildAttr(s, forStmt, NO_BARRIER_ATTR_NAME);
+
+    return replaceAttributedLoop(s, forStmt, a, suffixCode, afterRBraceCode, prefixCode, false);
+}
+
+__attribute__((constructor)) void registerOpenclTileAttrBackend() {
+    auto ok = registerBackendHandler(TargetBackend::OPENCL, TILE_ATTR_NAME, handleTileAttribute);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register {} attribute handler", TILE_ATTR_NAME);
+    }
+}
+}  // namespace
diff --git a/lib/attributes/backend/opencl/translation_unit.cpp b/lib/attributes/backend/opencl/translation_unit.cpp
new file mode 100644
index 00000000..989f942c
--- /dev/null
+++ b/lib/attributes/backend/opencl/translation_unit.cpp
@@ -0,0 +1,23 @@
+#include "attributes/utils/replace_attribute.h"
+#include "core/handler_manager/implicid_handler.h"
+
+#include <spdlog/spdlog.h>
+
+namespace {
+using namespace oklt;
+using namespace clang;
+
+const std::string_view OPENCL_PRAGMA = "#pragma OPENCL EXTENSON cl_khr_fp64 : enable";
+
+HandleResult handleTranslationUnitOpencl(SessionStage& s, const clang::TranslationUnitDecl& decl) {
+    return oklt::handleTranslationUnit(s, decl, {}, {OPENCL_PRAGMA});
+}
+
+__attribute__((constructor)) void registerTranslationUnitAttrBackend() {
+    auto ok = registerImplicitHandler(TargetBackend::OPENCL, handleTranslationUnitOpencl);
+
+    if (!ok) {
+        SPDLOG_ERROR("[OPENCL] Failed to register implicit handler for translation unit");
+    }
+}
+}  // namespace
diff --git a/lib/attributes/utils/replace_attribute.cpp b/lib/attributes/utils/replace_attribute.cpp
index 96b31fe9..a0041f7b 100644
--- a/lib/attributes/utils/replace_attribute.cpp
+++ b/lib/attributes/utils/replace_attribute.cpp
@@ -140,10 +140,15 @@ HandleResult handleCXXRecord(SessionStage& s,
 HandleResult handleTranslationUnit(SessionStage& s,
                                    const clang::TranslationUnitDecl& decl,
                                    std::vector<std::string_view> headers,
+                                   std::vector<std::string_view> defines,
                                    std::vector<std::string_view> ns) {
     SPDLOG_DEBUG("Handle translation unit");
 
     auto& deps = s.tryEmplaceUserCtx<HeaderDepsInfo>();
+    for (auto define : defines) {
+        deps.backendDefines.emplace_back(std::string(define) + "\n\n");
+    }
+    
     for (auto header : headers) {
         deps.backendHeaders.emplace_back("#include " + std::string(header) + "\n");
     }
diff --git a/lib/attributes/utils/replace_attribute.h b/lib/attributes/utils/replace_attribute.h
index 0ac1637f..9fc39b12 100644
--- a/lib/attributes/utils/replace_attribute.h
+++ b/lib/attributes/utils/replace_attribute.h
@@ -37,5 +37,6 @@ HandleResult handleCXXRecord(SessionStage&,
 HandleResult handleTranslationUnit(SessionStage& s,
                                    const clang::TranslationUnitDecl& decl,
                                    std::vector<std::string_view> headers,
+                                   std::vector<std::string_view> defines = {},
                                    std::vector<std::string_view> ns = {});
 }  // namespace oklt
diff --git a/lib/core/target_backends.cpp b/lib/core/target_backends.cpp
index 9a435edf..2a0359d0 100644
--- a/lib/core/target_backends.cpp
+++ b/lib/core/target_backends.cpp
@@ -12,6 +12,7 @@ tl::expected<TargetBackend, std::string> backendFromString(const std::string& ty
         {"cuda", TargetBackend::CUDA},
         {"hip", TargetBackend::HIP},
         {"dpcpp", TargetBackend::DPCPP},
+        {"opencl", TargetBackend::OPENCL},
         {"launcher", TargetBackend::_LAUNCHER},
     };
 
@@ -34,6 +35,8 @@ std::string backendToString(TargetBackend backend) {
             return std::string{"hip"};
         case TargetBackend::DPCPP:
             return std::string{"dpcpp"};
+        case TargetBackend::OPENCL:
+            return std::string{"opencl"};
         case TargetBackend::_LAUNCHER:
             return std::string{"launcher"};
     }
@@ -55,6 +58,7 @@ bool isDeviceCategory(TargetBackend backend) {
         case TargetBackend::CUDA:
         case TargetBackend::HIP:
         case TargetBackend::DPCPP:
+        case TargetBackend::OPENCL:
             return true;
         default:
             return false;
diff --git a/lib/core/transpiler_session/code_generator.cpp b/lib/core/transpiler_session/code_generator.cpp
index 59dc2a3c..bd48ccff 100644
--- a/lib/core/transpiler_session/code_generator.cpp
+++ b/lib/core/transpiler_session/code_generator.cpp
@@ -176,6 +176,10 @@ std::string restoreSystemAndBackendHeaders(
         }
         input.insert(0, "#include <" + dep.fileName + ">\n");
     }
+    
+    for (auto it = deps.backendDefines.rbegin(); it < deps.backendDefines.rend(); ++it) {
+        input.insert(0, *it);
+    }
 
     return input;
 }
diff --git a/lib/core/transpiler_session/header_info.h b/lib/core/transpiler_session/header_info.h
index 68ecc1dc..60d420c8 100644
--- a/lib/core/transpiler_session/header_info.h
+++ b/lib/core/transpiler_session/header_info.h
@@ -30,6 +30,7 @@ using HeaderIncStack = std::vector<HeaderDep>;
 struct HeaderDepsInfo {
     std::vector<HeaderDep> topLevelDeps;
     std::vector<std::string> backendHeaders;
+    std::vector<std::string> backendDefines;
     std::vector<std::string> backendNss;
     bool useOklIntrinsic = false;
 };
diff --git a/script/regenerate_test_ref.py b/script/regenerate_test_ref.py
index 63876c91..dc44af78 100644
--- a/script/regenerate_test_ref.py
+++ b/script/regenerate_test_ref.py
@@ -9,7 +9,8 @@ class Backend(Enum):
     CUDA = 2
     HIP = 3
     DPCPP = 4
-    LAUNCHER = 5
+    OPENCL = 5
+    LAUNCHER = 6
 
     def from_str(s: str) -> "Backend":
         s = s.lower()
@@ -23,6 +24,8 @@ def from_str(s: str) -> "Backend":
             return Backend.CUDA
         if s == "hip":
             return Backend.HIP
+        if s == "opencl":
+            return Backend.OPENCL
         if s == "launcher":
             return Backend.LAUNCHER
 
@@ -37,6 +40,8 @@ def to_str(self) -> str:
             return "cuda"
         if self == Backend.HIP:
             return "hip"
+        if self == Backend.OPENCL:
+            return "opencl"
         if self == Backend.LAUNCHER:
             return "launcher"
 
@@ -63,7 +68,7 @@ def main(occa_tool: str, data_path: str, backend: Backend, verbose: bool):
         "--data", "-d", type=str, required=True, help="Test data directory path"
     )
     parser.add_argument(
-        "--backend", "-b", type=str, required=True, help="serial/openmp/cuda/hip/dpcppp"
+        "--backend", "-b", type=str, required=True, help="serial/openmp/cuda/hip/dpcppp/opencl"
     )
     parser.add_argument(
         "--verbose", "-v", default=False, action="store_const", const=True
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/atomic.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/atomic.json
new file mode 100644
index 00000000..dcab9e29
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/atomic.json
@@ -0,0 +1,101 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_add.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_add_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_sub.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_sub_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_exch.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_exch_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_and.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_and_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_or.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_or_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_xor.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_xor_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_inc.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_inc_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/atomic_dec.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/atomic_dec_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/atomic/issue_case.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/atomic/issue_case_ref.cpp"
+     }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/barrier.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/barrier.json
new file mode 100644
index 00000000..8c781419
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/barrier.json
@@ -0,0 +1,24 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/barrier/barrier_builtin.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/barrier/barrier_builtin_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/barrier/barrier_warp.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/barrier/barrier_warp_ref.cpp"
+     }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/cxx_record.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/cxx_record.json
new file mode 100644
index 00000000..18acf76c
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/cxx_record.json
@@ -0,0 +1,25 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/cxx_record/class_struct_template.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/cxx_record/class_struct_template_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/cxx_record/class_template_specialization.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/cxx_record/class_template_specialization_ref.cpp"
+     }
+
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/exclusive.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/exclusive.json
new file mode 100644
index 00000000..d565faf2
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/exclusive.json
@@ -0,0 +1,25 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/exclusive/exclusive_builtin.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/exclusive/exclusive_builtin_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/exclusive/exclusive_in_typedecl.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/exclusive/exclusive_in_typedecl_ref.cpp"
+     }
+
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/implicit.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/implicit.json
new file mode 100644
index 00000000..12bfc01b
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/implicit.json
@@ -0,0 +1,68 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/const_global_const_size_array.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/const_global_const_size_array_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/const_global_pointer.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/const_global_pointer_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/const_global_variable.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/const_global_variable_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/constexpr.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/constexpr_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/extern_const_global_array.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/extern_const_global_array_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/implicit/non_kernel_function.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/implicit/non_kernel_function_ref.cpp"
+    }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/inner_outer.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/inner_outer.json
new file mode 100644
index 00000000..68e438c7
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/inner_outer.json
@@ -0,0 +1,57 @@
+[
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/outer_inner/outer_inner_inc.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/outer_inner/outer_inner_inc_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/outer_inner/outer_inner_dec.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/outer_inner/outer_inner_dec_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/outer_inner/outer_inner_multiple.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/outer_inner/outer_inner_multiple_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/outer_inner/outer_inner_split.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/outer_inner/outer_inner_split_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp"
+  }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/max_inner_dims.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/max_inner_dims.json
new file mode 100644
index 00000000..58fbe35e
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/max_inner_dims.json
@@ -0,0 +1,13 @@
+[
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/max_inner_loops/outer_inner_split_max.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/max_inner_loops/outer_inner_split_max_ref.cpp"
+  }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/nobarrier.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/nobarrier.json
new file mode 100644
index 00000000..1dd4dc2d
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/nobarrier.json
@@ -0,0 +1,13 @@
+[
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/nobarrier/nobarrier_builtin.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/nobarrier/nobarrier_builtin_ref.cpp"
+  }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/restrict.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/restrict.json
new file mode 100644
index 00000000..df105dc7
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/restrict.json
@@ -0,0 +1,47 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/restrict/restrict_builtin_types.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/restrict/restrict_builtin_types_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/restrict/restrict_complex_types.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/restrict/restrict_complex_types_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/restrict/restrict_namespaced_types.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/restrict/restrict_namespaced_types_ref.cpp"
+     },
+     {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/restrict/restrict_return_type.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/restrict/restrict_return_type_ref.cpp"
+     }
+
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/shared.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/shared.json
new file mode 100644
index 00000000..110bd45d
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/shared.json
@@ -0,0 +1,57 @@
+[
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/shared/shared_builtin_types.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/shared/shared_builtin_types_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/shared/shared_struct_types.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/shared/shared_struct_types_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/shared/shared_template_type.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/shared/shared_template_type_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/shared/shared_between_tiles.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/shared/shared_between_tiles_ref.cpp"
+    },
+    {
+        "action": "normalize_and_transpile",
+        "action_config": {
+            "backend": "opencl",
+            "source": "transpiler/backends/opencl/shared/shared_in_typedecl.cpp",
+            "includes": [],
+            "defs": [],
+            "launcher": ""
+        },
+        "reference": "transpiler/backends/opencl/shared/shared_in_typedecl_ref.cpp"
+    }
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/suite.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/suite.json
new file mode 100644
index 00000000..5f78239e
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/suite.json
@@ -0,0 +1,13 @@
+[
+    "tile.json",
+    "inner_outer.json",
+    "max_inner_dims.json",
+    "shared.json",
+    "restrict.json",
+    "atomic.json",
+    "barrier.json",
+    "nobarrier.json",
+    "exclusive.json",
+    "cxx_record.json",
+    "implicit.json"
+]
diff --git a/tests/functional/configs/test_suite_transpiler/backends/opencl/tile.json b/tests/functional/configs/test_suite_transpiler/backends/opencl/tile.json
new file mode 100644
index 00000000..aa860243
--- /dev/null
+++ b/tests/functional/configs/test_suite_transpiler/backends/opencl/tile.json
@@ -0,0 +1,46 @@
+[
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/tile/outer_inner_inc.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/tile/outer_inner_inc_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/tile/outer_inner_dec.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/tile/outer_inner_dec_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/tile/outer_inner_regular_inc.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/tile/outer_inner_regular_inc_ref.cpp"
+  },
+  {
+    "action": "normalize_and_transpile",
+    "action_config": {
+      "backend": "opencl",
+      "source": "transpiler/backends/opencl/tile/outer_inner_regular_dec.cpp",
+      "includes": [],
+      "defs": [],
+      "launcher": ""
+    },
+    "reference": "transpiler/backends/opencl/tile/outer_inner_regular_dec_ref.cpp"
+  }
+]
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add.cpp
new file mode 100644
index 00000000..75cf8b59
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add.cpp
@@ -0,0 +1,40 @@
+
+@kernel void atomic_add_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* iSum += iVec[0];
+            @atomic* fSum += fVec[0];
+        }
+    }
+}
+
+struct ComplexTypeF32 {
+    float real;
+    float imag;
+};
+
+
+@kernel void atomic_add_struct(const ComplexTypeF32* vec, ComplexTypeF32* sum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic sum->real += vec[0].real;
+            @atomic sum->imag += vec[0].imag;
+        }
+    }
+}
+
+template <class T>
+struct ComplexType {
+    T real;
+    T imag;
+};
+
+
+@kernel void atomic_add_template(const ComplexType<float>* vec, ComplexType<float>* sum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic sum->real += vec[0].real;
+            @atomic sum->imag += vec[0].imag;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add_ref.cpp
new file mode 100644
index 00000000..eb3464e7
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_add_ref.cpp
@@ -0,0 +1,62 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_builtin_0(__global const int *iVec, __global int *iSum,
+                           __global const float *fVec, __global float *fSum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_builtin_0(__global const int *iVec, __global int *iSum,
+                           __global const float *fVec, __global float *fSum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *iSum += iVec[0];
+      *fSum += fVec[0];
+    }
+  }
+}
+
+struct ComplexTypeF32 {
+  float real;
+  float imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_struct_0(__global const ComplexTypeF32 *vec,
+                          __global ComplexTypeF32 *sum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_struct_0(__global const ComplexTypeF32 *vec,
+                          __global ComplexTypeF32 *sum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      sum->real += vec[0].real;
+      sum->imag += vec[0].imag;
+    }
+  }
+}
+
+template <class T> struct ComplexType {
+  T real;
+  T imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_template_0(__global const ComplexType<float> *vec,
+                            __global ComplexType<float> *sum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_add_template_0(__global const ComplexType<float> *vec,
+                            __global ComplexType<float> *sum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      sum->real += vec[0].real;
+      sum->imag += vec[0].imag;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and.cpp
new file mode 100644
index 00000000..6947d313
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and.cpp
@@ -0,0 +1,23 @@
+
+@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* mask &= masks[0];
+        }
+    }
+}
+
+struct ComplexMaskType {
+    unsigned int mask1;
+    unsigned int mask2;
+};
+
+
+@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic mask->mask1 &= masks[0].mask1;
+            @atomic mask->mask2 &= masks[0].mask2;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and_ref.cpp
new file mode 100644
index 00000000..792a0c57
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_and_ref.cpp
@@ -0,0 +1,39 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *mask &= masks[0];
+    }
+  }
+}
+
+struct ComplexMaskType {
+  unsigned int mask1;
+  unsigned int mask2;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      mask->mask1 &= masks[0].mask1;
+      mask->mask2 &= masks[0].mask2;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec.cpp
new file mode 100644
index 00000000..c2646fe8
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec.cpp
@@ -0,0 +1,24 @@
+
+@kernel void atomic_dec_builtin(unsigned int* value) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic--(*value);
+        }
+    }
+    // @atomic (*value)--; normalizer issue
+}
+
+struct ComplexMaskType {
+    unsigned int val1;
+    int val2;
+};
+
+
+@kernel void atomic_dec_struct(ComplexMaskType* value) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic-- value->val1;
+            @atomic value->val2--;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec_ref.cpp
new file mode 100644
index 00000000..1637e838
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_dec_ref.cpp
@@ -0,0 +1,35 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_dec_builtin_0(__global unsigned int *value);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_dec_builtin_0(__global unsigned int *value) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      --(*value);
+    }
+  }
+}
+
+struct ComplexMaskType {
+  unsigned int val1;
+  int val2;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_dec_struct_0(__global ComplexMaskType *value);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_dec_struct_0(__global ComplexMaskType *value) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      --value->val1;
+      value->val2--;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch.cpp
new file mode 100644
index 00000000..69b91d73
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch.cpp
@@ -0,0 +1,38 @@
+
+@kernel void atomic_exch_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* iSum = iVec[0];
+            @atomic* fSum = fVec[0];
+        }
+    }
+}
+
+struct ComplexTypeF32 {
+    float real;
+    float imag;
+};
+
+
+@kernel void atomic_exch_struct(const ComplexTypeF32* vec, ComplexTypeF32* result) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* result = vec[0];
+        }
+    }
+}
+
+template <class T>
+struct ComplexType {
+    T real;
+    T imag;
+};
+
+
+@kernel void atomic_exch_template(const ComplexType<float>* vec, ComplexType<float>* result) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* result = vec[0];
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch_ref.cpp
new file mode 100644
index 00000000..025ff8fd
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_exch_ref.cpp
@@ -0,0 +1,60 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_builtin_0(__global const int *iVec, __global int *iSum,
+                            __global const float *fVec, __global float *fSum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_builtin_0(__global const int *iVec, __global int *iSum,
+                            __global const float *fVec, __global float *fSum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *iSum = iVec[0];
+      *fSum = fVec[0];
+    }
+  }
+}
+
+struct ComplexTypeF32 {
+  float real;
+  float imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_struct_0(__global const ComplexTypeF32 *vec,
+                           __global ComplexTypeF32 *result);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_struct_0(__global const ComplexTypeF32 *vec,
+                           __global ComplexTypeF32 *result) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *result = vec[0];
+    }
+  }
+}
+
+template <class T> struct ComplexType {
+  T real;
+  T imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_template_0(__global const ComplexType<float> *vec,
+                             __global ComplexType<float> *result);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_template_0(__global const ComplexType<float> *vec,
+                             __global ComplexType<float> *result) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *result = vec[0];
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc.cpp
new file mode 100644
index 00000000..76cf8ba2
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc.cpp
@@ -0,0 +1,24 @@
+
+@kernel void atomic_inc_builtin(unsigned int* value) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic++(*value);
+        }
+    }
+    // @atomic (*value)++; normalizer issue
+}
+
+struct ComplexMaskType {
+    unsigned int val1;
+    int val2;
+};
+
+
+@kernel void atomic_inc_struct(ComplexMaskType* value) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic++ value->val1;
+            @atomic value->val2++;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc_ref.cpp
new file mode 100644
index 00000000..04db2c44
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_inc_ref.cpp
@@ -0,0 +1,35 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_inc_builtin_0(__global unsigned int *value);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_inc_builtin_0(__global unsigned int *value) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      ++(*value);
+    }
+  }
+}
+
+struct ComplexMaskType {
+  unsigned int val1;
+  int val2;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_inc_struct_0(__global ComplexMaskType *value);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_inc_struct_0(__global ComplexMaskType *value) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      ++value->val1;
+      value->val2++;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or.cpp
new file mode 100644
index 00000000..8d10d88d
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or.cpp
@@ -0,0 +1,23 @@
+
+@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* mask |= masks[0];
+        }
+    }
+}
+
+struct ComplexMaskType {
+    unsigned int mask1;
+    unsigned int mask2;
+};
+
+
+@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic mask->mask1 |= masks[0].mask1;
+            @atomic mask->mask2 |= masks[0].mask2;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or_ref.cpp
new file mode 100644
index 00000000..3863aa62
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_or_ref.cpp
@@ -0,0 +1,39 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *mask |= masks[0];
+    }
+  }
+}
+
+struct ComplexMaskType {
+  unsigned int mask1;
+  unsigned int mask2;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      mask->mask1 |= masks[0].mask1;
+      mask->mask2 |= masks[0].mask2;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub.cpp
new file mode 100644
index 00000000..b87a11d7
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub.cpp
@@ -0,0 +1,40 @@
+
+@kernel void atomic_sub_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* iSum -= iVec[0];
+            @atomic* fSum -= fVec[0];
+        }
+    }
+}
+
+struct ComplexTypeF32 {
+    float real;
+    float imag;
+};
+
+
+@kernel void atomic_sub_struct(const ComplexTypeF32* vec, ComplexTypeF32* sum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic sum->real -= vec[0].real;
+            @atomic sum->imag -= vec[0].imag;
+        }
+    }
+}
+
+template <class T>
+struct ComplexType {
+    T real;
+    T imag;
+};
+
+
+@kernel void atomic_sub_template(const ComplexType<float>* vec, ComplexType<float>* sum) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic sum->real -= vec[0].real;
+            @atomic sum->imag -= vec[0].imag;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub_ref.cpp
new file mode 100644
index 00000000..05b3649b
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_sub_ref.cpp
@@ -0,0 +1,62 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_builtin_0(__global const int *iVec, __global int *iSum,
+                           __global const float *fVec, __global float *fSum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_builtin_0(__global const int *iVec, __global int *iSum,
+                           __global const float *fVec, __global float *fSum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *iSum -= iVec[0];
+      *fSum -= fVec[0];
+    }
+  }
+}
+
+struct ComplexTypeF32 {
+  float real;
+  float imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_struct_0(__global const ComplexTypeF32 *vec,
+                          __global ComplexTypeF32 *sum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_struct_0(__global const ComplexTypeF32 *vec,
+                          __global ComplexTypeF32 *sum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      sum->real -= vec[0].real;
+      sum->imag -= vec[0].imag;
+    }
+  }
+}
+
+template <class T> struct ComplexType {
+  T real;
+  T imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_template_0(__global const ComplexType<float> *vec,
+                            __global ComplexType<float> *sum);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_sub_template_0(__global const ComplexType<float> *vec,
+                            __global ComplexType<float> *sum) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      sum->real -= vec[0].real;
+      sum->imag -= vec[0].imag;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor.cpp
new file mode 100644
index 00000000..9fd5489e
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor.cpp
@@ -0,0 +1,23 @@
+
+@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* mask ^= masks[0];
+        }
+    }
+}
+
+struct ComplexMaskType {
+    unsigned int mask1;
+    unsigned int mask2;
+};
+
+
+@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic mask->mask1 ^= masks[0].mask1;
+            @atomic mask->mask2 ^= masks[0].mask2;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor_ref.cpp
new file mode 100644
index 00000000..8e2e849c
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/atomic_xor_ref.cpp
@@ -0,0 +1,39 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_builtin_0(__global const unsigned int *masks,
+                           __global unsigned int *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *mask ^= masks[0];
+    }
+  }
+}
+
+struct ComplexMaskType {
+  unsigned int mask1;
+  unsigned int mask2;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_and_struct_0(__global const ComplexMaskType *masks,
+                          __global ComplexMaskType *mask) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      mask->mask1 ^= masks[0].mask1;
+      mask->mask2 ^= masks[0].mask2;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/issue_case.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/issue_case.cpp
new file mode 100644
index 00000000..10906213
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/issue_case.cpp
@@ -0,0 +1,15 @@
+
+struct ComplexTypeF32 {
+    ComplexTypeF32& operator=(const ComplexTypeF32&) = default;
+    float real;
+    float imag;
+};
+
+
+@kernel void atomic_exch_struct(const ComplexTypeF32* vec, ComplexTypeF32* result) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @atomic* result = vec[0];
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/atomic/issue_case_ref.cpp b/tests/functional/data/transpiler/backends/opencl/atomic/issue_case_ref.cpp
new file mode 100644
index 00000000..47557377
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/atomic/issue_case_ref.cpp
@@ -0,0 +1,23 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+struct ComplexTypeF32 {
+  ComplexTypeF32 &operator=(const ComplexTypeF32 &) = default;
+  float real;
+  float imag;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_struct_0(__global const ComplexTypeF32 *vec,
+                           __global ComplexTypeF32 *result);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_atomic_exch_struct_0(__global const ComplexTypeF32 *vec,
+                           __global ComplexTypeF32 *result) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      *result = vec[0];
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin.cpp b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin.cpp
new file mode 100644
index 00000000..ed6ee3d9
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin.cpp
@@ -0,0 +1,19 @@
+static float add(const float* a, int i, const float* b, int j) {
+    return a[i] + b[j];
+}
+
+// TODO: fix preprocessor handling and try with define
+#define BLOCK_SIZE 4
+
+@kernel void addVectors(const int N, const float* a, const float* b, float* ab) {
+    @outer for (int i = 0; i < N; i += BLOCK_SIZE) {
+        @shared float s_b[BLOCK_SIZE];
+        const float* g_a = a;
+        @inner for (int j = 0; j < BLOCK_SIZE; ++j) {
+            s_b[j] = b[i + j];
+            @barrier;
+
+            ab[i + j] = add(g_a, i + j, s_b, j);
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin_ref.cpp b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin_ref.cpp
new file mode 100644
index 00000000..a4047d8d
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_builtin_ref.cpp
@@ -0,0 +1,28 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+static float add(const float *a, int i, const float *b, int j);
+
+static float add(const float *a, int i, const float *b, int j) {
+  return a[i] + b[j];
+}
+
+// TODO: fix preprocessor handling and try with define
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors_0(const int N, __global const float *a,
+                   __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors_0(const int N, __global const float *a,
+                   __global const float *b, __global float *ab) {
+  {
+    int i = (0) + ((4) * get_group_id(0));
+    __local float s_b[4];
+    const float *g_a = a;
+    {
+      int j = (0) + get_local_id(0);
+      s_b[j] = b[i + j];
+      barrier(CLK_LOCAL_MEM_FENCE);
+      ab[i + j] = add(g_a, i + j, s_b, j);
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp.cpp b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp.cpp
new file mode 100644
index 00000000..86b7e1c3
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp.cpp
@@ -0,0 +1,7 @@
+@kernel void test_kern() {
+    @outer for (int i = 0; i < 10; ++i) {
+        @inner for (int j = 0; j < 10; ++j) {
+            @barrier("warp");
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp_ref.cpp b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp_ref.cpp
new file mode 100644
index 00000000..9205f1d7
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/barrier/barrier_warp_ref.cpp
@@ -0,0 +1,15 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template.cpp b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template.cpp
new file mode 100644
index 00000000..69a2f239
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template.cpp
@@ -0,0 +1,25 @@
+template <typename T>
+struct ComplexType {
+    T v1;
+    T v2;
+    T calc();
+    ComplexType(T in)
+        : v1(in),
+          v2(in) {}
+    template <typename U>
+    U calc(T in);
+};
+
+struct ComplexTypeFloat {
+    float v1;
+    float v2;
+    float calc();
+    template <typename T>
+    ComplexTypeFloat(T in);
+};
+
+@kernel void reductionWithSharedMemory(const int entries, const float* vec) {
+    @tile(16, @outer, @inner) for (int i = 0; i < entries; ++i) {
+        auto tmp = vec[i];
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template_ref.cpp b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template_ref.cpp
new file mode 100644
index 00000000..05e969e2
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_struct_template_ref.cpp
@@ -0,0 +1,35 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+template <typename T> struct ComplexType {
+  T v1;
+  T v2;
+  T calc();
+
+  ComplexType(T in) : v1(in), v2(in) {}
+
+  template <typename U> U calc(T in);
+};
+
+struct ComplexTypeFloat {
+  float v1;
+  float v2;
+  float calc();
+  template <typename T> ComplexTypeFloat(T in);
+};
+
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+_occa_reductionWithSharedMemory_0(const int entries, __global const float *vec);
+
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+_occa_reductionWithSharedMemory_0(const int entries,
+                                  __global const float *vec) {
+  {
+    int _occa_tiled_i = (0) + ((16) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + get_local_id(0);
+      if (i < entries) {
+        auto tmp = vec[i];
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization.cpp b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization.cpp
new file mode 100644
index 00000000..ba57cbb9
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization.cpp
@@ -0,0 +1,27 @@
+template<int aa, int bb>
+class HelloClass;
+
+template<int bb>
+class HelloClass<0, bb> {
+public:
+    static inline void myfn() { }
+};
+
+template<int bb>
+class HelloClassFull {
+public:
+    inline void myfn() { }
+};
+
+
+template<>
+class HelloClassFull<0> {
+public:
+    inline void myfn() { }
+};
+
+@kernel void reductionWithSharedMemory(const int entries, const float* vec) {
+    @tile(16, @outer, @inner) for (int i = 0; i < entries; ++i) {
+        auto tmp = vec[i];
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization_ref.cpp b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization_ref.cpp
new file mode 100644
index 00000000..e4b5f9f1
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/cxx_record/class_template_specialization_ref.cpp
@@ -0,0 +1,35 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+template <int aa, int bb> class HelloClass;
+
+template <int bb> class HelloClass<0, bb> {
+public:
+  static inline void myfn() {}
+};
+
+template <int bb> class HelloClassFull {
+public:
+  inline void myfn() {}
+};
+
+template <> class HelloClassFull<0> {
+public:
+  inline void myfn() {}
+};
+
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+_occa_reductionWithSharedMemory_0(const int entries, __global const float *vec);
+
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+_occa_reductionWithSharedMemory_0(const int entries,
+                                  __global const float *vec) {
+  {
+    int _occa_tiled_i = (0) + ((16) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + get_local_id(0);
+      if (i < entries) {
+        auto tmp = vec[i];
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin.cpp b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin.cpp
new file mode 100644
index 00000000..e52b9df5
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin.cpp
@@ -0,0 +1,20 @@
+static float add(const float* a, int i, const float* b, int j) {
+    return a[i] + b[j];
+}
+
+// TODO: fix preprocessor handling and try with define
+// #define BLOCK_SIZE 4
+const int BLOCK_SIZE = 4;
+
+@kernel void addVectors(const int N, const float* a, const float* b, float* ab) {
+    @outer for (int i = 0; i < N; i += BLOCK_SIZE) {
+        @shared float s_b[BLOCK_SIZE];
+        @exclusive const float* g_a = a;
+        @inner for (int j = 0; j < BLOCK_SIZE; ++j) {
+            s_b[j] = b[i + j];
+            @barrier;
+
+            ab[i + j] = add(g_a, i + j, s_b, j);
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin_ref.cpp b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin_ref.cpp
new file mode 100644
index 00000000..28d2cc12
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_builtin_ref.cpp
@@ -0,0 +1,30 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+static float add(const float *a, int i, const float *b, int j);
+
+static float add(const float *a, int i, const float *b, int j) {
+  return a[i] + b[j];
+}
+
+// TODO: fix preprocessor handling and try with define
+// #define BLOCK_SIZE 4
+__constant int BLOCK_SIZE = 4;
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors_0(const int N, __global const float *a,
+                   __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors_0(const int N, __global const float *a,
+                   __global const float *b, __global float *ab) {
+  {
+    int i = (0) + ((BLOCK_SIZE)*get_group_id(0));
+    __local float s_b[BLOCK_SIZE];
+    const float *g_a = a;
+    {
+      int j = (0) + get_local_id(0);
+      s_b[j] = b[i + j];
+      barrier(CLK_LOCAL_MEM_FENCE);
+      ab[i + j] = add(g_a, i + j, s_b, j);
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl.cpp b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl.cpp
new file mode 100644
index 00000000..b6575482
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl.cpp
@@ -0,0 +1,10 @@
+typedef float ex_float32_t @exclusive;
+
+@kernel void test_kernel() {
+    @outer for (int i = 0; i < 32; ++i) {
+        ex_float32_t d[32];
+        @inner for (int j = 0; j < 32; ++j) {
+            d[j] = i-j;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl_ref.cpp b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl_ref.cpp
new file mode 100644
index 00000000..60920e78
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/exclusive/exclusive_in_typedecl_ref.cpp
@@ -0,0 +1,17 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+typedef float ex_float32_t;
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_test_kernel_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_test_kernel_0() {
+  {
+    int i = (0) + get_group_id(0);
+    ex_float32_t d[32];
+    {
+      int j = (0) + get_local_id(0);
+      d[j] = i - j;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array.cpp
new file mode 100644
index 00000000..7d80983a
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array.cpp
@@ -0,0 +1,20 @@
+// const array
+const int arr_const0[12] = {0};
+int const arr_const1[12] = {0};
+
+// Stupid formatting
+const int arr_const2[12] = {0};
+
+// Deduced size
+const float arr_const3[] = {1., 2., 3., 4., 5., 6.};
+
+// Multidimensional
+const float arr_const4[][2] = {{1., 2.}, {3., 4.}, {5., 6.}};
+const float arr_const5[][3][2] = {{{1., 2.}, {3., 4.}, {5., 6.}}, {{1., 2.}, {3., 4.}, {5., 6.}}};
+
+// At least one @kern function is requried
+@kernel void kern () {
+    @outer for (int i = 0; i < 32; ++i) {
+        @inner for (int j = 0; j < 32; ++j) {}
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array_ref.cpp
new file mode 100644
index 00000000..df1eb7a5
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_const_size_array_ref.cpp
@@ -0,0 +1,22 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+// const array
+__constant int arr_const0[12] = {0};
+__constant int arr_const1[12] = {0};
+// Stupid formatting
+__constant int arr_const2[12] = {0};
+// Deduced size
+__constant float arr_const3[] = {1., 2., 3., 4., 5., 6.};
+// Multidimensional
+__constant float arr_const4[][2] = {{1., 2.}, {3., 4.}, {5., 6.}};
+__constant float arr_const5[][3][2] = {{{1., 2.}, {3., 4.}, {5., 6.}},
+                                       {{1., 2.}, {3., 4.}, {5., 6.}}};
+// At least one @kern function is requried
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer.cpp
new file mode 100644
index 00000000..ae1b3229
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer.cpp
@@ -0,0 +1,20 @@
+// pointer to const
+const int* ptr_const0 = 0;
+int const* ptr_const1 = 0;
+
+// const pointer to const
+const int* const ptr_const2 = 0;
+int const* const ptr_const3 = 0;
+
+// const pointer to non const
+int* const ptr_const4 = 0;
+
+// Stupid formatting
+const int* ptr_const5 = 0;
+
+// At least one @kern function is requried
+@kernel void kern () {
+    @outer for (int i = 0; i < 32; ++i) {
+        @inner for (int j = 0; j < 32; ++j) {}
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer_ref.cpp
new file mode 100644
index 00000000..76bd9766
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_pointer_ref.cpp
@@ -0,0 +1,21 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+// pointer to const
+__constant int *ptr_const0 = 0;
+__constant int *ptr_const1 = 0;
+// const pointer to const
+__constant int *const ptr_const2 = 0;
+__constant int *const ptr_const3 = 0;
+// const pointer to non const
+int *const ptr_const4 = 0;
+// Stupid formatting
+__constant int *ptr_const5 = 0;
+// At least one @kern function is requried
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable.cpp
new file mode 100644
index 00000000..fa5e086a
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable.cpp
@@ -0,0 +1,19 @@
+// int const, const int
+const int var_const0 = 0;
+int const var_const1 = 0;
+
+// volatile qualifier
+volatile const int var_const2 = 0;
+volatile int const var_const3 = 0;
+
+// Stupid formatting
+const int var_const4 = 0;
+
+int const var_const5 = 0;
+
+// At least one @kern function is requried
+@kernel void kern () {
+    @outer for (int i = 0; i < 32; ++i) {
+        @inner for (int j = 0; j < 32; ++j) {}
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable_ref.cpp
new file mode 100644
index 00000000..17446176
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/const_global_variable_ref.cpp
@@ -0,0 +1,20 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+// int const, const int
+__constant int var_const0 = 0;
+__constant int var_const1 = 0;
+// volatile qualifier
+volatile __constant int var_const2 = 0;
+volatile __constant int var_const3 = 0;
+// Stupid formatting
+__constant int var_const4 = 0;
+__constant int var_const5 = 0;
+// At least one @kern function is requried
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/constexpr.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/constexpr.cpp
new file mode 100644
index 00000000..580e8fbd
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/constexpr.cpp
@@ -0,0 +1,13 @@
+constexpr float f = 13;
+
+class HelloClass {
+public:
+    static constexpr int a = 2 + 2;
+};
+
+@kernel void test() {
+    for (int i = 0; i < 10; ++i; @outer) {
+        for (int j = 0; j < 10; ++j; @inner) {
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/constexpr_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/constexpr_ref.cpp
new file mode 100644
index 00000000..62ac0cfb
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/constexpr_ref.cpp
@@ -0,0 +1,17 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+constexpr float f = 13;
+
+class HelloClass {
+public:
+  static constexpr int a = 2 + 2;
+};
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void _occa_test_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void _occa_test_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array.cpp
new file mode 100644
index 00000000..065efe72
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array.cpp
@@ -0,0 +1,13 @@
+struct S {
+    int hello[12];
+};
+extern const int arr_0[];
+extern const float arr_1[];
+extern const S arr_2[];
+
+// At least one @kern function is requried
+@kernel void kern () {
+    @outer for (int i = 0; i < 32; ++i) {
+        @inner for (int j = 0; j < 32; ++j) {}
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array_ref.cpp
new file mode 100644
index 00000000..461af978
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/extern_const_global_array_ref.cpp
@@ -0,0 +1,18 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+struct S {
+  int hello[12];
+};
+
+extern __constant int arr_0[];
+extern __constant float arr_1[];
+extern __constant S arr_2[];
+// At least one @kern function is requried
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function.cpp
new file mode 100644
index 00000000..3a90e85b
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function.cpp
@@ -0,0 +1,14 @@
+static float add1(const float* a, int i, const float* b, int j) {
+    return a[i] + b[i];
+}
+
+float add2(const float* a, int i, const float* b, int j) {
+    return a[i] + b[i];
+}
+
+// At least one @kern function is requried
+@kernel void kern () {
+    @outer for (int i = 0; i < 32; ++i) {
+        @inner for (int j = 0; j < 32; ++j) {}
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function_ref.cpp b/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function_ref.cpp
new file mode 100644
index 00000000..e45fd425
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/implicit/non_kernel_function_ref.cpp
@@ -0,0 +1,21 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+static float add1(const float *a, int i, const float *b, int j);
+
+static float add1(const float *a, int i, const float *b, int j) {
+  return a[i] + b[i];
+}
+
+float add2(const float *a, int i, const float *b, int j);
+
+float add2(const float *a, int i, const float *b, int j) { return a[i] + b[i]; }
+
+// At least one @kern function is requried
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void _occa_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max.cpp b/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max.cpp
new file mode 100644
index 00000000..a2ddca2b
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max.cpp
@@ -0,0 +1,83 @@
+
+@kernel void test0(const int entries, const float* a, const float* b, float* ab) {
+    int before0 = 0;
+    @outer for (int x = 0; x < 10; ++x; @max_inner_dims(2, 3, 5)) {
+        // int before1 = 1 + before0;
+        int before1 = 1;
+        @outer for (int y = 0; y < 20; ++y) {
+            int before2 = 1 + before1;
+            @outer for (int z = 0; z < 30; ++z) {
+                int before3 = 1 + before2;
+                @inner for (int n = 0; n < 2; ++n) {
+                    int after0 = 1 + before3;
+                    @inner for (int m = 0; m < 3; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                    @inner for (int m = 0; m < 5; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    int before00 = 1 + before0;
+    @max_inner_dims(3, 5, 7) for (int x = 0; x < 10; ++x; @outer) {
+        // int before1 = 1 + before00;
+        int before1 = 1;
+        @outer for (int y = 0; y < 20; ++y) {
+            int before2 = 1 + before1;
+            @outer for (int z = 0; z < 30; ++z) {
+                int before3 = 1 + before2;
+                @inner for (int n = 0; n < 2; ++n) {
+                    int after0 = 1 + before3;
+                    @inner for (int m = 0; m < 3; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                    @inner for (int m = 0; m < 5; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max_ref.cpp b/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max_ref.cpp
new file mode 100644
index 00000000..5666d60f
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/max_inner_loops/outer_inner_split_max_ref.cpp
@@ -0,0 +1,117 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(2, 3, 5))) void
+_occa_test0_0(const int entries, __global const float *a,
+              __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(2, 3, 5))) void
+_occa_test0_0(const int entries, __global const float *a,
+              __global const float *b, __global float *ab) {
+  {
+    int x = (0) + get_group_id(2);
+    // int before1 = 1 + before0;
+    int before1 = 1;
+    {
+      int y = (0) + get_group_id(1);
+      int before2 = 1 + before1;
+      {
+        int z = (0) + get_group_id(0);
+        int before3 = 1 + before2;
+        {
+          int n = (0) + get_local_id(2);
+          int after0 = 1 + before3;
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(3, 5, 7))) void
+_occa_test0_1(const int entries, __global const float *a,
+              __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(3, 5, 7))) void
+_occa_test0_1(const int entries, __global const float *a,
+              __global const float *b, __global float *ab) {
+  {
+    int x = (0) + get_group_id(2);
+    // int before1 = 1 + before00;
+    int before1 = 1;
+    {
+      int y = (0) + get_group_id(1);
+      int before2 = 1 + before1;
+      {
+        int z = (0) + get_group_id(0);
+        int before3 = 1 + before2;
+        {
+          int n = (0) + get_local_id(2);
+          int after0 = 1 + before3;
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin.cpp b/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin.cpp
new file mode 100644
index 00000000..ba0310ce
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin.cpp
@@ -0,0 +1,32 @@
+@kernel void hello_kern() {
+    for (int i = 0; i < 10; ++i; @outer) {
+        @shared int shm[10];
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        @nobarrier for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+
+        for (int j = 0; j < 10; ++j; @inner) {
+            shm[j] = j;
+        }
+    }
+}
+
+@kernel void priority_issue() {
+    @outer for (int i = 0; i < 32; ++i) {
+        @shared float shm[32];
+        @nobarrier for (int j = 0; j < 32; ++j; @inner) {
+                shm[i] = i;
+        }
+        @inner for (int j = 0; j < 32; ++j) {
+                @atomic shm[i * j] += 32;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin_ref.cpp b/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin_ref.cpp
new file mode 100644
index 00000000..4c96c272
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/nobarrier/nobarrier_builtin_ref.cpp
@@ -0,0 +1,49 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_hello_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_hello_kern_0() {
+  {
+    int i = (0) + get_group_id(0);
+    __local int shm[10];
+    {
+      int j = (0) + get_local_id(0);
+      shm[j] = j;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    {
+      int j = (0) + get_local_id(0);
+      shm[j] = j;
+    }
+    {
+      int j = (0) + get_local_id(0);
+      shm[j] = j;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    {
+      int j = (0) + get_local_id(0);
+      shm[j] = j;
+    }
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_priority_issue_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_priority_issue_0() {
+  {
+    int i = (0) + get_group_id(0);
+    __local float shm[32];
+    {
+      int j = (0) + get_local_id(0);
+      shm[i] = i;
+    }
+    {
+      int j = (0) + get_local_id(0);
+      shm[i * j] += 32;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec.cpp
new file mode 100644
index 00000000..d2254673
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec.cpp
@@ -0,0 +1,83 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = entries - 1; j >= 0; j -= 1) {
+        @inner for (int i = entries - 1; i >= 0; i -= 1) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner non 1 increment
+@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = entries - 1; j >= 0; j -= 2) {
+        @inner for (int i = entries - 1; i >= 0; i -= 2) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner unary post add
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = entries - 1; j >= 0; j--) {
+        @inner for (int i = entries; i > 0; i--) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner unary pre add
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = entries - 1; j >= 0; --j) {
+        @inner for (int i = entries - 1; i >= 0; --i) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner
+// TODO: change after sema calculates dimensions
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int i = entries - 1; i >= 0; --i) {
+        @outer for (int j = entries - 1; j >= 0; --j) {
+            @inner for (int k = entries - 1; k >= 0; --k) {
+                @inner for (int ii = entries - 1; ii >= 0; --ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner + manual dimensions specification
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    @outer(1) for (int i = entries - 1; i >= 0; --i) {
+        @outer(0) for (int j = entries - 1; j >= 0; --j) {
+            @inner(1) for (int k = entries - 1; k >= 0; --k) {
+                @inner(0) for (int ii = entries - 1; ii >= 0; --ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner + partially manual dimensions specification
+// TODO: change after sema calculates dimensions
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int i = entries - 1; i >= 0; --i) {
+        @outer(0) for (int j = entries - 1; j >= 0; --j) {
+            @inner for (int k = entries - 1; k >= 0; --k) {
+                @inner(0) for (int ii = entries - 1; ii >= 0; --ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec_ref.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec_ref.cpp
new file mode 100644
index 00000000..d9e5c965
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_dec_ref.cpp
@@ -0,0 +1,132 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner
+__kernel void _occa_addVectors0_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors0_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (entries - 1) - ((1) * get_group_id(0));
+    {
+      int i = (entries - 1) - ((1) * get_local_id(0));
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner non 1 increment
+__kernel void _occa_addVectors1_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors1_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (entries - 1) - ((2) * get_group_id(0));
+    {
+      int i = (entries - 1) - ((2) * get_local_id(0));
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner unary post add
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (entries - 1) - get_group_id(0);
+    {
+      int i = (entries)-get_local_id(0);
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner unary pre add
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (entries - 1) - get_group_id(0);
+    {
+      int i = (entries - 1) - get_local_id(0);
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner
+// TODO: change after sema calculates dimensions
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (entries - 1) - get_group_id(1);
+    {
+      int j = (entries - 1) - get_group_id(0);
+      {
+        int k = (entries - 1) - get_local_id(1);
+        {
+          int ii = (entries - 1) - get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner + manual dimensions specification
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (entries - 1) - get_group_id(1);
+    {
+      int j = (entries - 1) - get_group_id(0);
+      {
+        int k = (entries - 1) - get_local_id(1);
+        {
+          int ii = (entries - 1) - get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner + partially manual dimensions specification
+// TODO: change after sema calculates dimensions
+__kernel void _occa_addVectors6_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors6_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (entries - 1) - get_group_id(1);
+    {
+      int j = (entries - 1) - get_group_id(0);
+      {
+        int k = (entries - 1) - get_local_id(1);
+        {
+          int ii = (entries - 1) - get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc.cpp
new file mode 100644
index 00000000..a11c3d7b
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc.cpp
@@ -0,0 +1,82 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = 0; j < entries; j += 1) {
+        @inner for (int i = 0; i < entries; i += 1) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner non 1 increment
+@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = 0; j < entries; j += 2) {
+        @inner for (int i = 0; i < entries; i += 2) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner unary post add
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = 0; j < entries; j++) {
+        @inner for (int i = 0; i <= entries - 1; i++) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> inner unary pre add
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int j = 0; j < entries; ++j) {
+        @inner for (int i = 0; i < entries; ++i) {
+            ab[i] = add(a[i], b[i]);
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner
+// TODO: change after sema calculates dimensions
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int i = 0; i < entries; ++i) {
+        @outer for (int j = 0; j < entries; ++j) {
+            @inner for (int k = 0; k < entries; ++k) {
+                @inner for (int ii = 0; ii < entries; ++ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner + manual dimensions specification
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    @outer(1) for (int i = 0; i < entries; ++i) {
+        @outer(0) for (int j = 0; j < entries; ++j) {
+            @inner(1) for (int k = 0; k < entries; ++k) {
+                @inner(0) for (int ii = 0; ii < entries; ++ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
+
+// Outer -> outer -> inner -> inner + partially manual dimensions specification
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    @outer for (int i = 0; i < entries; ++i) {
+        @outer(0) for (int j = 0; j < entries; ++j) {
+            @inner for (int k = 0; k < entries; ++k) {
+                @inner(0) for (int ii = 0; ii < entries; ++ii) {
+                    ab[ii + k] = add(a[i], b[j]);
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc_ref.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc_ref.cpp
new file mode 100644
index 00000000..3cd8766d
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_inc_ref.cpp
@@ -0,0 +1,131 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner
+__kernel void _occa_addVectors0_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors0_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (0) + ((1) * get_group_id(0));
+    {
+      int i = (0) + ((1) * get_local_id(0));
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner non 1 increment
+__kernel void _occa_addVectors1_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors1_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (0) + ((2) * get_group_id(0));
+    {
+      int i = (0) + ((2) * get_local_id(0));
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner unary post add
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (0) + get_group_id(0);
+    {
+      int i = (0) + get_local_id(0);
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> inner unary pre add
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int j = (0) + get_group_id(0);
+    {
+      int i = (0) + get_local_id(0);
+      ab[i] = add(a[i], b[i]);
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner
+// TODO: change after sema calculates dimensions
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (0) + get_group_id(1);
+    {
+      int j = (0) + get_group_id(0);
+      {
+        int k = (0) + get_local_id(1);
+        {
+          int ii = (0) + get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner + manual dimensions specification
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (0) + get_group_id(1);
+    {
+      int j = (0) + get_group_id(0);
+      {
+        int k = (0) + get_local_id(1);
+        {
+          int ii = (0) + get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer -> inner -> inner + partially manual dimensions specification
+__kernel void _occa_addVectors6_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors6_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int i = (0) + get_group_id(1);
+    {
+      int j = (0) + get_group_id(0);
+      {
+        int k = (0) + get_local_id(1);
+        {
+          int ii = (0) + get_local_id(0);
+          ab[ii + k] = add(a[i], b[j]);
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple.cpp
new file mode 100644
index 00000000..c53693e2
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple.cpp
@@ -0,0 +1,108 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// with shared memory usage (should be automatic sync)
+@kernel void addVectors(const int entries, float* a, float* b, float* ab, float* mat @dim(10, 10)) {
+    for (int i = 0; i < entries; i += 1; @outer) {
+        for (int i2 = 0; i2 < entries; i2 += 1; @outer) {
+            @shared int shm[32];
+            @shared int shm2[32];
+            for (int j = 0; j < entries; j += 1; @inner) {
+                shm[j] = 0;  // shared memory usage -> should be barrier after @inner loop
+                mat(0, 0) = 12;
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+                    ab[i] = add(a[i], b[k]);
+                }
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+
+                    ab[i] = add(a[i], b[k]);
+                }
+            }
+
+            for (int j = 0; j < entries; j += 1; @tile(4, @inner, @inner, check = false)) {
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    // shared memory usage -> should be barrier, since @tile is inner, inner
+                    shm[j] = 0;
+                }
+            }
+
+            for (int j = 0; j < entries; j += 1; @inner) {
+                shm[j] = 0;
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+
+                    ab[i] = add(a[i], b[k]);
+                }
+
+                for (int k = 0; k < entries; k += 1; @tile(4, @inner, @inner, check = false)) {
+                    ab[i] = add(a[i], b[k]);
+                }
+            }
+        }
+    }
+}
+
+// without shared memory usage (should be no automatic sync)
+@kernel void addVectors1(const int entries,
+                         float* a,
+                         float* b,
+                         float* ab,
+                         float* mat @dim(10, 10)) {
+    for (int i = 0; i < entries; i += 1; @outer) {
+        for (int i2 = 0; i2 < entries; i2 += 1; @outer) {
+            @shared int shm[32];
+            @shared int shm2[32];
+            for (int j = 0; j < entries; j += 1; @inner) {
+                // shm[j] = 0;  // shared memory usage -> should be barrier after @inner loop
+                mat(0, 0) = 12;
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+                    ab[i] = add(a[i], b[k]);
+                }
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+
+                    ab[i] = add(a[i], b[k]);
+                }
+            }
+
+            for (int j = 0; j < entries; j += 1; @tile(4, @inner, @inner, check = false)) {
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    // shared memory usage -> should be barrier, since @tile is inner, inner
+                    // shm[j] = 0;
+                }
+            }
+
+            for (int j = 0; j < entries; j += 1; @inner) {
+                shm[j] = 0;
+                for (int k = 0; k < entries; k += 1; @inner) {
+                    for (int ii = 0; ii < entries; ii += 1; @inner) {
+                        ab[i] = add(a[i], b[k]);
+                    }
+
+                    ab[i] = add(a[i], b[k]);
+                }
+
+                for (int k = 0; k < entries; k += 1; @tile(4, @inner, @inner, check = false)) {
+                    ab[i] = add(a[i], b[k]);
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple_ref.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple_ref.cpp
new file mode 100644
index 00000000..d0fdd330
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_multiple_ref.cpp
@@ -0,0 +1,154 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// with shared memory usage (should be automatic sync)
+__kernel void _occa_addVectors_0(const int entries, __global float *a,
+                                 __global float *b, __global float *ab,
+                                 __global float *mat);
+
+__kernel void _occa_addVectors_0(const int entries, __global float *a,
+                                 __global float *b, __global float *ab,
+                                 __global float *mat) {
+  {
+    int i = (0) + ((1) * get_group_id(1));
+    {
+      int i2 = (0) + ((1) * get_group_id(0));
+      __local int shm[32];
+      __local int shm2[32];
+      {
+        int j = (0) + ((1) * get_local_id(2));
+        shm[j] =
+            0; // shared memory usage -> should be barrier after @inner loop
+        mat[0 + (10 * (0))] = 12;
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      {
+        int _occa_tiled_j = (0) + (((4) * 1) * get_local_id(2));
+        {
+          int j = _occa_tiled_j + ((1) * get_local_id(1));
+          {
+            {
+              int k = (0) + ((1) * get_local_id(0));
+              // shared memory usage -> should be barrier, since @tile is inner,
+              // inner
+              shm[j] = 0;
+            }
+          }
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      {
+        int j = (0) + ((1) * get_local_id(2));
+        shm[j] = 0;
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+        {
+          int _occa_tiled_k = (0) + (((4) * 1) * get_local_id(1));
+          {
+            int k = _occa_tiled_k + ((1) * get_local_id(0));
+            { ab[i] = add(a[i], b[k]); }
+          }
+        }
+      }
+    }
+  }
+}
+
+// without shared memory usage (should be no automatic sync)
+__kernel void _occa_addVectors1_0(const int entries, __global float *a,
+                                  __global float *b, __global float *ab,
+                                  __global float *mat);
+
+__kernel void _occa_addVectors1_0(const int entries, __global float *a,
+                                  __global float *b, __global float *ab,
+                                  __global float *mat) {
+  {
+    int i = (0) + ((1) * get_group_id(1));
+    {
+      int i2 = (0) + ((1) * get_group_id(0));
+      __local int shm[32];
+      __local int shm2[32];
+      {
+        int j = (0) + ((1) * get_local_id(2));
+        // shm[j] = 0; // shared memory usage -> should be barrier after @inner
+        // loop
+        mat[0 + (10 * (0))] = 12;
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+      }
+      {
+        int _occa_tiled_j = (0) + (((4) * 1) * get_local_id(2));
+        {
+          int j = _occa_tiled_j + ((1) * get_local_id(1));
+          {
+            {
+              int k = (0) + ((1) * get_local_id(0));
+              // shared memory usage -> should be barrier, since @tile is inner,
+              // inner shm[j] = 0;
+            }
+          }
+        }
+      }
+      {
+        int j = (0) + ((1) * get_local_id(2));
+        shm[j] = 0;
+        {
+          int k = (0) + ((1) * get_local_id(1));
+          {
+            int ii = (0) + ((1) * get_local_id(0));
+            ab[i] = add(a[i], b[k]);
+          }
+          ab[i] = add(a[i], b[k]);
+        }
+        {
+          int _occa_tiled_k = (0) + (((4) * 1) * get_local_id(1));
+          {
+            int k = _occa_tiled_k + ((1) * get_local_id(0));
+            { ab[i] = add(a[i], b[k]); }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp
new file mode 100644
index 00000000..4877b67f
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp
@@ -0,0 +1,32 @@
+@kernel void test_kernel() {
+    @outer for (int i = 0; i < 10; ++i) {
+        @outer for (int i2 = 0; i2 < 10; ++i2) {
+            @inner for (int j = 0; j < 10; ++j) {
+            }
+
+            for (int ii = 0; ii < 10; ++ii) {
+                @inner for (int j = 0; j < 10; ++j) {
+                }
+                for (int j = 0; j < 10; ++j) {
+                }
+            }
+        }
+
+        for (int ii = 0; ii < 10; ++ii) {
+            @outer for (int i = 0; i < 10; ++i) {
+                @inner for (int j = 0; j < 10; ++j) {
+                }
+            }
+        }
+    }
+    for (int ii = 0; ii < 10; ++ii) {
+        @outer for (int i = 0; i < 10; ++i) {
+            for (int i2 = 0; i2 < 10; ++i2) {
+                @outer for (int i2 = 0; i2 < 10; ++i2) {
+                    @inner for (int j = 0; j < 10; ++j) {
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp
new file mode 100644
index 00000000..72aab01a
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp
@@ -0,0 +1,44 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kernel_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kernel_0() {
+  {
+    int i = (0) + get_group_id(1);
+    {
+      int i2 = (0) + get_group_id(0);
+      { int j = (0) + get_local_id(0); }
+      for (int ii = 0; ii < 10; ++ii) {
+        {
+          int j = (0) + get_local_id(0);
+        }
+        for (int j = 0; j < 10; ++j) {
+        }
+      }
+    }
+    for (int ii = 0; ii < 10; ++ii) {
+      {
+        int i = (0) + get_group_id(0);
+        { int j = (0) + get_local_id(0); }
+      }
+    }
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kernel_1();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_test_kernel_1() {
+  {
+    int i = (0) + get_group_id(1);
+    for (int i2 = 0; i2 < 10; ++i2) {
+      {
+        int i2 = (0) + get_group_id(0);
+        { int j = (0) + get_local_id(0); }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split.cpp
new file mode 100644
index 00000000..ada5061a
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split.cpp
@@ -0,0 +1,83 @@
+
+@kernel void test0(const int entries, const float* a, const float* b, float* ab) {
+    int before0 = 0;
+    @outer for (int x = 0; x < 10; ++x) {
+        // int before1 = 1 + before0;
+        int before1 = 1;
+        @outer for (int y = 0; y < 20; ++y) {
+            int before2 = 1 + before1;
+            @outer for (int z = 0; z < 30; ++z) {
+                int before3 = 1 + before2;
+                @inner for (int n = 0; n < 2; ++n) {
+                    int after0 = 1 + before3;
+                    @inner for (int m = 0; m < 3; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                    @inner for (int m = 0; m < 5; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    int before00 = 1 + before0;
+    @outer for (int x = 0; x < 10; ++x) {
+        // int before1 = 1 + before00;
+        int before1 = 1;
+        @outer for (int y = 0; y < 20; ++y) {
+            int before2 = 1 + before1;
+            @outer for (int z = 0; z < 30; ++z) {
+                int before3 = 1 + before2;
+                @inner for (int n = 0; n < 2; ++n) {
+                    int after0 = 1 + before3;
+                    @inner for (int m = 0; m < 3; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 5; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                    @inner for (int m = 0; m < 5; ++m) {
+                        int after1 = 1 + after0;
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                        @inner for (int k = 0; k < 7; ++k) {
+                            int after2 = 1 + after1;
+                            ab[x] =
+                                a[x] + b[x] + static_cast<float>(k + m + n + z + y + x + after2);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split_ref.cpp b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split_ref.cpp
new file mode 100644
index 00000000..56ae07b8
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/outer_inner/outer_inner_split_ref.cpp
@@ -0,0 +1,117 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(7, 5, 2))) void
+_occa_test0_0(const int entries, __global const float *a,
+              __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(7, 5, 2))) void
+_occa_test0_0(const int entries, __global const float *a,
+              __global const float *b, __global float *ab) {
+  {
+    int x = (0) + get_group_id(2);
+    // int before1 = 1 + before0;
+    int before1 = 1;
+    {
+      int y = (0) + get_group_id(1);
+      int before2 = 1 + before1;
+      {
+        int z = (0) + get_group_id(0);
+        int before3 = 1 + before2;
+        {
+          int n = (0) + get_local_id(2);
+          int after0 = 1 + before3;
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(7, 5, 2))) void
+_occa_test0_1(const int entries, __global const float *a,
+              __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(7, 5, 2))) void
+_occa_test0_1(const int entries, __global const float *a,
+              __global const float *b, __global float *ab) {
+  {
+    int x = (0) + get_group_id(2);
+    // int before1 = 1 + before00;
+    int before1 = 1;
+    {
+      int y = (0) + get_group_id(1);
+      int before2 = 1 + before1;
+      {
+        int z = (0) + get_group_id(0);
+        int before3 = 1 + before2;
+        {
+          int n = (0) + get_local_id(2);
+          int after0 = 1 + before3;
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+          {
+            int m = (0) + get_local_id(1);
+            int after1 = 1 + after0;
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+            {
+              int k = (0) + get_local_id(0);
+              int after2 = 1 + after1;
+              ab[x] = a[x] + b[x] +
+                      static_cast<float>(k + m + n + z + y + x + after2);
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types.cpp
new file mode 100644
index 00000000..bcd8f046
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types.cpp
@@ -0,0 +1,11 @@
+
+
+@kernel void function1(const int* i32Data @restrict,
+                       float* fp32Data @restrict,
+                       const double* fp64Data @restrict) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+            @restrict float* b = &fp32Data[0];
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types_ref.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types_ref.cpp
new file mode 100644
index 00000000..201f69e4
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_builtin_types_ref.cpp
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const int *restrict i32Data,
+                  __global float *restrict fp32Data,
+                  __global const double *restrict fp64Data);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const int *restrict i32Data,
+                  __global float *restrict fp32Data,
+                  __global const double *restrict fp64Data) {
+  {
+    int i = (0) + get_group_id(0);
+    {
+      int j = (0) + get_local_id(0);
+      float *b = &fp32Data[0];
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types.cpp
new file mode 100644
index 00000000..63ade8c4
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types.cpp
@@ -0,0 +1,28 @@
+
+template <class T>
+struct Complex {
+    T real;
+    T imaginary;
+};
+
+struct Configs {
+    unsigned int size1;
+    unsigned long size2;
+};
+
+struct Data {
+    @restrict float* x;
+    @restrict float* y;
+    unsigned long size;
+};
+
+
+@kernel void function1(const Complex<float>* vectorData @restrict,
+                       unsigned int vectorSize,
+                       const Complex<float>** matricesData @restrict,
+                       const Configs* matricesSizes @restrict) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types_ref.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types_ref.cpp
new file mode 100644
index 00000000..2000ec62
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_complex_types_ref.cpp
@@ -0,0 +1,34 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+template <class T> struct Complex {
+  T real;
+  T imaginary;
+};
+
+struct Configs {
+  unsigned int size1;
+  unsigned long size2;
+};
+
+struct Data {
+  float *x;
+  float *y;
+  unsigned long size;
+};
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const Complex<float> *restrict vectorData,
+                  unsigned int vectorSize,
+                  __global const Complex<float> **restrict matricesData,
+                  __global const Configs *restrict matricesSizes);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const Complex<float> *restrict vectorData,
+                  unsigned int vectorSize,
+                  __global const Complex<float> **restrict matricesData,
+                  __global const Configs *restrict matricesSizes) {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types.cpp
new file mode 100644
index 00000000..74688888
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types.cpp
@@ -0,0 +1,40 @@
+
+namespace A {
+template <class T>
+struct Complex {
+    T real;
+    T imaginary;
+};
+
+namespace B {
+struct Configs {
+    unsigned int size1;
+    unsigned long size2;
+};
+namespace C {
+typedef int SIZE_TYPE;
+typedef SIZE_TYPE SIZES;
+}  // namespace C
+}  // namespace B
+}  // namespace A
+
+
+@kernel void function1(const A::Complex<float>* vectorData @restrict,
+                       unsigned int vectorSize,
+                       const A::Complex<float>** matricesData @restrict,
+                       const A::B::Configs* matricesSizes @restrict) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+        }
+    }
+}
+
+
+@kernel void function2(const A::Complex<float>* vectorData @restrict,
+                       const A::B::Configs* configs @restrict,
+                       A::B::C::SIZES* vectorSize @restrict) {
+    @outer for (int i = 0; i < 1; ++i) {
+        @inner for (int j = 0; j < 1; ++j) {
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types_ref.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types_ref.cpp
new file mode 100644
index 00000000..4843c621
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_namespaced_types_ref.cpp
@@ -0,0 +1,52 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+namespace A {
+template <class T> struct Complex {
+  T real;
+  T imaginary;
+};
+
+namespace B {
+struct Configs {
+  unsigned int size1;
+  unsigned long size2;
+};
+
+namespace C {
+typedef int SIZE_TYPE;
+typedef SIZE_TYPE SIZES;
+} // namespace C
+} // namespace B
+} // namespace A
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const A::Complex<float> *restrict vectorData,
+                  unsigned int vectorSize,
+                  __global const A::Complex<float> **restrict matricesData,
+                  __global const A::B::Configs *restrict matricesSizes);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function1_0(__global const A::Complex<float> *restrict vectorData,
+                  unsigned int vectorSize,
+                  __global const A::Complex<float> **restrict matricesData,
+                  __global const A::B::Configs *restrict matricesSizes) {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function2_0(__global const A::Complex<float> *restrict vectorData,
+                  __global const A::B::Configs *restrict configs,
+                  __global A::B::C::SIZES *restrict vectorSize);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
+_occa_function2_0(__global const A::Complex<float> *restrict vectorData,
+                  __global const A::B::Configs *restrict configs,
+                  __global A::B::C::SIZES *restrict vectorSize) {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type.cpp
new file mode 100644
index 00000000..752c2379
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type.cpp
@@ -0,0 +1,15 @@
+float* @restrict myfn(float* a) {
+    return a + 1;
+}
+
+float* myfn2(float* a) {
+    return a + 1;
+}
+
+@kernel void hello() {
+    for (int i = 0; i < 10; i++; @outer) {
+        for (int j = 0; j < 10; j++; @inner) {
+
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type_ref.cpp b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type_ref.cpp
new file mode 100644
index 00000000..bc0cb309
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/restrict/restrict_return_type_ref.cpp
@@ -0,0 +1,18 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+float *myfn(float *a);
+
+float *myfn(float *a) { return a + 1; }
+
+float *myfn2(float *a);
+
+float *myfn2(float *a) { return a + 1; }
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void _occa_hello_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void _occa_hello_0() {
+  {
+    int i = (0) + get_group_id(0);
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles.cpp
new file mode 100644
index 00000000..e0b67760
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles.cpp
@@ -0,0 +1,8 @@
+@kernel void test_kern() {
+    @tile(4, @outer) for (int i = 0; i < 10; ++i) {
+        @shared int shm[10];
+        @tile(4, @inner, @inner) for (int j = 0; j < 10; ++j) {
+            shm[j] = j;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles_ref.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles_ref.cpp
new file mode 100644
index 00000000..3b6d014e
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_between_tiles_ref.cpp
@@ -0,0 +1,26 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(4, 3, 1))) void
+_occa_test_kern_0();
+
+__kernel __attribute__((reqd_work_group_size(4, 3, 1))) void
+_occa_test_kern_0() {
+  {
+    int _occa_tiled_i = (0) + ((4) * get_group_id(0));
+    for (int i = _occa_tiled_i; i < (_occa_tiled_i + (4)); ++i) {
+      if (i < 10) {
+        __local int shm[10];
+        {
+          int _occa_tiled_j = (0) + ((4) * get_local_id(1));
+          {
+            int j = _occa_tiled_j + get_local_id(0);
+            if (j < 10) {
+              shm[j] = j;
+            }
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types.cpp
new file mode 100644
index 00000000..af3d5c90
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types.cpp
@@ -0,0 +1,39 @@
+@kernel void function1(const int* data) {
+    @outer for (int i = 0; i < 64; ++i) {
+        @shared int arr1[32];
+        @shared float arr2[8][32];
+        @shared double arr3[4 + 4];
+        @inner for (int j = 0; j < 64; ++j) {
+        }
+    }
+}
+
+// syncronization between @inner loops:
+@kernel void function2() {
+    for (int i = 0; i < 10; i++; @outer) {
+        @shared int shm[10];
+
+        for (int j = 0; j < 10; j++; @inner) {
+            shm[i] = j;
+        }
+        // sync should be here
+        for (int j = 0; j < 10; j++; @inner) {
+            shm[i] = j;
+        }
+        // sync should not be here
+    }
+}
+
+// Even if loop is last, if it is inside regular loop, syncronization is inserted
+@kernel void function3() {
+    for (int i = 0; i < 10; i++; @outer) {
+        @shared int shm[10];
+
+        for (int q = 0; q < 5; ++q) {
+            for (int j = 0; j < 10; j++; @inner) {
+                shm[i] = j;
+            }
+            // sync should be here
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types_ref.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types_ref.cpp
new file mode 100644
index 00000000..acdda5d5
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_builtin_types_ref.cpp
@@ -0,0 +1,59 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data);
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data) {
+  {
+    int i = (0) + get_group_id(0);
+    __local int arr1[32];
+    __local float arr2[8][32];
+    __local double arr3[4 + 4];
+    { int j = (0) + get_local_id(0); }
+  }
+}
+
+// syncronization between @inner loops:
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_function2_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_function2_0() {
+  {
+    int i = (0) + get_group_id(0);
+    __local int shm[10];
+    {
+      int j = (0) + get_local_id(0);
+      shm[i] = j;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // sync should be here
+    {
+      int j = (0) + get_local_id(0);
+      shm[i] = j;
+    }
+    // sync should not be here
+  }
+}
+
+// Even if loop is last, if it is inside regular loop, syncronization is
+// inserted
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_function3_0();
+
+__kernel __attribute__((reqd_work_group_size(10, 1, 1))) void
+_occa_function3_0() {
+  {
+    int i = (0) + get_group_id(0);
+    __local int shm[10];
+    for (int q = 0; q < 5; ++q) {
+      {
+        int j = (0) + get_local_id(0);
+        shm[i] = j;
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+      // sync should be here
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl.cpp
new file mode 100644
index 00000000..07d9e718
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl.cpp
@@ -0,0 +1,10 @@
+typedef float sh_float32_t @shared;
+
+@kernel void test_kernel() {
+    @outer for (int i = 0; i < 32; ++i) {
+        sh_float32_t b[32];
+        @inner for (int j = 0; j < 32; ++j) {
+            b[j] = i+j;
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl_ref.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl_ref.cpp
new file mode 100644
index 00000000..a2c4eb4f
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_in_typedecl_ref.cpp
@@ -0,0 +1,17 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+typedef float sh_float32_t;
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_test_kernel_0();
+
+__kernel __attribute__((reqd_work_group_size(32, 1, 1))) void
+_occa_test_kernel_0() {
+  {
+    int i = (0) + get_group_id(0);
+    __local sh_float32_t b[32];
+    {
+      int j = (0) + get_local_id(0);
+      b[j] = i + j;
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types.cpp
new file mode 100644
index 00000000..affb4fed
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types.cpp
@@ -0,0 +1,14 @@
+
+struct ComplexValueFloat {
+    float real;
+    float imaginary;
+};
+
+@kernel void function1(const int* data) {
+    @outer for (int i = 0; i < 64; ++i) {
+        @shared ComplexValueFloat arr2[8][32];
+        @shared ComplexValueFloat arr1[32];
+        @inner for (int j = 0; j < 64; ++j) {
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types_ref.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types_ref.cpp
new file mode 100644
index 00000000..604d31af
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_struct_types_ref.cpp
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+struct ComplexValueFloat {
+  float real;
+  float imaginary;
+};
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data);
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data) {
+  {
+    int i = (0) + get_group_id(0);
+    __local ComplexValueFloat arr2[8][32];
+    __local ComplexValueFloat arr1[32];
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type.cpp
new file mode 100644
index 00000000..61cc672c
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type.cpp
@@ -0,0 +1,15 @@
+
+template <class T>
+struct ComplexType {
+    T real;
+    T imaginary;
+};
+
+@kernel void function1(const int* data) {
+    @outer for (int i = 0; i < 64; ++i) {
+        @shared ComplexType<int> arr1[32];
+        @shared ComplexType<float> arr2[8][32];
+        @inner for (int j = 0; j < 64; ++j) {
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type_ref.cpp b/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type_ref.cpp
new file mode 100644
index 00000000..9d261bcd
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/shared/shared_template_type_ref.cpp
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+template <class T> struct ComplexType {
+  T real;
+  T imaginary;
+};
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data);
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void
+_occa_function1_0(__global const int *data) {
+  {
+    int i = (0) + get_group_id(0);
+    __local ComplexType<int> arr1[32];
+    __local ComplexType<float> arr2[8][32];
+    { int j = (0) + get_local_id(0); }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec.cpp
new file mode 100644
index 00000000..4271f2d6
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec.cpp
@@ -0,0 +1,78 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner non 1 increment
+@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 2; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner unary post add
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i--; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner unary pre add
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; --i; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, check=True
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer, @inner, check = true)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, complex range
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = (entries + 16); i >= (entries - 12 + 4); i -= (entries / 16 + 1);
+         @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, set dimension
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(1), @inner(2))) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner ==> inner -> inner (nested)
+@kernel void addVectors7(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = entries - 1; j >= 0; j -= 1; @tile(4, @inner(1), @inner(2))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> inner (nested) + complex range + check true
+@kernel void addVectors8(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = (entries + 16); i >= (entries - 12 + static_cast<int>(*a));
+         i -= (entries / 16 + 1);
+         @tile(4, @outer(0), @inner(0), check = true)) {
+        for (unsigned long long j = (entries + 16); j >= (entries - 12 + static_cast<int>(*a));
+             j -= (entries / 16 + 1);
+             @tile(4, @inner(1), @inner(2), check = true)) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec_ref.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec_ref.cpp
new file mode 100644
index 00000000..71aa856d
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_dec_ref.cpp
@@ -0,0 +1,194 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner non 1 increment
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors1_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors1_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 2) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((2) * get_local_id(0));
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner unary post add
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors2_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors2_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - ((4) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - get_local_id(0);
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner unary pre add
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors3_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors3_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - ((4) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - get_local_id(0);
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, check=True
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors4_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors4_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, complex range
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors5_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors5_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i =
+        ((entries + 16)) - (((4) * (entries / 16 + 1)) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - (((entries / 16 + 1)) * get_local_id(0));
+      if (i >= (entries - 12 + 4)) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, set dimension
+__kernel __attribute__((reqd_work_group_size(1, 1, 4))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 4))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(1));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(2));
+      if (i >= 0) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner (nested)
+__kernel void _occa_addVectors7_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors7_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries - 1) - (((4) * 1) * get_local_id(1));
+          {
+            int j = _occa_tiled_j - ((1) * get_local_id(2));
+            if (j >= 0) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner (nested) + complex range + check true
+__kernel void _occa_addVectors8_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors8_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i =
+        ((entries + 16)) - (((4) * (entries / 16 + 1)) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - (((entries / 16 + 1)) * get_local_id(0));
+      if (i >= (entries - 12 + static_cast<int>(*a))) {
+        {
+          unsigned long long _occa_tiled_j =
+              ((entries + 16)) - (((4) * (entries / 16 + 1)) * get_local_id(1));
+          {
+            unsigned long long j =
+                _occa_tiled_j - (((entries / 16 + 1)) * get_local_id(2));
+            if (j >= (entries - 12 + static_cast<int>(*a))) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc.cpp
new file mode 100644
index 00000000..f9238766
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc.cpp
@@ -0,0 +1,84 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner non 1 increment
+@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 2; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner unary post add
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i++; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner unary pre add
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; ++i; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, check=True
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer, @inner, check = true)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, complex range
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = (entries - 12 + 4); i < (entries + 16); i += (entries / 16 + 1);
+         @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner, set dimension
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(1), @inner(2))) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
+
+// Outer -> inner ==> inner -> inner (nested)
+@kernel void addVectors7(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = 0; j < entries; j += 1; @tile(4, @inner(1), @inner(2))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> inner (nested) + complex range + check true
+@kernel void addVectors8(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = (entries - 12 + static_cast<int>(*a)); i < (entries + 16); i += (entries / 16 + 1);
+         @tile(4, @outer(0), @inner(0), check = true)) {
+        for (unsigned long long j = (entries - 12 + static_cast<int>(*a)); j < (entries + 16);
+             j += (entries / 16 + 1);
+             @tile(4, @inner(1), @inner(2), check = true)) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> inner, <=
+@kernel void addVectors9(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i <= entries; i += 1; @tile(4, @outer, @inner)) {
+        ab[i] = add(a[i], b[i]);
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc_ref.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc_ref.cpp
new file mode 100644
index 00000000..2c0b38d7
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_inc_ref.cpp
@@ -0,0 +1,214 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner non 1 increment
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors1_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors1_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 2) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((2) * get_local_id(0));
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner unary post add
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors2_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors2_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + ((4) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + get_local_id(0);
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner unary pre add
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors3_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors3_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + ((4) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + get_local_id(0);
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, check=True
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors4_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors4_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, complex range
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors5_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors5_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i =
+        ((entries - 12 + 4)) + (((4) * (entries / 16 + 1)) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + (((entries / 16 + 1)) * get_local_id(0));
+      if (i < (entries + 16)) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner, set dimension
+__kernel __attribute__((reqd_work_group_size(1, 1, 4))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 4))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(1));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(2));
+      if (i < entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner (nested)
+__kernel void _occa_addVectors7_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors7_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + (((4) * 1) * get_local_id(1));
+          {
+            int j = _occa_tiled_j + ((1) * get_local_id(2));
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner (nested) + complex range + check true
+__kernel void _occa_addVectors8_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors8_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = ((entries - 12 + static_cast<int>(*a))) +
+                        (((4) * (entries / 16 + 1)) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + (((entries / 16 + 1)) * get_local_id(0));
+      if (i < (entries + 16)) {
+        {
+          unsigned long long _occa_tiled_j =
+              ((entries - 12 + static_cast<int>(*a))) +
+              (((4) * (entries / 16 + 1)) * get_local_id(1));
+          {
+            unsigned long long j =
+                _occa_tiled_j + (((entries / 16 + 1)) * get_local_id(2));
+            if (j < (entries + 16)) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner, <=
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors9_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors9_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i <= entries) {
+        ab[i] = add(a[i], b[i]);
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec.cpp
new file mode 100644
index 00000000..f1a9e8d2
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec.cpp
@@ -0,0 +1,59 @@
+const int offset = 1;
+
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner ==> regular -> regular
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = entries; j > 0; --j; @tile(4)) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> regular
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = entries; j > 0; --j; @tile(4, @inner(1))) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> inner
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = entries; j > 0; --j; @tile(4, @inner(1), @inner(1))) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
+
+// Outer -> outer ==> inner -> regular
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = entries; j > 0; --j; @tile(4, @inner(1))) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
+
+// Outer -> outer ==> inner -> inner
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = entries; j > 0; --j; @tile(4, @inner(1), @inner(2))) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
+
+// Outer -> outer ==> outer -> inner
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = entries; j > 0; --j; @tile(4, @outer(2), @inner(0))) {
+            ab[i] = add(a[i], b[j - 1]);
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec_ref.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec_ref.cpp
new file mode 100644
index 00000000..5f1d42f4
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_dec_ref.cpp
@@ -0,0 +1,157 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner ==> regular -> regular
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        for (int _occa_tiled_j = entries; _occa_tiled_j > 0;
+             _occa_tiled_j -= (4)) {
+          for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) {
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> regular
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries) - ((4) * get_local_id(1));
+          for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) {
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_local_id(0));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries) - ((4) * get_local_id(1));
+          {
+            int j = _occa_tiled_j - get_local_id(1);
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> inner -> regular
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_group_id(1));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries) - ((4) * get_local_id(1));
+          for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) {
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> inner -> inner
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_group_id(1));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries) - ((4) * get_local_id(1));
+          {
+            int j = _occa_tiled_j - get_local_id(2);
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> outer -> inner
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (entries - 1) - (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i - ((1) * get_group_id(1));
+      if (i >= 0) {
+        {
+          int _occa_tiled_j = (entries) - ((4) * get_group_id(2));
+          {
+            int j = _occa_tiled_j - get_local_id(0);
+            if (j > 0) {
+              ab[i] = add(a[i], b[j - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc.cpp
new file mode 100644
index 00000000..02e3be70
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc.cpp
@@ -0,0 +1,60 @@
+const int offset = 1;
+
+// template<typename T>
+float add(float a, float b) {
+    return a + b + offset;
+}
+
+// Outer -> inner ==> regular -> regular
+@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = 0; j < entries; ++j; @tile(4)) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> regular
+@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = 0; j < entries; ++j; @tile(4, @inner(1))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> inner ==> inner -> inner
+@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) {
+        for (int j = 0; j < entries; ++j; @tile(4, @inner(1), @inner(1))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> outer ==> inner -> regular
+@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = 0; j < entries; ++j; @tile(4, @inner(1))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> outer ==> inner -> inner
+@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = 0; j < entries; ++j; @tile(4, @inner(1), @inner(2))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
+
+// Outer -> outer ==> outer -> inner
+@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) {
+    for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) {
+        for (int j = 0; j < entries; ++j; @tile(4, @outer(2), @inner(0))) {
+            ab[i] = add(a[i], b[j]);
+        }
+    }
+}
diff --git a/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc_ref.cpp b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc_ref.cpp
new file mode 100644
index 00000000..fec83d27
--- /dev/null
+++ b/tests/functional/data/transpiler/backends/opencl/tile/outer_inner_regular_inc_ref.cpp
@@ -0,0 +1,158 @@
+#pragma OPENCL EXTENSON cl_khr_fp64 : enable
+
+__constant int offset = 1;
+// template<typename T>
+float add(float a, float b);
+
+float add(float a, float b) { return a + b + offset; }
+
+// Outer -> inner ==> regular -> regular
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors0_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        for (int _occa_tiled_j = 0; _occa_tiled_j < entries;
+             _occa_tiled_j += (4)) {
+          for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) {
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> regular
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors2_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + ((4) * get_local_id(1));
+          for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) {
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> inner ==> inner -> inner
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors3_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_local_id(0));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + ((4) * get_local_id(1));
+          {
+            int j = _occa_tiled_j + get_local_id(1);
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> inner -> regular
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors4_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_group_id(1));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + ((4) * get_local_id(1));
+          for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) {
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> inner -> inner
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab);
+
+__kernel void _occa_addVectors5_0(const int entries, __global const float *a,
+                                  __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_group_id(1));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + ((4) * get_local_id(1));
+          {
+            int j = _occa_tiled_j + get_local_id(2);
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Outer -> outer ==> outer -> inner
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab);
+
+__kernel __attribute__((reqd_work_group_size(4, 1, 1))) void
+_occa_addVectors6_0(const int entries, __global const float *a,
+                    __global const float *b, __global float *ab) {
+  {
+    int _occa_tiled_i = (0) + (((4) * 1) * get_group_id(0));
+    {
+      int i = _occa_tiled_i + ((1) * get_group_id(1));
+      if (i < entries) {
+        {
+          int _occa_tiled_j = (0) + ((4) * get_group_id(2));
+          {
+            int j = _occa_tiled_j + get_local_id(0);
+            if (j < entries) {
+              ab[i] = add(a[i], b[j]);
+            }
+          }
+        }
+      }
+    }
+  }
+}