uxlfoundation · coldav · Mar 25, 2025
@@ -21,14 +21,22 @@
 #define HOST_PASSES_MACHINERY_H_INCLUDED
 
 #include <base/base_module_pass_machinery.h>
-
+#include <vecz/pass.h>
 #include <optional>
 
 namespace llvm {
 class TargetMachine;
 }
 
+namespace vecz {
+  class VeczPassOptions;
+}
 namespace host {
+struct OptimizationOptions {
+  llvm::SmallVector<vecz::VeczPassOptions> vecz_pass_opts;
+  bool force_no_tail = false;
+  bool early_link_builtins = false;
+};
 
 class HostPassMachinery final : public compiler::BaseModulePassMachinery {
  public:
@@ -63,6 +71,10 @@ class HostPassMachinery final : public compiler::BaseModulePassMachinery {
   /// @brief Returns an optimization pass pipeline correponding to
   /// BaseModule::getLateTargetPasses.
   llvm::ModulePassManager getLateTargetPasses();
+
+  static host::OptimizationOptions processOptimizationOptions(
+    std::optional<std::string> env_debug_prefix,
+    std::optional<compiler::VectorizationMode> vecz_mode);
 };
 
 }  // namespace host

@@ -26,6 +26,7 @@
 #include <compiler/utils/attributes.h>
 #include <compiler/utils/compute_local_memory_usage_pass.h>
 #include <compiler/utils/define_mux_builtins_pass.h>
+#include <compiler/utils/link_builtins_pass.h>
 #include <compiler/utils/make_function_name_unique_pass.h>
 #include <compiler/utils/manual_type_legalization_pass.h>
 #include <compiler/utils/metadata.h>
@@ -55,6 +56,99 @@
 
 namespace host {
 
+// Process various compiler options based off compiler build options and common
+// environment variables
+ host::OptimizationOptions
+HostPassMachinery::processOptimizationOptions(
+    std::optional<std::string> env_debug_prefix,
+    std::optional<compiler::VectorizationMode> vecz_mode) {
+  OptimizationOptions env_var_opts;
+  vecz::VeczPassOptions vecz_opts;
+  // The minimum number of elements to vectorize for. For a fixed-length VF,
+  // this is the exact number of elements to vectorize by. For scalable VFs,
+  // the actual number of elements is a multiple (vscale) of these, unknown at
+  // compile time. Default taken from config. May be overriden later.
+  vecz_opts.factor = compiler::utils::VectorizationFactor::getScalar();
+
+  vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions);
+
+  vecz_opts.vecz_auto = vecz_mode == compiler::VectorizationMode::AUTO;
+  vecz_opts.vec_dim_idx = 0;
+
+  // This is of the form of a comma separated set of fields
+  // S     - use scalable vectorization
+  // V     - vectorize only, otherwise produce both scalar and vector kernels
+  // A     - let vecz automatically choose the vectorization factor
+  // 1-64  - vectorization factor multiplier: the fixed amount itself, or the
+  //         value that multiplies the scalable amount
+  // VP    - produce a vector-predicated kernel
+  // VVP   - produce both a vectorized and a vector-predicated kernel
+  bool add_vvp = false;
+  if (const auto *vecz_vf_flags_env = std::getenv("CA_HOST_VF")) {
+    // Set scalable to off and let users add it explicitly with 'S'.
+    vecz_opts.factor.setIsScalable(false);
+    llvm::SmallVector<llvm::StringRef, 4> flags;
+    const llvm::StringRef vf_flags_ref(vecz_vf_flags_env);
+    vf_flags_ref.split(flags, ',');
+    for (auto r : flags) {
+      if (r == "A" || r == "a") {
+        vecz_opts.vecz_auto = true;
+      } else if (r == "V" || r == "v") {
+        // Note: This is a legacy toggle for forcing vectorization with no
+        // scalar tail based on the "VF" environment variable. Ideally we'd be
+        // setting it on a per-function basis, and we'd also be setting the
+        // vectorization options themselves on a per-function basis. Until we've
+        // designed a new method, keep the legacy behaviour by re-parsing the
+        // "VF" environment variable and look for a "v/V" toggle.
+        env_var_opts.force_no_tail = true;
+      } else if (r == "S" || r == "s") {
+        vecz_opts.factor.setIsScalable(true);
+        env_var_opts.early_link_builtins = true;
+      } else if (isdigit(r[0])) {
+        vecz_opts.factor.setKnownMin(std::stoi(r.str()));
+      } else if (r == "VP" || r == "vp") {
+        vecz_opts.choices.enable(
+            vecz::VectorizationChoices::eVectorPredication);
+      } else if (r == "VVP" || r == "vvp") {
+        // Add the vectorized pass option now (controlled by other iterations
+        // of this loop), and flag that we have to add a vector-predicated form
+        // later.
+        add_vvp = true;
+      } else {
+        // An error - just stop processing the environment variable now.
+        break;
+      }
+    }
+  }
+
+  // Choices override the cost model
+  const char *ptr = std::getenv("CODEPLAY_VECZ_CHOICES");
+  if (ptr) {
+    const bool success = vecz_opts.choices.parseChoicesString(ptr);
+    if (!success) {
+      llvm::errs() << "failed to parse the CODEPLAY_VECZ_CHOICES variable\n";
+    }
+  }
+
+  env_var_opts.vecz_pass_opts.push_back(vecz_opts);
+  if (add_vvp) {
+    vecz_opts.choices.enable(vecz::VectorizationChoices::eVectorPredication);
+    env_var_opts.vecz_pass_opts.push_back(vecz_opts);
+  }
+
+  // Allow any decisions made on early linking builtins to be overridden
+  // with an env variable
+  if (env_debug_prefix) {
+    const std::string env_name = *env_debug_prefix + "_EARLY_LINK_BUILTINS";
+    if (const char *early_link_builtins_env = getenv(env_name.c_str())) {
+      env_var_opts.early_link_builtins = atoi(early_link_builtins_env) != 0;
+    }
+  }
+
+  return env_var_opts;
+}
+
+
 static bool hostVeczPassOpts(
     llvm::Function &F, llvm::ModuleAnalysisManager &MAM,
     llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) {
@@ -117,7 +211,16 @@ static bool hostVeczPassOpts(
   vecz_options.factor =
       compiler::utils::VectorizationFactor::getFixedWidth(SIMDWidth);
 
-  Opts.push_back(vecz_options);
+  if (getenv("CA_HOST_VF")) {
+    auto env_var_opts = HostPassMachinery::processOptimizationOptions(
+        /*env_debug_prefix*/ {}, vecz_mode);
+    if (env_var_opts.vecz_pass_opts.empty()) {
+      return false;
+    }
+    Opts.assign(env_var_opts.vecz_pass_opts);    
+  } else {
+    Opts.push_back(vecz_options);
+  }
   return true;
 }
 
@@ -178,6 +281,7 @@ void HostPassMachinery::registerPassCallbacks() {
 
 bool HostPassMachinery::handlePipelineElement(llvm::StringRef Name,
                                               llvm::ModulePassManager &PM) {
+
   if (Name.consume_front("host-late-passes")) {
     PM.addPass(getLateTargetPasses());
     return true;
@@ -238,6 +342,9 @@ llvm::ModulePassManager HostPassMachinery::getKernelFinalizationPasses(
   llvm::ModulePassManager PM;
   const compiler::BasePassPipelineTuner tuner(options);
 
+  auto env_var_opts =
+      processOptimizationOptions("CA_HOST", /* vecz_mode*/ {});
+
   // Forcibly compute the BuiltinInfoAnalysis so that cached retrievals work.
   PM.addPass(llvm::RequireAnalysisPass<compiler::utils::BuiltinInfoAnalysis,
                                        llvm::Module>());
@@ -246,6 +353,10 @@ llvm::ModulePassManager HostPassMachinery::getKernelFinalizationPasses(
   PM.addPass(llvm::createModuleToFunctionPassAdaptor(
       compiler::utils::ReplaceAddressSpaceQualifierFunctionsPass()));
 
+  if (env_var_opts.early_link_builtins) {
+    PM.addPass(compiler::utils::LinkBuiltinsPass());
+  }
+
   addPreVeczPasses(PM, tuner);
 
   PM.addPass(vecz::RunVeczPass());

@@ -101,7 +101,7 @@ HostInfo::HostInfo(host::arch arch, host::os os,
 
   vectorizable = true;
   dma_optimizable = true;
-  scalable_vector_support = false;
+  scalable_vector_support = getenv("CA_HOST_VF") ? true : false;
   kernel_debug = true;
 #ifdef CA_ENABLE_DEBUG_SUPPORT
   // Dummy values for testing. Enabled only on debug enabled builds with a

@@ -300,11 +300,11 @@ HostKernel::lookupOrCreateOptimizedKernel(std::array<size_t, 3> local_size) {
     }
 
     // Host doesn't support scalable values.
-    if (fn_metadata.min_work_item_factor.isScalable() ||
-        fn_metadata.pref_work_item_factor.isScalable() ||
-        fn_metadata.sub_group_size.isScalable()) {
-      return cargo::make_unexpected(compiler::Result::FINALIZE_PROGRAM_FAILURE);
-    }
+    // if (fn_metadata.min_work_item_factor.isScalable() ||
+    //     fn_metadata.pref_work_item_factor.isScalable() ||
+    //     fn_metadata.sub_group_size.isScalable()) {
+    //   return cargo::make_unexpected(compiler::Result::FINALIZE_PROGRAM_FAILURE);
+    // }
 
     // Note that we grab a handle to the module here, which we use to reference
     // the module going forward. This is despite us passing ownership of the

@@ -78,17 +78,21 @@ cargo::optional<kernel_variant_map> readBinaryMetadata(loader::ElfFile *elf,
   handler::VectorizeInfoMetadata md;
   while (handler.read(md)) {
     // We don't expect scalable vectorization widths on host.
+    bool isScalable = false;
     if (md.min_work_item_factor.isScalable() ||
         md.pref_work_item_factor.isScalable()) {
-      return cargo::nullopt;
+      printf("Warning: Scalable support is experimental on host target\n");
+     // return cargo::nullopt;
+     isScalable = true;
     }
     const host::binary_kernel_s kernel{
         /*hook*/ 0,
         std::move(md.kernel_name),
+        // TODO: Work out sensible values for scalable.
         static_cast<uint32_t>(md.local_memory_usage),
-        md.min_work_item_factor.getFixedValue(),
-        md.pref_work_item_factor.getFixedValue(),
-        md.sub_group_size.getFixedValue()};
+         isScalable ? 1 : md.min_work_item_factor.getFixedValue(),
+         isScalable ?  1 : md.pref_work_item_factor.getFixedValue(),
+         isScalable ? 1 : md.sub_group_size.getFixedValue()};
     auto it = kernels.find(md.source_name);
     if (it != kernels.end()) {
       it->second.push_back(kernel);

@@ -222,6 +222,7 @@ endif()
 # ${CLC_EXECUTABLE} may have other things in it (like a qemu invocation). Turn
 # it into a CMake list, so that execute_process() isn't confused.
 string(REPLACE " " ";" CLC_EXECUTABLE "${CLC_EXECUTABLE}")
+# message("__CSD__  ${CLC_EXECUTABLE} -d ${DEVICE_NAME}  -cl-kernel-arg-info  -cl-std=CL${CLC_CL_STD}  ${CLC_OPTIONS_LIST}  ${DEFS_LIST}  -o ${OUTPUT_FILE} -- ${INPUT_FILE}")
 execute_process(
   COMMAND ${CLC_EXECUTABLE}
     -d ${DEVICE_NAME}