Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions build-sycl.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ set CUDNN_PATH=%CUDA_PATH%
set OPENBLAS_PATH=C:\OpenBLAS
set MKL_PATH=C:\Program Files (x86)\Intel\oneAPI\mkl\latest\
set DNNL_PATH=C:\Program Files (x86)\Intel\oneAPI\dnnl\latest\cpu_iomp
set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
set SYCL_PATH=C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0
set OPENCL_LIB_PATH=%SYCL_PATH%\lib
set OPENCL_INCLUDE_PATH=%SYCL_PATH%\include\sycl\CL

rem 3. In most cases you won't need to change anything further down.
echo Deleting build directory:
Expand All @@ -48,7 +49,7 @@ meson setup build --buildtype release -Ddx=%DX12% -Dcudnn=%CUDNN% -Dplain_cuda=%
-Dopencl=%OPENCL% -Dblas=%BLAS% -Dmkl=%MKL% -Dopenblas=%OPENBLAS% -Ddnnl=%DNNL% -Dgtest=%TEST% ^
-Dcudnn_include="%CUDNN_INCLUDE_PATH%" -Dcudnn_libdirs="%CUDNN_LIB_PATH%" ^
-Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^
-Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
-Dpopcnt=false -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
-Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^
-Ddefault_library=static -Dsycl=%SYCL% -Db_vscrt=md

Expand Down
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ if cc.get_id() == 'clang'
endif
if cc.get_id() != 'msvc'
if get_option('buildtype') == 'release'
add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
#add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
endif
endif
if cc.get_id() == 'msvc'
Expand Down
4 changes: 2 additions & 2 deletions src/neural/sycl/inputs_outputs.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ struct InputsOutputs {
void** offset_pointers_ = nullptr;
void** head_offset_pointers_ = nullptr;

// cuda stream used to run the network
// sycl queue used to run the network
sycl::queue& q_ct1;
};

} // namespace cudnn_backend
} // namespace sycldnn_backend
} // namespace lczero
128 changes: 92 additions & 36 deletions src/neural/sycl/network_sycl.cc.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,29 +201,51 @@ class SyclNetwork : public Network {
nf.network() == NF::NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT;

max_batch_size_ = options.GetOrDefault<int>("max_batch", 1024);

std::string device_plat_ = options.GetOrDefault<std::string>("platform", "OpenCL");

// Get all the available platforms
auto platforms = sycl::platform::get_platforms();

// Look only for OpenCL platforms
for (const auto& platform : platforms) {
if (platform.get_info<sycl::info::platform::name>().find(device_plat_) != std::string::npos) {
auto platform_devices = platform.get_devices();
devices.insert(devices.end(), platform_devices.begin(), platform_devices.end());
}
}

// Count the GPU's.
for (const auto& device : devices) {
if (device.is_gpu()) { total_gpus_++; }
}

// Get the sycl device.
device_ = devices[gpu_id_];

sycl::context context{device_};

// Get the number of compute units(execution units).
compute_units_ = (int)device_.get_info<sycl::info::device::max_compute_units>();

int total_gpus = dpct::dev_mgr::instance().device_count();

if (gpu_id_ >= total_gpus)
if (gpu_id_ >= (int)devices.size())
throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_));


//dpct::dev_mgr::instance().get_device(gpu_id_).get_device_info(deviceProp);

sycl_queue_ = new sycl::queue{dpct::dev_mgr::instance().get_device(gpu_id_), [] (sycl::exception_list exceptions) {

auto exceptions_handler = [&] (sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {

std::cout << "Caught asynchronous SYCL exception during GEMM:\n" << e.what() << std::endl;
}

}
}, sycl::property_list{sycl::property::queue::in_order{}}};
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {
std::cout
<< "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what()
<< std::endl;
std::terminate();
}
}
};

sycl_queue_ = new sycl::queue{context, device_,
exceptions_handler, sycl::property_list{sycl::property::queue::in_order{}} };

showDeviceInfo(*sycl_queue_);

Expand All @@ -233,7 +255,7 @@ class SyclNetwork : public Network {

// Select GPU to run on (for *the current* thread).
multi_stream_ = options.GetOrDefault<bool>("multi_stream", false);

// layout used by cuda backend is nchw.
has_tensor_cores_ = false;
constexpr bool fp16 = std::is_same<sycl::half, DataType>::value;
Expand Down Expand Up @@ -915,15 +937,23 @@ class SyclNetwork : public Network {
// Returns the capabilities (e.g. input format, WDL / moves-left support)
// this backend advertises to the search; value is fixed at construction.
const NetworkCapabilities& GetCapabilities() const override {
return capabilities_;
}

// A vector to store all sycl devices.
std::vector<sycl::device> devices;

// True when the selected SYCL device is a CPU device; callers use this
// to pick thread counts and batch sizes appropriate for CPU execution.
bool IsCpu() const override { return device_.is_cpu(); }

// 1 thread for a CPU device; 1 + total GPUs when running on GPUs.
int GetThreads() const override { return device_.is_cpu() ? 1 : 1 + total_gpus_; }

// Suggested mini-batch size for this device. 47 is a fixed heuristic for
// CPU devices; for GPUs, two positions per compute unit has proven to be
// a reasonable default across a wide range of hardware.
int GetMiniBatchSize() const override {
  return device_.is_cpu() ? 47 : compute_units_ * 2;
}

std::unique_ptr<NetworkComputation> NewComputation() override {
// Set correct gpu id for this computation (as it might have been called
// from a different thread).
/*
DPCT1093:90: The "gpu_id_" device may be not the one intended for use.
Adjust the selected device if needed.
*/
dpct::select_device(gpu_id_);
// Creates a fresh computation object bound to this network; safe to call
// from worker threads (the queue/device selection lives in the network).
std::unique_ptr<NetworkComputation> NewComputation() override {
return std::make_unique<SyclNetworkComputation<DataType>>(this, wdl_,
moves_left_);
}
Expand Down Expand Up @@ -953,6 +983,8 @@ class SyclNetwork : public Network {
int gpu_id_;
int l2_cache_size_;
int max_batch_size_;
int compute_units_;
int total_gpus_;
bool wdl_;
bool moves_left_;
bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual
Expand All @@ -964,6 +996,8 @@ class SyclNetwork : public Network {
// by allocating more memory).
mutable std::mutex lock_;
sycl::queue * sycl_queue_;
sycl::device device_;



int numBlocks_;
Expand Down Expand Up @@ -997,15 +1031,37 @@ class SyclNetwork : public Network {
mutable std::mutex inputs_outputs_lock_;
std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;

// Logs identifying information and memory/clock properties of the device
// behind |mqueue| through the CERR logging macro, for startup diagnostics.
void showDeviceInfo(const sycl::queue & mqueue) const {
CERR << "PLATFORM: " << mqueue.get_device().get_platform().get_info<sycl::info::platform::name>();
CERR << "GPU: " << mqueue.get_device().get_info<sycl::info::device::name>();
// Largest single allocation the device supports, in bytes.
CERR << "GPU memory: " << mqueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>();
// Reported in MHz by the SYCL runtime.
CERR << "GPU clock frequency: " << mqueue.get_device().get_info<sycl::info::device::max_clock_frequency>();
// NOTE(review): local_mem_size is the per-work-group local memory, which
// is not necessarily the L2 cache — confirm the intended label.
CERR << "L2 cache capacity: " << mqueue.get_device().get_info<sycl::info::device::local_mem_size>();
CERR << "Global memory Size: " << mqueue.get_device().get_info<sycl::info::device::global_mem_size>();

}
// Logs identifying information and memory/clock properties of the device
// behind |mqueue| to stderr, for startup diagnostics.
//
// Fixes vs. previous revision: the max-allocation line printed the same
// label as the device-name line (the word "memory" was missing), and the
// unit comments claimed MB where the code prints KB.
void showDeviceInfo(const sycl::queue &mqueue) const {
  const auto device = mqueue.get_device();
  // Prefix for device-specific lines ("GPU" or "CPU").
  const std::string device_type = device.is_gpu() ? "GPU" : "CPU";
  // Platform name.
  std::cerr << "PLATFORM: "
            << device.get_platform().get_info<sycl::info::platform::name>()
            << std::endl;
  // Device name.
  std::cerr << device_type << ": "
            << device.get_info<sycl::info::device::name>() << std::endl;
  // Largest single allocation the device supports, in MB.
  std::cerr << device_type << " memory: "
            << device.get_info<sycl::info::device::max_mem_alloc_size>() /
                   (1024 * 1024)
            << " MB" << std::endl;
  // Maximum clock frequency, in MHz.
  std::cerr << device_type << " clock frequency: "
            << device.get_info<sycl::info::device::max_clock_frequency>()
            << " MHz" << std::endl;
  // NOTE(review): local_mem_size is the per-work-group local memory, which
  // is not necessarily the L2 cache — label kept for continuity. In KB.
  std::cerr << "L2 cache capacity: "
            << device.get_info<sycl::info::device::local_mem_size>() / 1024
            << " KB" << std::endl;
  // Total global memory size, in MB.
  std::cerr << "Global memory size: "
            << device.get_info<sycl::info::device::global_mem_size>() /
                   (1024 * 1024)
            << " MB" << std::endl;
}
};

template <typename DataType>
Expand Down