diff --git a/build-sycl.cmd b/build-sycl.cmd
index 7f4d626d77..c50ae1af0b 100644
--- a/build-sycl.cmd
+++ b/build-sycl.cmd
@@ -20,8 +20,9 @@ set CUDNN_PATH=%CUDA_PATH%
 set OPENBLAS_PATH=C:\OpenBLAS
 set MKL_PATH=C:\Program Files (x86)\Intel\oneAPI\mkl\latest\
 set DNNL_PATH=C:\Program Files (x86)\Intel\oneAPI\dnnl\latest\cpu_iomp
-set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
-set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
+set SYCL_PATH=C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0
+set OPENCL_LIB_PATH=%SYCL_PATH%\lib
+set OPENCL_INCLUDE_PATH=%SYCL_PATH%\include\sycl\CL
 
 rem 3. In most cases you won't need to change anything further down.
 echo Deleting build directory:
@@ -48,7 +49,7 @@ meson setup build --buildtype release -Ddx=%DX12% -Dcudnn=%CUDNN% -Dplain_cuda=%
 -Dopencl=%OPENCL% -Dblas=%BLAS% -Dmkl=%MKL% -Dopenblas=%OPENBLAS% -Ddnnl=%DNNL% -Dgtest=%TEST% ^
 -Dcudnn_include="%CUDNN_INCLUDE_PATH%" -Dcudnn_libdirs="%CUDNN_LIB_PATH%" ^
 -Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^
--Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
+-Dpopcnt=false -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
 -Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^
 -Ddefault_library=static -Dsycl=%SYCL% -Db_vscrt=md
diff --git a/meson.build b/meson.build
index d81ae7f979..f0bad260a0 100644
--- a/meson.build
+++ b/meson.build
@@ -38,7 +38,7 @@ if cc.get_id() == 'clang'
 endif
 if cc.get_id() != 'msvc'
   if get_option('buildtype') == 'release'
-    add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
+    #add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
   endif
 endif
 if cc.get_id() == 'msvc'
diff --git a/src/neural/sycl/inputs_outputs.h b/src/neural/sycl/inputs_outputs.h
index e5b7907db0..a83ca1ae1b 100644
--- a/src/neural/sycl/inputs_outputs.h
+++ b/src/neural/sycl/inputs_outputs.h
@@ -108,9 +108,9 @@ struct InputsOutputs {
   void** offset_pointers_ = nullptr;
   void** head_offset_pointers_ = nullptr;
 
-  // cuda stream used to run the network
+  // sycl queue used to run the network
   sycl::queue& q_ct1;
 };
 
-}  // namespace cudnn_backend
+}  // namespace sycldnn_backend
 }  // namespace lczero
diff --git a/src/neural/sycl/network_sycl.cc.dp.cpp b/src/neural/sycl/network_sycl.cc.dp.cpp
index 0c600b3d3d..90041cab14 100644
--- a/src/neural/sycl/network_sycl.cc.dp.cpp
+++ b/src/neural/sycl/network_sycl.cc.dp.cpp
@@ -201,29 +201,51 @@ class SyclNetwork : public Network {
       nf.network() == NF::NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT;
 
   max_batch_size_ = options.GetOrDefault<int>("max_batch", 1024);
-
-  int total_gpus = dpct::dev_mgr::instance().device_count();
-
-  if (gpu_id_ >= total_gpus)
+  std::string device_plat_ =
+      options.GetOrDefault<std::string>("platform", "OpenCL");
+
+  // Get all the available platforms.
+  auto platforms = sycl::platform::get_platforms();
+
+  // Look only at platforms whose name matches the requested one (e.g. "OpenCL").
+  for (const auto& platform : platforms) {
+    if (platform.get_info<sycl::info::platform::name>().find(device_plat_) !=
+        std::string::npos) {
+      auto platform_devices = platform.get_devices();
+      devices.insert(devices.end(), platform_devices.begin(),
+                     platform_devices.end());
+    }
+  }
+
+  // Count the GPUs.
+  for (const auto& device : devices) {
+    if (device.is_gpu()) total_gpus_++;
+  }
+
+  // Validate gpu_id_ before using it to index the device list.
+  if (gpu_id_ >= (int)devices.size())
     throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_));
-
-  //dpct::dev_mgr::instance().get_device(gpu_id_).get_device_info(deviceProp);
-
-  sycl_queue_ = new sycl::queue{dpct::dev_mgr::instance().get_device(gpu_id_), [] (sycl::exception_list exceptions) {
-
-    for (std::exception_ptr const& e : exceptions) {
-      try {
-        std::rethrow_exception(e);
-      } catch(sycl::exception const& e) {
-
-        std::cout << "Caught asynchronous SYCL exception during GEMM:\n" << e.what() << std::endl;
-      }
-
-    }
-  }, sycl::property_list{sycl::property::queue::in_order{}}};
+
+  // Get the sycl device and create a context for it.
+  device_ = devices[gpu_id_];
+  sycl::context context{device_};
+
+  // Get the number of compute units (execution units).
+  compute_units_ =
+      (int)device_.get_info<sycl::info::device::max_compute_units>();
+
+  auto exceptions_handler = [&](sycl::exception_list exceptions) {
+    for (std::exception_ptr const& e : exceptions) {
+      try {
+        std::rethrow_exception(e);
+      } catch (sycl::exception const& e) {
+        std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
+                  << e.what() << std::endl;
+        std::terminate();
+      }
+    }
+  };
+
+  sycl_queue_ = new sycl::queue{
+      context, device_, exceptions_handler,
+      sycl::property_list{sycl::property::queue::in_order{}}};
 
   showDeviceInfo(*sycl_queue_);
@@ -233,7 +255,7 @@ class SyclNetwork : public Network {
   // Select GPU to run on (for *the current* thread).
   multi_stream_ = options.GetOrDefault<bool>("multi_stream", false);
-  
+
   // layout used by cuda backend is nchw.
   has_tensor_cores_ = false;
   constexpr bool fp16 = std::is_same<sycl::half, DataType>::value;
@@ -915,15 +937,23 @@ class SyclNetwork : public Network {
   const NetworkCapabilities& GetCapabilities() const override {
     return capabilities_;
   }
+
+  // A vector to store all sycl devices.
+  std::vector<sycl::device> devices;
+
+  // Check if the device is the cpu, for thread handling.
+  bool IsCpu() const override { return device_.is_cpu(); }
+
+  // 2 threads for the cpu and 1 + total_gpus_ for multiple gpus.
+  int GetThreads() const override {
+    return device_.is_cpu() ? 1 : 1 + total_gpus_;
+  }
+
+  int GetMiniBatchSize() const override {
+    if (device_.is_cpu()) { return 47; }
+    // Simple heuristic that seems to work for a wide range of GPUs.
+    return 2 * compute_units_;
+  }
 
-  std::unique_ptr<NetworkComputation> NewComputation() override {
-    // Set correct gpu id for this computation (as it might have been called
-    // from a different thread).
-    /*
-    DPCT1093:90: The "gpu_id_" device may be not the one intended for use.
-    Adjust the selected device if needed.
-    */
-    dpct::select_device(gpu_id_);
+  std::unique_ptr<NetworkComputation> NewComputation() override {
     return std::make_unique<SyclNetworkComputation<DataType>>(this, wdl_, moves_left_);
   }
@@ -953,6 +983,8 @@ class SyclNetwork : public Network {
   int gpu_id_;
   int l2_cache_size_;
   int max_batch_size_;
+  int compute_units_;
+  int total_gpus_ = 0;
   bool wdl_;
   bool moves_left_;
   bool use_res_block_winograd_fuse_opt_;  // fuse operations inside the residual
@@ -964,6 +996,8 @@ class SyclNetwork : public Network {
   // by allocating more memory).
   mutable std::mutex lock_;
   sycl::queue * sycl_queue_;
+  sycl::device device_;
+
   int numBlocks_;
@@ -997,15 +1031,37 @@ class SyclNetwork : public Network {
   mutable std::mutex inputs_outputs_lock_;
   std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
 
-  void showDeviceInfo(const sycl::queue & mqueue) const {
-    CERR << "PLATFORM: " << mqueue.get_device().get_platform().get_info<sycl::info::platform::name>();
-    CERR << "GPU: " << mqueue.get_device().get_info<sycl::info::device::name>();
-    CERR << "GPU memory: " << mqueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>();
-    CERR << "GPU clock frequency: " << mqueue.get_device().get_info<sycl::info::device::max_clock_frequency>();
-    CERR << "L2 cache capacity: " << mqueue.get_device().get_info<sycl::info::device::local_mem_size>();
-    CERR << "Global memory Size: " << mqueue.get_device().get_info<sycl::info::device::global_mem_size>();
-
-  }
+  void showDeviceInfo(const sycl::queue& mqueue) const {
+    // Platform name.
+    std::cerr << "PLATFORM: "
+              << mqueue.get_device().get_platform().get_info<sycl::info::platform::name>()
+              << std::endl;
+
+    // Device name.
+    std::string device_type = mqueue.get_device().is_gpu() ? "GPU" : "CPU";
+    std::cerr << device_type << ": "
+              << mqueue.get_device().get_info<sycl::info::device::name>()
+              << std::endl;
+
+    // Device memory (max_mem_alloc_size), in MB.
+    std::cerr << device_type << " memory: "
+              << mqueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>() / (1024 * 1024)
+              << " MB" << std::endl;
+
+    // Device clock frequency (max_clock_frequency).
+    std::cerr << device_type << " clock frequency: "
+              << mqueue.get_device().get_info<sycl::info::device::max_clock_frequency>()
+              << " MHz" << std::endl;
+
+    // L2 cache capacity (local_mem_size), in KB.
+    std::cerr << "L2 cache capacity: "
+              << mqueue.get_device().get_info<sycl::info::device::local_mem_size>() / 1024
+              << " KB" << std::endl;
+
+    // Global memory size (global_mem_size), in MB.
+    std::cerr << "Global memory size: "
+              << mqueue.get_device().get_info<sycl::info::device::global_mem_size>() / (1024 * 1024)
+              << " MB" << std::endl;
+  }
 };
 
 template <typename DataType>
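Note: the device-selection scheme introduced in the constructor above can be exercised outside lc0. The following standalone sketch assumes a oneAPI DPC++ toolchain (e.g. icpx -fsycl select_device.cpp); the wanted_platform string and gpu_id constant are illustrative stand-ins for the backend's "platform" and GPU-id options, not lc0 code. It mirrors the patched code path: filter platforms by name, bounds-check the device index, then build an in-order queue with an asynchronous exception handler.

// Standalone sketch of the platform-filtered device selection used above.
// Assumes oneAPI DPC++; wanted_platform and gpu_id are illustrative values.
#include <sycl/sycl.hpp>

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

int main() {
  const std::string wanted_platform = "OpenCL";  // or e.g. "Level-Zero"
  const int gpu_id = 0;

  // Collect the devices of every platform whose name contains the filter.
  std::vector<sycl::device> devices;
  for (const auto& platform : sycl::platform::get_platforms()) {
    if (platform.get_info<sycl::info::platform::name>().find(wanted_platform) !=
        std::string::npos) {
      auto platform_devices = platform.get_devices();
      devices.insert(devices.end(), platform_devices.begin(),
                     platform_devices.end());
    }
  }

  // Bounds-check before indexing, as the patched constructor now does.
  if (gpu_id >= static_cast<int>(devices.size()))
    throw std::runtime_error("Invalid GPU Id: " + std::to_string(gpu_id));

  sycl::device device = devices[gpu_id];
  sycl::context context{device};

  // Asynchronous errors are delivered here instead of being silently dropped.
  auto handler = [](sycl::exception_list exceptions) {
    for (const std::exception_ptr& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (const sycl::exception& ex) {
        std::cerr << "Async SYCL exception: " << ex.what() << std::endl;
        std::terminate();
      }
    }
  };

  // In-order queue: commands run in submission order, so consecutive kernels
  // need no explicit event dependencies.
  sycl::queue q{context, device, handler,
                sycl::property_list{sycl::property::queue::in_order{}}};

  std::cout << "Using " << q.get_device().get_info<sycl::info::device::name>()
            << " with "
            << q.get_device().get_info<sycl::info::device::max_compute_units>()
            << " compute units" << std::endl;
  return 0;
}

The in_order property matches the removed dpct-based queue, which passed the same property, so kernel ordering behavior is preserved across the migration.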