Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions build-sycl.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ set CUDNN_PATH=%CUDA_PATH%
set OPENBLAS_PATH=C:\OpenBLAS
set MKL_PATH=C:\Program Files (x86)\Intel\oneAPI\mkl\latest\
set DNNL_PATH=C:\Program Files (x86)\Intel\oneAPI\dnnl\latest\cpu_iomp
set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
set SYCL_PATH=C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0
set OPENCL_LIB_PATH=%SYCL_PATH%\lib
set OPENCL_INCLUDE_PATH=%SYCL_PATH%\include\sycl\CL

rem 3. In most cases you won't need to change anything further down.
echo Deleting build directory:
Expand All @@ -48,7 +49,7 @@ meson setup build --buildtype release -Ddx=%DX12% -Dcudnn=%CUDNN% -Dplain_cuda=%
-Dopencl=%OPENCL% -Dblas=%BLAS% -Dmkl=%MKL% -Dopenblas=%OPENBLAS% -Ddnnl=%DNNL% -Dgtest=%TEST% ^
-Dcudnn_include="%CUDNN_INCLUDE_PATH%" -Dcudnn_libdirs="%CUDNN_LIB_PATH%" ^
-Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^
-Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
-Dpopcnt=false -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
-Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^
-Ddefault_library=static -Dsycl=%SYCL% -Db_vscrt=md

Expand Down
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ if cc.get_id() == 'clang'
endif
if cc.get_id() != 'msvc'
if get_option('buildtype') == 'release'
add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
#add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
endif
endif
if cc.get_id() == 'msvc'
Expand Down
4 changes: 2 additions & 2 deletions src/neural/sycl/inputs_outputs.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ struct InputsOutputs {
void** offset_pointers_ = nullptr;
void** head_offset_pointers_ = nullptr;

// cuda stream used to run the network
// sycl queue used to run the network
sycl::queue& q_ct1;
};

} // namespace cudnn_backend
} // namespace sycldnn_backend
} // namespace lczero
128 changes: 92 additions & 36 deletions src/neural/sycl/network_sycl.cc.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,29 +201,51 @@ class SyclNetwork : public Network {
nf.network() == NF::NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT;

max_batch_size_ = options.GetOrDefault<int>("max_batch", 1024);

std::string device_plat_ = options.GetOrDefault<std::string>("platform", "OpenCL");

// Get all the available platforms
auto platforms = sycl::platform::get_platforms();

// Look only for OpenCL platforms
for (const auto& platform : platforms) {
if (platform.get_info<sycl::info::platform::name>().find(device_plat_) != std::string::npos) {
auto platform_devices = platform.get_devices();
devices.insert(devices.end(), platform_devices.begin(), platform_devices.end());
}
}

// Count the GPU's.
for (const auto& device : devices) {
if (device.is_gpu()) { total_gpus_++; }
}

// Get the sycl device.
device_ = devices[gpu_id_];

sycl::context context{device_};

// Get the number of compute units(execution units).
compute_units_ = (int)device_.get_info<sycl::info::device::max_compute_units>();

int total_gpus = dpct::dev_mgr::instance().device_count();

if (gpu_id_ >= total_gpus)
if (gpu_id_ >= (int)devices.size())
throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_));


//dpct::dev_mgr::instance().get_device(gpu_id_).get_device_info(deviceProp);

sycl_queue_ = new sycl::queue{dpct::dev_mgr::instance().get_device(gpu_id_), [] (sycl::exception_list exceptions) {

auto exceptions_handler = [&] (sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {

std::cout << "Caught asynchronous SYCL exception during GEMM:\n" << e.what() << std::endl;
}

}
}, sycl::property_list{sycl::property::queue::in_order{}}};
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {
std::cout
<< "Caught asynchronous SYCL exception during GEMM:\n"
<< e.what()
<< std::endl;
std::terminate();
}
}
};

sycl_queue_ = new sycl::queue{context, device_,
exceptions_handler, sycl::property_list{sycl::property::queue::in_order{}} };

showDeviceInfo(*sycl_queue_);

Expand All @@ -233,7 +255,7 @@ class SyclNetwork : public Network {

// Select GPU to run on (for *the current* thread).
multi_stream_ = options.GetOrDefault<bool>("multi_stream", false);

// layout used by cuda backend is nchw.
has_tensor_cores_ = false;
constexpr bool fp16 = std::is_same<sycl::half, DataType>::value;
Expand Down Expand Up @@ -915,15 +937,23 @@ class SyclNetwork : public Network {
// Returns the capabilities (e.g. input format, WDL / moves-left support)
// this backend advertises to the search; value is fixed at construction.
const NetworkCapabilities& GetCapabilities() const override {
return capabilities_;
}

// A vector to store all sycl devices.
std::vector<sycl::device> devices;

// True when the selected SYCL device is a CPU device; callers use this
// to pick thread counts and batch sizes appropriate for CPU execution.
bool IsCpu() const override { return device_.is_cpu(); }

// 1 thread for a CPU device; 1 + total GPUs when running on GPUs.
int GetThreads() const override { return device_.is_cpu() ? 1 : 1 + total_gpus_; }

// Suggested mini-batch size for this device. 47 is a fixed heuristic for
// CPU devices; for GPUs, two positions per compute unit has proven to be
// a reasonable default across a wide range of hardware.
int GetMiniBatchSize() const override {
  return device_.is_cpu() ? 47 : compute_units_ * 2;
}

std::unique_ptr<NetworkComputation> NewComputation() override {
// Set correct gpu id for this computation (as it might have been called
// from a different thread).
/*
DPCT1093:90: The "gpu_id_" device may be not the one intended for use.
Adjust the selected device if needed.
*/
dpct::select_device(gpu_id_);
// Creates a fresh computation object bound to this network; safe to call
// from worker threads (the queue/device selection lives in the network).
std::unique_ptr<NetworkComputation> NewComputation() override {
return std::make_unique<SyclNetworkComputation<DataType>>(this, wdl_,
moves_left_);
}
Expand Down Expand Up @@ -953,6 +983,8 @@ class SyclNetwork : public Network {
int gpu_id_;
int l2_cache_size_;
int max_batch_size_;
int compute_units_;
int total_gpus_;
bool wdl_;
bool moves_left_;
bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual
Expand All @@ -964,6 +996,8 @@ class SyclNetwork : public Network {
// by allocating more memory).
mutable std::mutex lock_;
sycl::queue * sycl_queue_;
sycl::device device_;



int numBlocks_;
Expand Down Expand Up @@ -997,15 +1031,37 @@ class SyclNetwork : public Network {
mutable std::mutex inputs_outputs_lock_;
std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;

// Logs identifying information and memory/clock properties of the device
// behind |mqueue| through the CERR logging macro, for startup diagnostics.
void showDeviceInfo(const sycl::queue & mqueue) const {
CERR << "PLATFORM: " << mqueue.get_device().get_platform().get_info<sycl::info::platform::name>();
CERR << "GPU: " << mqueue.get_device().get_info<sycl::info::device::name>();
// Largest single allocation the device supports, in bytes.
CERR << "GPU memory: " << mqueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>();
// Reported in MHz by the SYCL runtime.
CERR << "GPU clock frequency: " << mqueue.get_device().get_info<sycl::info::device::max_clock_frequency>();
// NOTE(review): local_mem_size is the per-work-group local memory, which
// is not necessarily the L2 cache — confirm the intended label.
CERR << "L2 cache capacity: " << mqueue.get_device().get_info<sycl::info::device::local_mem_size>();
CERR << "Global memory Size: " << mqueue.get_device().get_info<sycl::info::device::global_mem_size>();

}
// Logs identifying information and memory/clock properties of the device
// behind |mqueue| to stderr, for startup diagnostics.
//
// Fixes vs. previous revision: the max-allocation line printed the same
// label as the device-name line (the word "memory" was missing), and the
// unit comments claimed MB where the code prints KB.
void showDeviceInfo(const sycl::queue &mqueue) const {
  const auto device = mqueue.get_device();
  // Prefix for device-specific lines ("GPU" or "CPU").
  const std::string device_type = device.is_gpu() ? "GPU" : "CPU";
  // Platform name.
  std::cerr << "PLATFORM: "
            << device.get_platform().get_info<sycl::info::platform::name>()
            << std::endl;
  // Device name.
  std::cerr << device_type << ": "
            << device.get_info<sycl::info::device::name>() << std::endl;
  // Largest single allocation the device supports, in MB.
  std::cerr << device_type << " memory: "
            << device.get_info<sycl::info::device::max_mem_alloc_size>() /
                   (1024 * 1024)
            << " MB" << std::endl;
  // Maximum clock frequency, in MHz.
  std::cerr << device_type << " clock frequency: "
            << device.get_info<sycl::info::device::max_clock_frequency>()
            << " MHz" << std::endl;
  // NOTE(review): local_mem_size is the per-work-group local memory, which
  // is not necessarily the L2 cache — label kept for continuity. In KB.
  std::cerr << "L2 cache capacity: "
            << device.get_info<sycl::info::device::local_mem_size>() / 1024
            << " KB" << std::endl;
  // Total global memory size, in MB.
  std::cerr << "Global memory size: "
            << device.get_info<sycl::info::device::global_mem_size>() /
                   (1024 * 1024)
            << " MB" << std::endl;
}
};

template <typename DataType>
Expand Down