diff --git a/for_experiments/README.md b/for_experiments/README.md new file mode 100644 index 000000000..1d26cf3bc --- /dev/null +++ b/for_experiments/README.md @@ -0,0 +1,12 @@ +# runtests.py +Инструмент для конфигурации параметров тестов и Vortex, сборки тестов, их запуска под симуляциями/на FPGA, сбора статистики и отрисовки графиков. + +Для того, чтобы тесты запускались TESTS_NUM раз, требуется обернуть код хоста, отвечающий за запуск тестов и сбор результатов, в цикл. Пример можно увидеть в `vortex/tests/opencl/kernel1/main.cc`. Вывод со всех запусков одного теста сохраняется в один файл. + +# Эксперименты с локальной памятью и стеком + +В `vortex/hw/rtl/VX_config.vh` задаются начальные адреса стека и локальной памяти. + +Там была добавлена переменная NEW_STACK_BASE_ADDR. На данный момент она вычисляется аналогично формуле из `vortex/hw/rtl/core/VX_lsu_slice.sv` (формула вычисления конца локальной памяти). + +Для смещения стека нужно изменить STACK_BASE_ADDR на NEW_STACK_BASE_ADDR в `vortex/kernel/src/vx_start.S` в данной строчке: `LOAD_IMMEDIATE64(sp, STACK_BASE_ADDR)` и пересобрать Vortex. \ No newline at end of file diff --git a/for_experiments/draw_time.py b/for_experiments/draw_time.py new file mode 100644 index 000000000..92e23a727 --- /dev/null +++ b/for_experiments/draw_time.py @@ -0,0 +1,93 @@ +import matplotlib.pyplot as plt +from scipy import stats +import numpy as np +from matplotlib.pyplot import figure + +# PUT YOUR PATH HERE +IMG_PATH="/home/spisladqo/Desktop/" + +n256_kernel1_res = [373.0, 366.0, 372.0, 373.0, 366.0, 371.0, 365.0, 365.0, 365.0, 370.0, 372.0, 364.0, 370.0, 365.0, 371.0, 374.0, 364.0, 372.0, 374.0, 369.0, 371.0, 377.0, 371.0, 370.0, 373.0, 367.0, 371.0, 365.0, 371.0, 370.0, 364.0, 364.0, 374.0, 369.0, 364.0, 369.0, 370.0, 372.0, 373.0, 370.0, 368.0, 364.0, 365.0, 364.0, 373.0, 371.0, 373.0, 370.0, 371.0, 365.0, 373.0, 374.0, 365.0, 388.0, 368.0, 372.0, 376.0, 371.0, 366.0, 372.0, 366.0, 365.0, 365.0, 370.0, 365.0, 372.0, 371.0, 367.0, 371.0, 371.0, 371.0, 374.0, 384.0, 365.0, 365.0, 369.0, 369.0, 373.0, 369.0, 373.0, 369.0, 366.0, 365.0, 364.0, 372.0, 365.0, 370.0, 364.0, 370.0, 366.0, 373.0, 371.0, 377.0, 367.0, 366.0] + +n256_kernel2_res = [1358.0, 1353.0, 1358.0, 1336.0, 1357.0, 1359.0, 1358.0, 1357.0, 1357.0, 1358.0, 1358.0, 1358.0, 1356.0, 1355.0, 1359.0, 1358.0, 1358.0, 1359.0, 1358.0, 1359.0, 1358.0, 1359.0, 1359.0, 1357.0, 1355.0, 1358.0, 1359.0, 1358.0, 1358.0, 1358.0, 1337.0, 1358.0, 1356.0, 1358.0, 1358.0, 1359.0, 1356.0, 1358.0, 1341.0, 1358.0, 1358.0, 1358.0, 1357.0, 1358.0, 1358.0, 1357.0, 1358.0, 1358.0, 1352.0, 1344.0, 1357.0, 1358.0, 1348.0, 1358.0, 1357.0, 1351.0, 1354.0, 1358.0, 1357.0, 1359.0, 1358.0, 1332.0, 1332.0, 1356.0, 1358.0, 1357.0, 1356.0, 1358.0, 1357.0, 1357.0, 1358.0, 1331.0, 1358.0, 1358.0, 1340.0, 1356.0, 1358.0, 1359.0, 1357.0, 1358.0, 1357.0, 1357.0, 1358.0, 1358.0, 1358.0, 1356.0, 1358.0, 1328.0, 1357.0, 1357.0, 1357.0, 1358.0, 1358.0, 1358.0, 1340.0] + +n256_kernel3_res = [6059.0, 6055.0, 6059.0, 6061.0, 6058.0, 6054.0, 6060.0, 6056.0, 6057.0, 6054.0, 6059.0, 6062.0, 6057.0, 6057.0, 6053.0, 6059.0, 6059.0, 6054.0, 6060.0, 6057.0, 6057.0, 6058.0, 6057.0, 6059.0, 6058.0, 6060.0, 6055.0, 6054.0, 6058.0, 6049.0, 6058.0, 6057.0, 6060.0, 6058.0, 6060.0, 6060.0, 6054.0, 6060.0, 6049.0, 6057.0, 6062.0, 6056.0, 6058.0, 6058.0, 6050.0, 6059.0, 6056.0, 6054.0, 6056.0, 6058.0, 6056.0, 6063.0, 6052.0, 6052.0, 6054.0, 6056.0, 6060.0, 6060.0, 6060.0, 6051.0, 6064.0, 6056.0, 6057.0, 6058.0, 6057.0, 6056.0, 6062.0, 6051.0, 6058.0, 6054.0, 6061.0, 6050.0, 6060.0, 6059.0, 6057.0, 6039.0, 6056.0, 6056.0, 6044.0, 6058.0, 6046.0, 6059.0, 6052.0, 6062.0, 6057.0, 6057.0, 6054.0, 6053.0, 6056.0, 6056.0, 6060.0, 6058.0, 6061.0, 6058.0, 6055.0] + + +n512_kernel1_res = [3448.0, 3279.0, 3587.0, 3495.0, 3293.0, 3556.0, 3307.0, 3240.0, 3400.0, 3170.0, 3645.0, 3302.0, 3558.0, 3425.0, 3192.0, 3290.0, 3375.0, 3280.0, 3372.0, 3412.0, 3486.0, 3058.0, 3146.0, 3110.0, 3114.0, 3385.0, 3349.0, 3558.0, 3309.0, 3157.0, 3260.0, 3445.0, 3304.0, 3434.0, 3417.0, 3015.0, 3545.0, 3430.0, 3016.0, 3418.0, 3016.0, 3392.0, 3416.0, 3260.0, 3211.0, 3143.0, 3375.0, 3421.0, 3432.0, 3517.0, 3438.0, 3430.0, 3143.0, 3398.0, 3121.0, 3452.0, 3673.0, 3014.0, 3457.0, 3398.0, 3248.0, 3340.0, 3260.0, 3145.0, 3401.0, 3474.0, 3233.0, 3363.0, 3405.0, 3015.0, 3336.0, 3471.0, 3179.0, 3398.0, 3078.0, 3154.0, 3456.0, 3433.0, 3396.0, 3417.0, 3349.0, 3309.0, 3146.0, 3573.0, 3016.0, 3275.0, 3294.0, 3312.0, 3474.0, 3407.0, 3342.0, 3467.0, 3634.0, 3110.0, 3394.0] + +n512_kernel2_res = [10405.0, 10419.0, 10407.0, 10396.0, 10407.0, 10400.0, 10404.0, 10406.0, 10410.0, 10404.0, 10407.0, 10408.0, 10398.0, 10399.0, 10417.0, 10405.0, 10400.0, 10409.0, 10401.0, 10418.0, 10410.0, 10404.0, 10397.0, 10411.0, 10421.0, 10413.0, 10407.0, 10406.0, 10397.0, 10400.0, 10413.0, 10412.0, 10405.0, 10409.0, 10401.0, 10397.0, 10405.0, 10396.0, 10403.0, 10408.0, 10402.0, 10399.0, 10395.0, 10404.0, 10405.0, 10402.0, 10399.0, 10397.0, 10404.0, 10405.0, 10405.0, 10412.0, 10400.0, 10410.0, 10405.0, 10413.0, 10414.0, 10404.0, 10410.0, 10405.0, 10406.0, 10410.0, 10411.0, 10414.0, 10408.0, 10396.0, 10401.0, 10413.0, 10403.0, 10410.0, 10410.0, 10404.0, 10406.0, 10403.0, 10403.0, 10414.0, 10412.0, 10416.0, 10399.0, 10403.0, 10409.0, 10394.0, 10398.0, 10395.0, 10400.0, 10401.0, 10407.0, 10403.0, 10411.0, 10399.0, 10418.0, 10401.0, 10402.0, 10401.0, 10406.0] + +n512_kernel3_res = [47582.0, 47570.0, 47583.0, 47582.0, 47573.0, 47585.0, 47574.0, 47581.0, 47569.0, 47583.0, 47582.0, 47590.0, 47576.0, 47584.0, 47578.0, 47581.0, 47601.0, 47582.0, 47587.0, 47578.0, 47587.0, 47596.0, 47590.0, 47577.0, 47574.0, 47575.0, 47581.0, 47582.0, 47588.0, 47582.0, 47587.0, 47600.0, 47605.0, 47577.0, 47582.0, 47594.0, 47572.0, 47586.0, 47595.0, 47595.0, 47587.0, 47577.0, 47580.0, 47560.0, 47590.0, 47577.0, 47574.0, 47584.0, 47592.0, 47584.0, 47575.0, 47584.0, 47581.0, 47576.0, 47593.0, 47580.0, 47575.0, 47576.0, 47573.0, 47591.0, 47576.0, 47591.0, 47597.0, 47579.0, 47560.0, 47598.0, 47600.0, 47575.0, 47573.0, 47587.0, 47570.0, 47592.0, 47577.0, 47573.0, 47596.0, 47554.0, 47591.0, 47564.0, 47582.0, 47583.0, 47606.0, 47582.0, 47581.0, 47578.0, 47568.0, 47596.0, 47586.0, 47597.0, 47578.0, 47591.0, 47589.0, 47575.0, 47576.0, 47569.0, 47576.0] + + +n1024_kernel1_res = [28150.0, 28862.0, 29544.0, 30206.0, 28569.0, 28654.0, 30181.0, 30150.0, 29474.0, 30106.0, 28512.0, 30042.0, 28701.0, 28152.0, 28859.0, 30155.0, 30224.0, 29432.0, 28911.0, 28687.0, 30063.0, 30143.0, 28861.0, 28211.0, 28954.0, 29875.0, 29986.0, 30087.0, 30087.0, 28651.0, 28900.0, 28697.0, 28846.0, 29990.0, 28665.0, 30013.0, 28193.0, 29111.0, 30110.0, 29093.0, 28558.0, 28558.0, 28635.0, 30102.0, 30165.0, 29002.0, 29136.0, 28254.0, 28054.0, 30038.0, 30063.0, 30144.0, 30051.0, 29017.0, 28873.0, 28867.0, 28559.0, 28502.0, 28640.0, 29110.0, 28493.0, 29467.0, 28175.0, 28537.0, 30084.0, 29158.0, 28046.0, 28292.0, 28527.0, 28814.0, 28063.0, 28756.0, 30168.0, 29314.0, 29363.0, 28504.0, 28446.0, 29029.0, 28272.0, 29360.0, 29034.0, 28835.0, 28704.0, 28443.0, 28983.0, 29316.0, 30144.0, 28205.0, 28947.0, 29641.0, 29991.0, 30107.0, 30053.0, 28549.0, 28545.0] + +n1024_kernel2_res = [82354.0, 82365.0, 82404.0, 82295.0, 82304.0, 82384.0, 82319.0, 82314.0, 82291.0, 82312.0, 82352.0, 82349.0, 82393.0, 82389.0, 82373.0, 82363.0, 82416.0, 82292.0, 82394.0, 82397.0, 82371.0, 82316.0, 82255.0, 82382.0, 82385.0, 82385.0, 82339.0, 82405.0, 82378.0, 82342.0, 82413.0, 82401.0, 82385.0, 82328.0, 82299.0, 82389.0, 82341.0, 82394.0, 82318.0, 82378.0, 82407.0, 82337.0, 82396.0, 82360.0, 82323.0, 82374.0, 82358.0, 82356.0, 82314.0, 82334.0, 82327.0, 82396.0, 82366.0, 82348.0, 82346.0, 82381.0, 82424.0, 82278.0, 82314.0, 82365.0, 82350.0, 82363.0, 82378.0, 82415.0, 82378.0, 82458.0, 82366.0, 82250.0, 82399.0, 82376.0, 82357.0, 82350.0, 82209.0, 82382.0, 82396.0, 82282.0, 82315.0, 82313.0, 82385.0, 82291.0, 82395.0, 82312.0, 82324.0, 82313.0, 82327.0, 82419.0, 82381.0, 82362.0, 82324.0, 82372.0, 82327.0, 82389.0, 82339.0, 82320.0, 82265.0] + +n1024_kernel3_res = [391810.0, 386761.0, 393515.0, 395321.0, 397165.0, 394845.0, 392653.0, 392496.0, 401358.0, 395533.0, 396324.0, 387395.0, 384694.0, 394182.0, 393119.0, 390446.0, 387248.0, 394546.0, 387006.0, 392582.0, 394817.0, 399672.0, 396924.0, 391020.0, 385367.0, 389838.0, 390885.0, 387743.0, 395776.0, 393097.0, 390371.0, 392909.0, 387943.0, 392467.0, 384945.0, 394151.0, 394899.0, 386740.0, 395799.0, 392502.0, 388976.0, 390435.0, 395814.0, 390241.0, 390657.0, 390629.0, 395900.0, 403441.0, 389058.0, 395286.0, 394472.0, 390110.0, 399185.0, 393488.0, 393870.0, 395265.0, 395560.0, 391591.0, 391356.0, 389840.0, 393721.0, 398932.0, 393984.0, 386239.0, 385353.0, 390503.0, 392246.0, 403124.0, 394703.0, 395079.0, 389900.0, 391337.0, 392764.0, 398275.0, 396458.0, 390787.0, 393488.0, 404521.0, 389960.0, 393945.0, 392484.0, 387752.0, 386394.0, 390836.0, 397008.0, 394390.0, 389267.0, 394980.0, 396399.0, 389091.0, 390432.0, 395251.0, 394293.0, 391675.0, 396709.0] + +n256_res = [n256_kernel1_res, n256_kernel2_res, n256_kernel3_res] +n512_res = [n512_kernel1_res, n512_kernel2_res, n512_kernel3_res] +n1024_res = [n1024_kernel1_res, n1024_kernel2_res, n1024_kernel3_res] + +def find_list_means(ls: list) -> list: + return list(map(lambda x: (np.mean(x)), ls)) + +def find_list_stds(ls: list) -> list: + return list(map(lambda x: (np.std(x, ddof=1)), ls)) + +def find_list_margin_of_err(ls : list) -> list: + return list(map(lambda x: stats.t.ppf(0.975, df=len(x)-1)*stats.sem(x), ls)) + +testnames = ("No opt", "LT", "LT + More WPT") + +matrices = ["Matrices 256x256", "Matrices 512x512", "Matrices 1024x1024"] + +n256_means = find_list_means(n256_res) +n512_means = find_list_means(n512_res) +n1024_means = find_list_means(n1024_res) + +time_means = { + matrices[0]: n256_means, + matrices[1]: n512_means, + matrices[2]: n1024_means, +} + +n256_stds = find_list_stds(n256_res) +n512_stds = find_list_stds(n512_res) +n1024_stds = find_list_stds(n1024_res) + +time_stds = { + matrices[0]: n256_stds, + matrices[1]: n512_stds, + matrices[2]: n1024_stds, +} + +time_margins_of_err = { + matrices[0]: find_list_margin_of_err(n256_res), + matrices[1]: find_list_margin_of_err(n512_res), + matrices[2]: find_list_margin_of_err(n1024_res), +} + +for i in matrices: + print(i) + for j in range(3): + print("Testname:", testnames[j]) + print("Mean:", time_means[i][j]) + print("Std: ", time_stds[i][j]) + print("Margin of err:", time_margins_of_err[i][j]) + print("Margin of err, relative (% of mean):", time_margins_of_err[i][j] * 100 / time_means[i][j]) + print("Margin of err, relative (% of std):", time_margins_of_err[i][j] * 100 / time_stds[i][j]) + +matrices = ["Matrices 256x256", "Matrices 512x512", "Matrices 1024x1024"] +sizes = [256, 512, 1024] + +for i in range(len(matrices)): + key = matrices[i] + figure(figsize=(6.5, 8), dpi=80) + plt.bar(testnames, time_means[key]) + plt.errorbar(testnames, time_means[key], yerr=time_stds[matrices[i]], capsize=3, ecolor="black", fmt="o") + plt.title(f"Avg time of matrix multiplication on FPGA\n{key}") + plt.xlabel("Test") + plt.ylabel("Time (ms)") + plt.savefig(f"{IMG_PATH}/time_graph_fpga_{sizes[i]}") diff --git a/for_experiments/graphics/.keep b/for_experiments/graphics/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/for_experiments/outputs/.keep b/for_experiments/outputs/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/for_experiments/runtests.py b/for_experiments/runtests.py new file mode 100644 index 000000000..100263f2c --- /dev/null +++ b/for_experiments/runtests.py @@ -0,0 +1,243 @@ +import subprocess +import matplotlib.pyplot as plt +from dataclasses import dataclass +import pandas as pd +from pathlib import Path +import re +import sys +from scipy import stats +import numpy as np + +# architecture parameters +@dataclass +class arch: + warps: int + cores: int + threads: int +@dataclass +# fpga parameters (CHANGE HERE) +class fpga_data: + platform = "xilinx_u50_gen3x16_xdma_5_202210_1" + dirpref = "test1" + sim = "hw" +# running parameters +@dataclass +class run: + arch: arch + fpga_data: fpga_data + perf: int + kernel: str + driver: str + msize: int + +path_to_vortex = Path.cwd().parent +tile_size = 'TS' +work_per_thread = 'WPT' +width = 'WIDTH' +tests_num = 'TESTS_NUM' + +def error_running (run_params: run, error_text: str) -> str: + return f"error running in {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}" \ + f" driver={run_params.driver} args=-n{run_params.msize} error message - {error_text}/n" + +def error_verification (run_params: run, number_of_errors: str) -> str: + return f"error in verifing results {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}" \ + f" driver={run_params.driver} args=-n{run_params.msize} Number of errors : {number_of_errors}'\n'" + +def create_common_h (params: dict, kernel_name: str): + file_name = f"{path_to_vortex}/tests/opencl/{kernel_name}/common.h" + with open(file_name, 'w') as file: + file.write("#ifndef COMMON_H\n" + "#define COMMON_H\n" + "\n") + file.write(f"#define TESTS_NUM {params[tests_num]}\n" + "\n") + if tile_size in params: + file.write(f"#define TS {params[tile_size]}\n") + if work_per_thread in params: + file.write(f"#define WPT {params[work_per_thread]}\n") + file.write("#define RTS (TS/WPT)\n") + if width in params: + file.write(f"#define WIDTH {params[width]}\n") + file.write('\n' + "#endif // COMMON_H") + # open main.cc file to recompile before run with new common.h + Path(f"{path_to_vortex}/tests/opencl/{kernel_name}/main.cc").touch(exist_ok=True) + +def runtest(run_params: run, path_to_output_file: str) -> int: + perf = f"--perf={run_params.perf}" + run_args = f"-n{run_params.msize}" + vortex = f"--warps={run_params.arch.warps} --cores={run_params.arch.cores} --threads={run_params.arch.threads}" + if run_params.driver == "simx" or run_params.driver == "rtlsim": + command = f"cd {path_to_vortex}/build && ./ci/blackbox.sh {vortex} {perf} --driver={run_params.driver} --app={run_params.kernel} --args={run_args}" + elif run_params.driver == "xrt": + fpga_data = run_params.fpga_data + fpga_pref = f"FPGA_BIN_DIR={path_to_vortex}/hw/syn/xilinx/xrt/{fpga_data.dirpref}_{fpga_data.platform}_{fpga_data.sim}/bin TARGET={fpga_data.sim} PLATFORM={fpga_data.platform}" + command = f"cd {path_to_vortex}/build && {fpga_pref} ./ci/blackbox.sh {perf} --driver={run_params.driver} --app={run_params.kernel} --args={run_args}" + print(command) + result = subprocess.run(f"{command} >> {path_to_output_file}", shell=True) + return result.returncode + +def collect(run_params: run, path_to_output_file: str) -> pd.DataFrame: + with open(path_to_output_file, 'r') as file: + lines = file.readlines() + error_message = "" + perf_dict = {} + + # matches all string currently starting with "PERF", such as + # "core0: lmem reads=2134241" and "instrs=123, cycles=123, IPC=1.0" + pattern = r"(?:core\d+: )?([a-zA-Z0-9\.\-_+]+(?: [a-zA-Z0-9\.\-_+]+)*)=(\d+\.?\d*)" + + for line in lines: + if line.startswith("PERF:"): + parts = line.split(',') + for part in parts: + matches = re.findall(pattern, part) + for key, value in matches: + perf_dict[key] = float(value) + elif line.startswith("Elapsed time:"): + matches = re.findall(r'(\S+)\s*[:]\s*(\d+)', line) # matches "time: 1234 ms" + for key, value in matches: + perf_dict[key] = float(value) + # check for errors + if "FAILED" in line: + error_message = error_verification(run_params, line[line.find("FAILED! - "):]) + if "Error" in line: + error_message = error_running(run_params, line[line.find("Error:"):]) + + # parse string with perf statistic of running kernel + if perf_dict["cycles"] <= 0: + error_message = error_running(run_params, "Invalid number of cycles") + # (ADD MORE IF NEEDED) + # write result to data frame + run_result = pd.DataFrame([{"kernel": run_params.kernel[-1], "driver": run_params.driver, "cores": run_params.arch.cores, + "warps": run_params.arch.warps, "threads": run_params.arch.threads, "n": run_params.msize, + "instrs": perf_dict["instrs"], "cycles": perf_dict["cycles"], + "IPC": perf_dict["IPC"], "lmem reads": perf_dict["lmem reads"], "lmem writes": perf_dict["lmem writes"], + "local memory requests": perf_dict["lmem reads"] + perf_dict["lmem writes"], "global memory requests": perf_dict["memory requests"], + "time": perf_dict["time"],"error": error_message}]) + return run_result + +def draw(data_frame: pd.DataFrame, x_label: str, y: str, y_label: str, title: str, path: str): + data_frame.plot(kind = "bar", x = x_label, y = y) + plt.title(title) + plt.xlabel(x_label) + plt.ylabel(y_label) + plt.savefig(path) + +def check_time_stats(data_frame: pd.DataFrame) -> int: + t = data_frame["time"].tolist() + plt.hist(t) + _, pval1 = stats.normaltest(t) + _, pval2 = stats.shapiro(t) + mean = np.mean(t) + instr_error = 1 + error = stats.sem(t) + instr_error + std = np.std(t, ddof=1) + + print(f"Mean is {mean}") + print(f"Error is {error}") + print(f"Standard deviation is {std}") + if pval1 <= 0.05: + print(f"P-value in D'Agostino and Pearson's test is {pval1}, which is less than 0.05") + return -1 + if pval2 <= 0.05: + print(f"P-value in Shapiro-Wilk test is {pval2}, which is less than 0.05") + return -1 + if std / mean > 0.05 * mean: + print("Standard deviation is greater than 5 percents of mean") + return -1 + + return 0 + + +if len(sys.argv) > 1 and (sys.argv[1] == "xrt" or sys.argv[1] == "fpga"): + drivers = ["xrt"] +else: + drivers = ["simx", "rtlsim"] + +TILESIZE = 4 +WORKPERTHREAD = 4 +WIDTH = 4 +TESTSNUM = 100 + +# create common.h files for each kernel +params1 = { + tests_num: TESTSNUM, + tile_size: TILESIZE +} +create_common_h(params1, "kernel1") +create_common_h(params1, "kernel2") + +params3 = { + tests_num: TESTSNUM, + tile_size: TILESIZE, + work_per_thread: WORKPERTHREAD +} +create_common_h(params3, "kernel3") + +params4 = { + tests_num: TESTSNUM, + tile_size: TILESIZE, + width: WIDTH +} +create_common_h(params4, "kernel4") + +kernels = ["kernel1", "kernel2", "kernel3"] + +experiment_dir = f"{path_to_vortex}/for_experiments" +output_dir = f"{experiment_dir}/outputs" +graphics_dir = f"{experiment_dir}/graphics" +stats1 = ["local memory requests", "global memory requests"] +stats2 = "IPC" +stats3 = "time" + +mat_sizes = [32, 128] # square matrix sizes +THREADS = 16 +WARPS = 2 +CORES = 2 +# 0 for nothing, 1 for cores info (stalls, fetches etc), 2 for memory info (lmem reads/writes etc) +PERFTYPE = 2 +fpga_d = fpga_data() + +for n, W in zip(mat_sizes, WARPS): + for driver in drivers: + driver_dfs = [] + for kernel in kernels: + if kernel == "kernel3": + T = int(THREADS / WORKPERTHREAD) + elif kernel == "kernel4": + T = int(THREADS / WIDTH) + else: + T = THREADS + + C = CORES + arch_p = arch(threads=T, cores=C, warps=W) + run_p = run(arch_p, kernel=kernel, driver=driver, msize=n, perf=PERFTYPE, fpga_data=fpga_d) + + # run kernel + output_file = f"{output_dir}/output_{driver}_n{n}_{kernel}_TS{TILESIZE}_WPT{WORKPERTHREAD}_WID{WIDTH}_t{THREADS}w{W}_c{CORES}.txt" + open(output_file, 'w').close() + ret = runtest(run_p, output_file) + if ret: + sys.exit("Error occured when running latest command") + # collect kernel statistics + kernel_df = collect(run_p, output_file) + driver_dfs.append(kernel_df) + + # put different kernel statistics into one data frame + df = pd.concat(driver_dfs, ignore_index=True) + if driver == "simx": + sim_type = "Cycle-approximate simulation" + elif driver == "rtlsim": + sim_type = "RTL simulation" + elif driver == "xrt": + sim_type == "FPGA" + ret = check_time_stats(df) + if ret: + print("Normality tests haven't passed") + + # draw graphs based on the recived statistic + draw(df, "kernel", stats1, "Memory requests", f"Number of memory requests, {sim_type}", + f"{graphics_dir}/mem_graph_{driver}_n{n}_TS{TILESIZE}_WPT{WORKPERTHREAD}_WID{WIDTH}_t{THREADS}_w{W}_c{CORES}.png") + draw(df, "kernel", stats2, "", f"Instructions per cycle, {sim_type}", + f"{graphics_dir}/ipc_graph_{driver}_n{n}_TS{TILESIZE}_WPT{WORKPERTHREAD}_WID{WIDTH}_t{THREADS}_w{W}_c{CORES}.png") + if driver == "xrt": + draw(df, "kernel", stats3, "", f"Elapsed time, {sim_type}", + f"{graphics_dir}/time_graph_{driver}_n{n}_TS{TILESIZE}_WPT{WORKPERTHREAD}_WID{WIDTH}_t{THREADS}_w{W}_c{CORES}.png") diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index c61b1d5e9..6b0549019 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -256,6 +256,10 @@ `define LMEM_BASE_ADDR `STACK_BASE_ADDR `endif +`ifndef NEW_STACK_BASE_ADDR +`define NEW_STACK_BASE_ADDR `LMEM_BASE_ADDR + (`XLEN'(1 << `LMEM_LOG_SIZE)) +`endif + `ifndef IO_COUT_ADDR `define IO_COUT_ADDR `IO_BASE_ADDR `endif diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index e60cd6ec7..f6d582fa1 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -22,6 +22,10 @@ all: $(MAKE) -C kmeans $(MAKE) -C blackscholes $(MAKE) -C bfs + $(MAKE) -C kernel1 + $(MAKE) -C kernel2 + $(MAKE) -C kernel3 + $(MAKE) -C kernel4 run-simx: $(MAKE) -C vecadd run-simx @@ -43,6 +47,10 @@ run-simx: $(MAKE) -C kmeans run-simx $(MAKE) -C blackscholes run-simx $(MAKE) -C bfs run-simx + $(MAKE) -C kernel1 run-simx + $(MAKE) -C kernel2 run-simx + $(MAKE) -C kernel3 run-simx + $(MAKE) -C kernel4 run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -64,6 +72,10 @@ run-rtlsim: $(MAKE) -C kmeans run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C bfs run-rtlsim + $(MAKE) -C kernel1 run-rtlsim + $(MAKE) -C kernel2 run-rtlsim + $(MAKE) -C kernel3 run-rtlsim + $(MAKE) -C kernel4 run-rtlsim clean: $(MAKE) -C vecadd clean @@ -85,4 +97,8 @@ clean: $(MAKE) -C guassian clean $(MAKE) -C kmeans clean $(MAKE) -C blackscholes clean - $(MAKE) -C bfs clean \ No newline at end of file + $(MAKE) -C bfs clean + $(MAKE) -C kernel1 clean + $(MAKE) -C kernel2 clean + $(MAKE) -C kernel3 clean + $(MAKE) -C kernel4 clean diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index bb7b1e0d6..7fa69ddf5 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -32,6 +32,9 @@ VX_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_HOME)/kernel/include -DXLEN_$(XLEN) -D VX_CFLAGS += -Xclang -target-feature -Xclang +vortex VX_CFLAGS += -Xclang -target-feature -Xclang +zicond VX_CFLAGS += -mllvm -disable-loop-idiom-all + +VX_CFLAGS += -save-temps + #VX_CFLAGS += -mllvm -vortex-branch-divergence=0 #VX_CFLAGS += -mllvm -print-after-all diff --git a/tests/opencl/kernel1/Makefile b/tests/opencl/kernel1/Makefile new file mode 100644 index 000000000..4ed15d16e --- /dev/null +++ b/tests/opencl/kernel1/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel1 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk + diff --git a/tests/opencl/kernel1/common.h b/tests/opencl/kernel1/common.h new file mode 100644 index 000000000..5fec4ff92 --- /dev/null +++ b/tests/opencl/kernel1/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TESTS_NUM 100 + +#define TS 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel1/kernel.cl b/tests/opencl/kernel1/kernel.cl new file mode 100644 index 000000000..e5ebd4958 --- /dev/null +++ b/tests/opencl/kernel1/kernel.cl @@ -0,0 +1,18 @@ +__kernel void myGEMM1(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + // Thread identifiers + const int globalRow = get_global_id(0); // Row ID of C (0..M) + const int globalCol = get_global_id(1); // Col ID of C (0..N) + + // Compute a single element (loop over K) + float acc = 0.0f; + for (int k=0; k +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 0, N = 0, K = 0; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2 || TS < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM1", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // calculate matrices on CPU for later checks + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory for sgemm on CPU"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + + // double times[TESTS_NUM] = {0}; + const size_t local[2] = {TS, TS}; + const size_t global[2] = {M, N}; + int errors = 0; + + // run kernel TESTS_NUM times and verify results + for (int i = 0; i < TESTS_NUM; i++) { + errors = 0; + + printf("Execute the kernel, iteration %d\n", i); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + // CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + // times[i] = elapsed; + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify results\n"); + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + } + + // free resources + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} diff --git a/tests/opencl/kernel2/Makefile b/tests/opencl/kernel2/Makefile new file mode 100644 index 000000000..fa9b7f379 --- /dev/null +++ b/tests/opencl/kernel2/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel2 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel2/common.h b/tests/opencl/kernel2/common.h new file mode 100644 index 000000000..5fec4ff92 --- /dev/null +++ b/tests/opencl/kernel2/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TESTS_NUM 100 + +#define TS 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel2/kernel.cl b/tests/opencl/kernel2/kernel.cl new file mode 100644 index 000000000..692318cde --- /dev/null +++ b/tests/opencl/kernel2/kernel.cl @@ -0,0 +1,46 @@ +#include "common.h" + +__kernel void myGEMM2(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS) + const int col = get_local_id(1); // Local col ID (max: TS) + const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local float Asub[TS][TS]; + __local float Bsub[TS][TS]; + + // Initialise the accumulation register + float acc = 0.0f; + + // Loop over all tiles + const int numTiles = K/TS; + for (int t=0; t +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2 || TS < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM2", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // calculate matrices on CPU for later checks + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory for sgemm on CPU"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + + // double times[TESTS_NUM] = {0}; + const size_t local[2] = {TS, TS}; + const size_t global[2] = {M, N}; + int errors = 0; + + // run kernel TESTS_NUM times and verify results + for (int i = 0; i < TESTS_NUM; i++) { + errors = 0; + + printf("Execute the kernel, iteration %d\n", i); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + // CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + // times[i] = elapsed; + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify results\n"); + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + } + + // free resources + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} \ No newline at end of file diff --git a/tests/opencl/kernel3/Makefile b/tests/opencl/kernel3/Makefile new file mode 100644 index 000000000..b7e0c3133 --- /dev/null +++ b/tests/opencl/kernel3/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel3 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel3/common.h b/tests/opencl/kernel3/common.h new file mode 100644 index 000000000..1f0f29f98 --- /dev/null +++ b/tests/opencl/kernel3/common.h @@ -0,0 +1,10 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TESTS_NUM 100 + +#define TS 4 +#define WPT 4 +#define RTS (TS/WPT) + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel3/kernel.cl b/tests/opencl/kernel3/kernel.cl new file mode 100644 index 000000000..fbc30034b --- /dev/null +++ b/tests/opencl/kernel3/kernel.cl @@ -0,0 +1,55 @@ +#include "common.h" + +__kernel void myGEMM3(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS) + const int col = get_local_id(1); // Local col ID (max: TS/WPT == RTS) + const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local float Asub[TS][TS]; + __local float Bsub[TS][TS]; + + // Initialise the accumulation registers + float acc[WPT]; + for (int w=0; w +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM3", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // calculate matrices on CPU for later checks + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory for sgemm on CPU"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + + // double times[TESTS_NUM] = {0}; + const size_t local[2] = {TS, TS/WPT}; + const size_t global[2] = {M, N/WPT}; + int errors = 0; + + // run kernel TESTS_NUM times and verify results + for (int i = 0; i < TESTS_NUM; i++) { + errors = 0; + + printf("Execute the kernel, iteration %d\n", i); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + // CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + // times[i] = elapsed; + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify results\n"); + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + } + + // free resources + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} diff --git a/tests/opencl/kernel4/Makefile b/tests/opencl/kernel4/Makefile new file mode 100644 index 000000000..312ec638d --- /dev/null +++ b/tests/opencl/kernel4/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel4 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel4/common.h b/tests/opencl/kernel4/common.h new file mode 100644 index 000000000..ab51ec914 --- /dev/null +++ b/tests/opencl/kernel4/common.h @@ -0,0 +1,9 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TESTS_NUM 100 + +#define TS 4 +#define WIDTH 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel4/kernel.cl b/tests/opencl/kernel4/kernel.cl new file mode 100644 index 000000000..179789d97 --- /dev/null +++ b/tests/opencl/kernel4/kernel.cl @@ -0,0 +1,110 @@ +#include "common.h" + +#if WIDTH == 1 + typedef float floatX; +#elif WIDTH == 2 + typedef float2 floatX; +#elif WIDTH == 4 + typedef float4 floatX; +#elif WIDTH == 8 + typedef float8 floatX; +#endif + +__kernel void myGEMM4(const int M, const int N, const int K, + const __global floatX* A, + const __global floatX* B, + __global floatX* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS/WIDTH) + const int col = get_local_id(1); // Local col ID (max: TS) + const int globalRow = (TS/WIDTH)*get_group_id(0) + row; // Row ID of C (0..M/WIDTH) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local floatX Asub[TS][TS/WIDTH]; + __local floatX Bsub[TS][TS/WIDTH]; + + // Initialise the accumulation registers + #if WIDTH == 1 + floatX acc = 0.0f; + #elif WIDTH == 2 + floatX acc = { 0.0f, 0.0f }; + #elif WIDTH == 4 + floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f }; + #elif WIDTH == 8 + floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + #endif + + // Loop over all tiles + const int numTiles = K/TS; + for (int tile=0; tile +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM4", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // calculate matrices on CPU for later checks + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory for sgemm on CPU"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + + // double times[TESTS_NUM] = {0}; + const size_t local[2] = {TS, TS}; + const size_t global[2] = {M, N}; + int errors = 0; + + // run kernel TESTS_NUM times and verify results + for (int i = 0; i < TESTS_NUM; i++) { + errors = 0; + + printf("Execute the kernel, iteration %d\n", i); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + // CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + // times[i] = elapsed; + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify results\n"); + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + } + + // free resources + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +}