diff --git a/experimental/kernels/Makefile b/experimental/kernels/Makefile
index 5817e23..0644955 100644
--- a/experimental/kernels/Makefile
+++ b/experimental/kernels/Makefile
@@ -29,6 +29,10 @@ endif
 default: run-native
 
+build/reduce: reduce.cpp kernels.h
+	$(CC) $(CFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $<
+	$(LIBSPEC) && build/reduce
+
 run_llm.c: ./build/test_gpt2 dawnlib
 	$(LIBSPEC) && $<
diff --git a/experimental/kernels/kernels.h b/experimental/kernels/kernels.h
index 1bce081..62a461e 100644
--- a/experimental/kernels/kernels.h
+++ b/experimental/kernels/kernels.h
@@ -781,6 +781,78 @@ fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
 }
 )";
 
+static const char *kSum = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+var<workgroup> buffer: array<{{precision}}, 1024>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(global_invocation_id) globalID : vec3<u32>,
+    @builtin(local_invocation_id) localID : vec3<u32>,
+    @builtin(workgroup_id) groupid : vec3<u32>,
+    @builtin(num_workgroups) numGroups : vec3<u32>) {
+  let blockSize3d: vec3<u32> = vec3<u32>({{workgroupSize}});
+  let blockSize: u32 = blockSize3d.x;
+  let threadId: u32 = localID.x;
+  let blockId: u32 = groupid.x + groupid.y * numGroups.x;
+  // Each workgroup reduces 2 * blockSize consecutive inputs into out[blockId].
+  let blockStart = blockId * blockSize * 2 + threadId;
+
+  buffer[threadId] = inp[blockStart] + inp[blockStart + blockSize];
+  workgroupBarrier();
+
+  // Fully unrolled tree reduction in workgroup memory.
+  if (blockSize >= 1024 && threadId < 512) {
+    buffer[threadId] += buffer[threadId + 512];
+  }
+  workgroupBarrier();
+
+  if (blockSize >= 512 && threadId < 256) {
+    buffer[threadId] += buffer[threadId + 256];
+  }
+  workgroupBarrier();
+
+  if (blockSize >= 256 && threadId < 128) {
+    buffer[threadId] += buffer[threadId + 128];
+  }
+  workgroupBarrier();
+
+  if (threadId < 64) {
+    buffer[threadId] += buffer[threadId + 64];
+  }
+  workgroupBarrier();
+
+  if (threadId < 32) {
+    buffer[threadId] += buffer[threadId + 32];
+  }
+  workgroupBarrier();
+
+  if (threadId < 16) {
+    buffer[threadId] += buffer[threadId + 16];
+  }
+  workgroupBarrier();
+
+  if (threadId < 8) {
+    buffer[threadId] += buffer[threadId + 8];
+  }
+  workgroupBarrier();
+
+  if (threadId < 4) {
+    buffer[threadId] += buffer[threadId + 4];
+  }
+  workgroupBarrier();
+
+  if (threadId < 2) {
+    buffer[threadId] += buffer[threadId + 2];
+  }
+  workgroupBarrier();
+
+  if (threadId == 0) {
+    buffer[0] += buffer[1];
+    out[blockId] = buffer[0];
+  }
+}
+)";
+
 } // namespace gpu
 
 #endif // KERNELS_H
diff --git a/experimental/kernels/reduce.cpp b/experimental/kernels/reduce.cpp
new file mode 100644
index 0000000..38cb6a7
--- /dev/null
+++ b/experimental/kernels/reduce.cpp
@@ -0,0 +1,514 @@
+#include "gpu.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <future>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+#include "utils/array_utils.hpp" // show, isclose, randn, randint
+#include "kernels.h"
+
+using namespace gpu;
+
+#define LIMITS { \
+    .nextInChain = nullptr, \
+    .limits = { \
+      .maxTextureDimension1D=8192, \
+      .maxTextureDimension2D=8192, \
+      .maxTextureDimension3D=2048, \
+      .maxTextureArrayLayers=256, \
+      .maxBindGroups=4, \
+      .maxBindGroupsPlusVertexBuffers=24, \
+      .maxBindingsPerBindGroup=1000, \
+      .maxDynamicUniformBuffersPerPipelineLayout=8, \
+      .maxDynamicStorageBuffersPerPipelineLayout=4, \
+      .maxSampledTexturesPerShaderStage=16, \
+      .maxSamplersPerShaderStage=16, \
+      .maxStorageBuffersPerShaderStage=8, \
+      .maxStorageTexturesPerShaderStage=4, \
+      .maxUniformBuffersPerShaderStage=12, \
+      .maxUniformBufferBindingSize=65536, \
+      .maxStorageBufferBindingSize=1073741824, \
+      .minUniformBufferOffsetAlignment=256, \
+      .minStorageBufferOffsetAlignment=256, \
+      .maxVertexBuffers=8, \
+      .maxBufferSize=0x80000000, \
+      .maxVertexAttributes=16, \
+      .maxVertexBufferArrayStride=2048, \
+      .maxInterStageShaderComponents=64, \
+      .maxInterStageShaderVariables=16, \
+      .maxColorAttachments=8, \
+      .maxColorAttachmentBytesPerSample=32, \
+      .maxComputeWorkgroupStorageSize=16384, \
+      .maxComputeInvocationsPerWorkgroup=1024, \
+      .maxComputeWorkgroupSizeX=1024, \
+      .maxComputeWorkgroupSizeY=1024, \
+      .maxComputeWorkgroupSizeZ=64, \
+      .maxComputeWorkgroupsPerDimension=65535 \
+    } \
+  }
+
+
+struct DurationTime {
+  std::chrono::high_resolution_clock::time_point start;
+  std::chrono::high_resolution_clock::time_point end;
+  std::chrono::microseconds duration;
+  std::string src;
+  bool verbose;
+  int num;
+
+  inline DurationTime(const std::string& src, bool verbose = true, int num = 1) {
+    this->src = src;
+    this->verbose = verbose;
+    this->num = num;
+    start = std::chrono::high_resolution_clock::now();
+  }
+
+  inline ~DurationTime() {
+    end = std::chrono::high_resolution_clock::now();
+    duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    if (this->verbose) {
+      printf("Duration(%s): %.1f microseconds\n", src.c_str(), static_cast<double>(duration.count()) / static_cast<double>(num));
+    }
+  }
+};
+
+static const char *kSumVersion1 = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+var<workgroup> buffer: array<{{precision}}, 1024>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(local_invocation_id) localID : vec3<u32>,
+    @builtin(workgroup_id) groupid : vec3<u32>,
+    @builtin(num_workgroups) numGroups : vec3<u32>) {
+  let blockSize3d: vec3<u32> = vec3<u32>({{workgroupSize}});
+  let blockSize: u32 = blockSize3d.x;
+  let threadId: u32 = localID.x;
+  let blockId: u32 = groupid.x + groupid.y * numGroups.x;
+  let blockStart = blockId * blockSize * 2 + threadId;
+
+  buffer[threadId] = inp[blockStart] + inp[blockStart + blockSize];
+  workgroupBarrier();
+
+  for (var stride: u32 = blockSize / 2; stride > 0; stride /= 2) {
+    if (threadId < stride) {
+      buffer[threadId] += buffer[threadId + stride];
+    }
+    workgroupBarrier();
+  }
+
+  if (threadId == 0) {
+    out[blockId] = buffer[0];
+  }
+}
+)";
+
+static const char *kSumVersion2 = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+var<workgroup> buffer: array<{{precision}}, 1024>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(global_invocation_id) globalID : vec3<u32>,
+    @builtin(local_invocation_id) localID : vec3<u32>,
+    @builtin(workgroup_id) groupid : vec3<u32>,
+    @builtin(num_workgroups) numGroups : vec3<u32>) {
+  let blockSize3d: vec3<u32> = vec3<u32>({{workgroupSize}});
+  let blockSize: u32 = blockSize3d.x;
+  let threadId: u32 = localID.x;
+  let blockId: u32 = groupid.x + groupid.y * numGroups.x;
+  let n: u32 = arrayLength(&inp);
+  let blockStart = blockId * blockSize * 2 + threadId;
+
+  buffer[threadId] = inp[blockStart] + inp[blockStart + blockSize];
+  workgroupBarrier();
+  var stride: u32 = blockSize / 2;
+
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/4
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/8
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/16
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/32
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/64
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/128
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/256
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/512
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+  workgroupBarrier();
+
+  stride /= 2; // 1/1024
+  if (threadId < stride) {
+    buffer[threadId] += buffer[threadId + stride];
+  }
+
+  if (threadId == 0) {
+    out[blockId] = buffer[0];
+  }
+}
+)";
+
+static const char *kSum2d = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@group(0) @binding(2) var<uniform> params : Params;
+struct Params {
+  N: u32,
+  C: u32,
+};
+var<workgroup> buffer: array<{{precision}}, 1024>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(
+    @builtin(local_invocation_id) localID : vec3<u32>,
+    @builtin(workgroup_id) groupid : vec3<u32>,
+    @builtin(num_workgroups) numGroups : vec3<u32>) {
+  let N : u32 = params.N;
+  let C : u32 = params.C;
+  let blockSize3d: vec3<u32> = vec3<u32>({{workgroupSize}});
+  let blockSize: u32 = blockSize3d.x;
+  let threadId: u32 = localID.x;
+  let blockId: u32 = groupid.x + groupid.y * numGroups.x;
+
+  // Reduce over the N (row) dimension independently for each of the C columns.
+  for (var i: u32 = 0; i < C; i++) {
+    let blockStart = blockId * blockSize * 2 + threadId;
+    if (blockStart >= N) {
+      buffer[threadId] = 0;
+    } else if (blockStart + blockSize >= N) {
+      buffer[threadId] = inp[blockStart * C + i];
+    } else {
+      buffer[threadId] = inp[blockStart * C + i] + inp[(blockStart + blockSize) * C + i];
+    }
+    workgroupBarrier();
+
+    for (var stride: u32 = blockSize / 2; stride > 0; stride /= 2) {
+      if (threadId < stride) {
+        buffer[threadId] += buffer[threadId + stride];
+      }
+      workgroupBarrier();
+    }
+
+    if (threadId == 0) {
+      out[blockId * C + i] = buffer[0];
+    }
+    workgroupBarrier();
+  }
+}
+)";
+
+float sum_cpu(const float* data, size_t size) {
+  float result = 0;
+  for (size_t i = 0; i < size; ++i) {
+    result += data[i];
+  }
+  return result;
+}
+
+void sum_cpu_2d(const float* data, float* out, size_t size0, size_t size1) {
+  for (size_t j = 0; j < size1; ++j) {
+    out[j] = 0;
+  }
+  for (size_t i = 0; i < size0; ++i) {
+    for (size_t j = 0; j < size1; ++j) {
+      out[j] += data[(i * size1) + j];
+    }
+  }
+}
+
+Kernel createSumKernel(Context& ctx, Tensor& input, Tensor& output, size_t size, uint32_t num_threads = 1024) {
+  uint32_t num_blocks = ((size + num_threads - 1) / num_threads);
+  uint32_t size_x = 32768u < num_blocks ? 32768u : num_blocks;
+  uint32_t size_y = size_x == 32768u ? num_blocks / 32768u : 1;
+  size_x /= 2; // each workgroup consumes 2 * num_threads inputs
+  size_x = size_x < 1 ? 1 : size_x;
+  printf("size_x: %u, size_y: %u, num_blocks: %u\n", size_x, size_y, num_blocks);
+  return createKernel(ctx, {kSum, num_threads, kf32}, Bindings{input, output}, {size_x, size_y, 1});
+}
+
+Kernel createSumKernel2d(Context& ctx, Tensor& input, Tensor& output, size_t size0, size_t size1, uint32_t num_threads = 1024) {
+  struct Params {
+    uint32_t N;
+    uint32_t C;
+  };
+  uint32_t num_blocks = ((size0 + num_threads - 1) / num_threads);
+  uint32_t size_x = num_blocks;
+  uint32_t size_y = size1;
+  size_x /= 2;
+  size_x = size_x < 1 ? 1 : size_x;
+  printf("size_x: %u, size_y: %u, num_blocks: %u\n", size_x, size_y, num_blocks);
+  return createKernel(ctx,
+                      {kSum2d, num_threads, kf32},
+                      Bindings{input, output},
+                      {size_x, size_y, 1},
+                      Params{
+                        static_cast<uint32_t>(size0),
+                        static_cast<uint32_t>(size1),
+                      });
+}
+
+struct SumKernel {
+  std::vector<Tensor> outputs;
+  std::vector<Kernel> ops;
+  SumKernel(Context& ctx, size_t size, uint32_t num_threads = 1024) {
+    int input_size = size;
+    unsigned long output_size = size;
+    outputs.push_back(createTensor(ctx, Shape{std::max(size, static_cast<size_t>(num_threads*2))}, kf32));
+    // Chain reduction stages until a single partial sum remains.
+    for(int j=0;output_size>1;j++){
+      output_size = (output_size + (num_threads * 2) - 1) / (num_threads * 2);
+      outputs.push_back(createTensor(ctx, Shape{std::max(output_size, static_cast<unsigned long>(num_threads*2))}, kf32));
+      ops.push_back(createSumKernel(ctx, outputs[j], outputs[j+1], input_size, num_threads));
+      input_size = output_size;
+    }
+  }
+  void dispatchKernel(Context& ctx) {
+    for(int i=0;i<ops.size();i++){
+      std::promise<void> promise;
+      std::future<void> future = promise.get_future();
+      gpu::dispatchKernel(ctx, ops[i], promise);
+      wait(ctx, future);
+      resetCommandBuffer(ctx.device, ops[i]);
+    }
+  }
+  void toGPU(Context& ctx, const float* data, size_t size) {
+    gpu::toGPU(ctx, data, outputs[0], size);
+  }
+  void toCPU(Context& ctx, float* data, size_t size) {
+    gpu::toCPU(ctx, outputs[outputs.size()-1], data, size);
+  }
+};
+
+struct SumKernel2d {
+  std::vector<Tensor> outputs;
+  std::vector<Kernel> ops;
+  bool debug;
+  SumKernel2d(Context& ctx, size_t size0, size_t size1, uint32_t num_threads = 1024) {
+    debug = false;
+    int input_size = size0;
+    unsigned long output_size = size0;
+    outputs.push_back(createTensor(ctx, Shape{std::max(size0, static_cast<size_t>(num_threads*2)), size1}, kf32));
+    for(int j=0;output_size>1;j++){
+      output_size = (output_size + (num_threads * 2) - 1) / (num_threads * 2);
+      if (debug)
+        printf("size0: %zu, num_threads: %u, output_size: %lu\n", size0, num_threads, output_size);
+      outputs.push_back(createTensor(ctx, Shape{std::max(output_size, static_cast<unsigned long>(num_threads*2)), size1}, kf32));
+      ops.push_back(createSumKernel2d(ctx, outputs[j], outputs[j+1], input_size, size1, num_threads));
+      input_size = output_size;
+    }
+    if (debug)
+      printf("ops.size(): %zu\n", ops.size());
+  }
+  void dispatchKernel(Context& ctx) {
+    for(int i=0;i<ops.size();i++){
+      std::promise<void> promise;
+      std::future<void> future = promise.get_future();
+      gpu::dispatchKernel(ctx, ops[i], promise);
+      wait(ctx, future);
+      resetCommandBuffer(ctx.device, ops[i]);
+    }
+    if (debug) {
+      // Peek at the first few values of each intermediate output.
+      std::unique_ptr<float[]> buffer = std::make_unique<float[]>(8);
+      for(int i=0;i<outputs.size();i++){
+        gpu::toCPU(ctx, outputs[i], buffer.get(), 8 * sizeof(float));
+        printf("outputs[%d]: %f %f %f %f\n", i, buffer[0], buffer[1], buffer[2], buffer[3]);
+      }
+    }
+  }
+  void toGPU(Context& ctx, const float* data, size_t size) {
+    gpu::toGPU(ctx, data, outputs[0], size);
+  }
+  void toCPU(Context& ctx, float* data, size_t size) {
+    gpu::toCPU(ctx, outputs[outputs.size()-1], data, size);
+  }
+};
+
+int main_1d(int argc, char **argv) {
+  // Problem size for the 1-d reduction test; BUF_SIZE is the size of the readback buffer.
+  static constexpr size_t M = 4096;
+  static constexpr size_t N = 1024;
+  static constexpr size_t BUF_SIZE = 2048;
+  std::unique_ptr<float[]> inputArr = std::make_unique<float[]>(M * N);
+  std::unique_ptr<float[]> buffer = std::make_unique<float[]>(BUF_SIZE);
+  std::mt19937 gen(314159);
+  printf("Initializing %zu values\n", M*N);
+  randn(inputArr.get(), M*N, gen);
+  // for(int i=0;i<M*N;i++){ inputArr[i] = 1.0; } // uncomment for an easy-to-verify input
+
+  WGPURequiredLimits requiredLimits = LIMITS;
+  gpu::Context ctx = gpu::createContext({}, {}, {
+      .requiredLimits = &requiredLimits
+  });
+
+  SumKernel sumKernel(ctx, M * N);
+  sumKernel.toGPU(ctx, inputArr.get(), M * N * sizeof(float));
+
+  float cpuResult;
+  {
+    DurationTime timer("sum_cpu");
+    cpuResult = sum_cpu(inputArr.get(), M * N);
+  }
+
+  float gpuResult;
+  {
+    DurationTime timer("sum_gpu");
+    sumKernel.dispatchKernel(ctx);
+    sumKernel.toCPU(ctx, buffer.get(), BUF_SIZE * sizeof(float));
+    gpuResult = buffer[0];
+  }
+
+  float diff = std::abs(cpuResult - gpuResult);
+  if (diff >= 1e-0f) {
+    printf("Error: diff = %.6f\n", diff);
+  } else {
+    printf("Success: diff = %.6f\n", diff);
+  }
+
+  printf("Computed %zu values of kSum(x)\n\n", M*N);
+  return 0;
+}
+
+int main_2d(int argc, char **argv) {
+  static constexpr size_t M = 4096;
+  static constexpr size_t N = 4096;
+  std::unique_ptr<float[]> inputArr = std::make_unique<float[]>(M * N);
+  std::unique_ptr<float[]> outputCpuArr = std::make_unique<float[]>(N);
+  std::unique_ptr<float[]> outputGpuArr = std::make_unique<float[]>(N);
+  std::mt19937 gen(314159);
+  printf("Initializing %zu values\n", M*N);
+  randn(inputArr.get(), M*N, gen);
+  for(int i=0;i<N;i++){
+    outputCpuArr[i] = 0;
+    outputGpuArr[i] = 0;
+  }
+
+  WGPURequiredLimits requiredLimits = LIMITS;
+  gpu::Context ctx = gpu::createContext({}, {}, {
+      .requiredLimits = &requiredLimits
+  });
+
+  SumKernel2d sumKernel(ctx, M, N);
+  sumKernel.toGPU(ctx, inputArr.get(), M * N * sizeof(float));
+
+  {
+    DurationTime timer("sum_cpu_2d");
+    sum_cpu_2d(inputArr.get(), outputCpuArr.get(), M, N);
+  }
+  {
+    DurationTime timer("sum_gpu_2d");
+    sumKernel.dispatchKernel(ctx);
+    sumKernel.toCPU(ctx, outputGpuArr.get(), N * sizeof(float));
+  }
+
+  float diff = 0;
+  for (size_t j = 0; j < N; ++j) {
+    diff = std::max(diff, std::abs(outputCpuArr[j] - outputGpuArr[j]));
+  }
+  if (diff >= 1e-0f) {
+    printf("Error: diff = %.6f\n", diff);
+  } else {
+    printf("Success: diff = %.6f\n", diff);
+  }
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  printf("================================\n");
+  printf("Start testing reduce-1d\n");
+  main_1d(argc,argv);
+  printf("================================\n");
+  printf("Start testing reduce-2d\n");
+  main_2d(argc,argv);
+  return 0;
+}
diff --git a/gpu.hpp b/gpu.hpp
index 941656e..b1dd1cb 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -1187,6 +1187,18 @@ inline void toGPU(Context &ctx, const int *data, Tensor &tensor) {
   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, tensor.data.size);
 }
 
+
+inline void toGPU(Context &ctx, const float *data, Tensor &tensor, size_t size) {
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+}
+
+inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) {
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+}
+
+inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) {
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+}
 template <typename Params>
 inline void toGPU(Context &ctx, Params &params, Kernel &op) {
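Usage sketch (illustrative, not part of the patch): the snippet below shows one way the new SumKernel helper could be driven, assuming it is pulled out of reduce.cpp into a shared header so it can be reused; sum_example.cpp, kN, host, and result are made-up names. It passes num_threads = 256 so it runs under the default device limits (a default WebGPU device only guarantees 256 invocations per workgroup), whereas reduce.cpp asks for 1024-wide workgroups by requesting the raised limits in the LIMITS macro.

// sum_example.cpp: hypothetical driver for the SumKernel helper.
#include "gpu.hpp"
#include <cstdio>
#include <vector>

using namespace gpu;

int main() {
  constexpr size_t kN = 1u << 20;          // 1M elements; the exact sum (kN) is representable in f32
  std::vector<float> host(kN, 1.0f);       // all-ones input, so the expected sum is kN

  Context ctx = createContext();           // default limits: at most 256 invocations per workgroup
  SumKernel sum(ctx, kN, /*num_threads=*/256);
  sum.toGPU(ctx, host.data(), kN * sizeof(float));
  sum.dispatchKernel(ctx);                 // runs each kSum stage and waits for it to finish

  float result = 0.0f;
  sum.toCPU(ctx, &result, sizeof(float));  // element 0 of the last stage holds the total
  printf("sum = %.1f (expected %zu)\n", result, kN);
  return 0;
}

Each createSumKernel stage shrinks the problem by a factor of 2 * num_threads (every thread loads two inputs and the workgroup tree-reduces them in shared memory), so SumKernel's constructor keeps chaining stages until a single partial sum remains, and dispatchKernel simply walks that chain; no separate final pass is needed.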