From f032688b6c02a9ac93986aac3e6848102c47f1a9 Mon Sep 17 00:00:00 2001 From: statusfailed Date: Thu, 5 Mar 2026 01:48:47 +0000 Subject: [PATCH 1/6] Updated kernarg struct to match AMDHSA alignment - Added uint32_t _pad0 after float a. - Added uint32_t _pad1 after uint32_t n. - Initialized the new padding fields to 0: - Kept existing logical arguments unchanged (y, x, a, n), but fixed their byte layout so: - n lands at byte offset 24. - total copied kernarg size is 32 bytes (matching runtime-reported kernarg=32). --- examples/launch_saxpy.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/launch_saxpy.c b/examples/launch_saxpy.c index c255f12..9870c21 100644 --- a/examples/launch_saxpy.c +++ b/examples/launch_saxpy.c @@ -34,7 +34,7 @@ int main(int argc, char *argv[]) return 1; } - /* kernarg layout: {float *y, float *x, float a, int n} = 24 bytes */ + /* kernarg layout follows AMDHSA alignment; keep n at byte offset 24. */ bc_kernel_t kern; rc = bc_load_kernel(&dev, hsaco, "saxpy", &kern); if (rc != BC_RT_OK) { @@ -72,13 +72,17 @@ int main(int argc, char *argv[]) void *y; void *x; float a; + uint32_t _pad0; uint32_t n; + uint32_t _pad1; } args; args.y = d_y; args.x = d_x; args.a = A_VAL; + args._pad0 = 0; args.n = N; + args._pad1 = 0; uint32_t num_blocks = (N + BLOCK - 1) / BLOCK; printf(" dispatch: %u blocks x %d threads\n", num_blocks, BLOCK); From df39cf488f187802fc12945b8bf83311e98fa8b3 Mon Sep 17 00:00:00 2001 From: statusfailed Date: Fri, 6 Mar 2026 02:57:02 +0000 Subject: [PATCH 2/6] add flake.nix --- flake.nix | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 flake.nix diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..9d1e777 --- /dev/null +++ b/flake.nix @@ -0,0 +1,89 @@ +{ + description = "BarraCUDA flake (devShell sets LD_LIBRARY_PATH for libhsa-runtime64.so)"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + lib = pkgs.lib; + + rocmPkgs = + if pkgs ? rocmPackages then pkgs.rocmPackages + else throw "This nixpkgs does not provide pkgs.rocmPackages; use a newer nixpkgs."; + + # Pick a package that provides libhsa-runtime64.so (ROCR / HSA runtime). + hsaRuntime = + let + candidates = [ + "hsa-rocr" + "hsa-rocr-runtime" + "rocm-runtime" + "rocm-runtime-unwrapped" + ]; + name = lib.findFirst (n: builtins.hasAttr n rocmPkgs) null candidates; + in + if name == null then + throw "Could not find an ROCm HSA runtime in pkgs.rocmPackages (tried: ${builtins.toString candidates})." + else + rocmPkgs.${name}; + + barracuda = pkgs.stdenv.mkDerivation { + pname = "barracuda"; + version = "unstable"; + src = ./.; + + nativeBuildInputs = [ pkgs.gnumake pkgs.makeWrapper ]; + + buildPhase = "make"; + + installPhase = '' + runHook preInstall + install -Dm755 barracuda $out/bin/barracuda + runHook postInstall + ''; + + # Convenience for `nix run .` so barracuda can dlopen() libhsa-runtime64.so. + postFixup = '' + wrapProgram $out/bin/barracuda \ + --prefix LD_LIBRARY_PATH : ${hsaRuntime}/lib:${hsaRuntime}/lib64 + ''; + + meta = { + homepage = "https://github.com/Zaneham/BarraCUDA"; + license = lib.licenses.asl20; + mainProgram = "barracuda"; + platforms = lib.platforms.linux; + }; + }; + in + { + packages = { + default = barracuda; + barracuda = barracuda; + hsaRuntime = hsaRuntime; + }; + + apps.default = { + type = "app"; + program = "${barracuda}/bin/barracuda"; + }; + + devShells.default = pkgs.mkShell { + packages = [ + pkgs.gcc + pkgs.gnumake + hsaRuntime + ]; + + shellHook = '' + # bc_runtime.c dlopen("libhsa-runtime64.so") relies on the loader search path. + export LD_LIBRARY_PATH=${hsaRuntime}/lib:${hsaRuntime}/lib64''${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + ''; + }; + }); +} From 7ddad8714aed6e0a51e58b1077c86cb8de321d02 Mon Sep 17 00:00:00 2001 From: statusfailed Date: Thu, 5 Mar 2026 01:44:50 +0000 Subject: [PATCH 3/6] Updated kernel_code_properties bits in emit.c - Set ENABLE_SGPR_DISPATCH_PTR (bit 1). - Set ENABLE_SGPR_KERNARG_SEGMENT_PTR (bit 3). - Set IS_PTR64 (bit 19). - Set ENABLE_WAVEFRONT_SIZE32 (bit 10) for non-CDNA targets. --- src/amdgpu/amdgpu.h | 4 ++-- src/amdgpu/emit.c | 29 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/amdgpu/amdgpu.h b/src/amdgpu/amdgpu.h index f80c17c..4ef7a0c 100644 --- a/src/amdgpu/amdgpu.h +++ b/src/amdgpu/amdgpu.h @@ -372,8 +372,8 @@ typedef struct { uint32_t compute_pgm_rsrc3; uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; - uint16_t kernel_code_properties; - uint8_t reserved2[6]; + uint32_t kernel_code_properties; + uint8_t reserved2[4]; } amd_kernel_descriptor_t; /* 64 bytes */ /* ---- Module ---- */ diff --git a/src/amdgpu/emit.c b/src/amdgpu/emit.c index d2a9231..9364ade 100644 --- a/src/amdgpu/emit.c +++ b/src/amdgpu/emit.c @@ -926,17 +926,21 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) (1u << 27); /* MEM_ORDERED (RDNA only) */ } - /* compute_pgm_rsrc2 — [0] SCRATCH_EN, [5:1] USER_SGPR_COUNT, - [7] TGID_X, [8] TGID_Y, [9] TGID_Z, [12:11] VGPR_WORKITEM_ID. - Layout matches what isel's scan_kernel_needs() decided. */ + /* compute_pgm_rsrc2. + * CDNA/GFX9 uses TGID bits at 11/12/13. RDNA keeps 7/8/9 and + * VGPR_WORKITEM_ID at 12:11. */ { - uint32_t user_sgpr = 2u; /* s[0:1] = kernarg only */ + uint32_t tgid_x = cdna ? 11u : 7u; + uint32_t tgid_y = cdna ? 12u : 8u; + uint32_t tgid_z = cdna ? 13u : 9u; + uint32_t user_sgpr = 4u; /* dispatch_ptr + kernarg_ptr */ uint32_t rsrc2 = ((F->scratch_bytes > 0) ? 1u : 0u) | (user_sgpr << 1) | - (1u << 7); /* TGID_X always enabled */ - if (F->max_dim >= 1) rsrc2 |= (1u << 8); /* TGID_Y */ - if (F->max_dim >= 2) rsrc2 |= (1u << 9); /* TGID_Z */ - rsrc2 |= ((uint32_t)F->max_dim << 11); /* VGPR_WORKITEM_ID */ + (1u << tgid_x); + if (F->max_dim >= 1) rsrc2 |= (1u << tgid_y); + if (F->max_dim >= 2) rsrc2 |= (1u << tgid_z); + if (!cdna) + rsrc2 |= ((uint32_t)F->max_dim << 11); kd.compute_pgm_rsrc2 = rsrc2; } @@ -950,9 +954,12 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) kd.compute_pgm_rsrc3 = accum_off & 0x3F; } - /* kernel_code_properties — only KERNARG_PTR. - * No dispatch_ptr; blockDim/gridDim come from hidden kernarg. */ - kd.kernel_code_properties = (1u << 3); /* ENABLE_SGPR_KERNARG_PTR */ + /* kernel_code_properties (amd_hsa_kernel_code.h) */ + kd.kernel_code_properties = (1u << 1) | /* ENABLE_SGPR_DISPATCH_PTR */ + (1u << 3) | /* ENABLE_SGPR_KERNARG_PTR */ + (1u << 19); /* IS_PTR64 */ + if (!cdna) + kd.kernel_code_properties |= (1u << 10); /* WAVEFRONT_SIZE32 */ if (rodata_len + 64 <= sizeof(rodata)) { memcpy(rodata + rodata_len, &kd, 64); From ad7aa660c475ce66bb19b61adb1480318e7da44d Mon Sep 17 00:00:00 2001 From: statusfailed Date: Thu, 5 Mar 2026 01:32:48 +0000 Subject: [PATCH 4/6] Fix ELF header in src/amdgpu/emit.c - Switched ELF type to `ET_DYN` for code objects. - Added program headers (`PT_LOAD` for .text, `PT_NOTE` for metadata note). - Set valid program-header table fields `(e_phoff, e_phentsize, e_phnum)`. - Set AMDGPU HSA ABI version byte (`e_ident[EI_ABIVERSION] = 1`, HSA V3-compatible). --- src/amdgpu/emit.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/amdgpu/emit.c b/src/amdgpu/emit.c index 9364ade..cc8ef84 100644 --- a/src/amdgpu/emit.c +++ b/src/amdgpu/emit.c @@ -794,6 +794,17 @@ typedef struct { uint64_t sh_entsize; } elf64_shdr_t; /* 64 bytes */ +typedef struct { + uint32_t p_type; + uint32_t p_flags; + uint64_t p_offset; + uint64_t p_vaddr; + uint64_t p_paddr; + uint64_t p_filesz; + uint64_t p_memsz; + uint64_t p_align; +} elf64_phdr_t; /* 56 bytes */ + typedef struct { uint32_t st_name; uint8_t st_info; @@ -1344,10 +1355,11 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) ehdr.e_ident[5] = 1; /* ELFDATA2LSB */ ehdr.e_ident[6] = 1; /* EV_CURRENT */ ehdr.e_ident[7] = ELFOSABI_AMDGPU_HSA; - ehdr.e_ident[8] = 4; /* ABI version 4 — code object v6 */ + ehdr.e_ident[8] = 1; /* ELFABIVERSION_AMDGPU_HSA_V3 */ ehdr.e_type = 3; /* ET_DYN */ ehdr.e_machine = EM_AMDGPU; ehdr.e_version = 1; + ehdr.e_entry = 0; ehdr.e_phoff = phdr_off; ehdr.e_shoff = shdr_off; ehdr.e_flags = A->elf_mach; From 6a6b8786d3d9b3e8fd99eb2b797bb9b81c2dd15b Mon Sep 17 00:00:00 2001 From: statusfailed Date: Fri, 6 Mar 2026 03:07:27 +0000 Subject: [PATCH 5/6] remove duplicate phdr_t struct in emit.c --- src/amdgpu/emit.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/amdgpu/emit.c b/src/amdgpu/emit.c index cc8ef84..4ce33a9 100644 --- a/src/amdgpu/emit.c +++ b/src/amdgpu/emit.c @@ -820,17 +820,6 @@ typedef struct { uint32_t n_type; } elf64_nhdr_t; /* 12 bytes */ -typedef struct { - uint32_t p_type; - uint32_t p_flags; - uint64_t p_offset; - uint64_t p_vaddr; - uint64_t p_paddr; - uint64_t p_filesz; - uint64_t p_memsz; - uint64_t p_align; -} elf64_phdr_t; /* 56 bytes */ - typedef struct { int64_t d_tag; uint64_t d_val; From a1ce1bb9dd0416bf8008014ea8410ff42628c48a Mon Sep 17 00:00:00 2001 From: statusfailed Date: Fri, 6 Mar 2026 04:37:00 +0000 Subject: [PATCH 6/6] Fix dispatch ABI mismatch - Add hidden dispatch kernarg fields (block_count_*, group_size_*) in launch_saxpy and populate them from launch dims. - Make isel SGPR layout conditional: use dispatch + kernarg SGPR pairs when needs_dispatch=1, otherwise keep kernarg-only layout. --- examples/launch_saxpy.c | 14 +++++++++++++- src/amdgpu/isel.c | 21 ++++++++++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/examples/launch_saxpy.c b/examples/launch_saxpy.c index 9870c21..2a0daf7 100644 --- a/examples/launch_saxpy.c +++ b/examples/launch_saxpy.c @@ -75,6 +75,12 @@ int main(int argc, char *argv[]) uint32_t _pad0; uint32_t n; uint32_t _pad1; + uint32_t hidden_block_count_x; + uint32_t hidden_block_count_y; + uint32_t hidden_block_count_z; + uint16_t hidden_group_size_x; + uint16_t hidden_group_size_y; + uint16_t hidden_group_size_z; } args; args.y = d_y; @@ -83,8 +89,14 @@ int main(int argc, char *argv[]) args._pad0 = 0; args.n = N; args._pad1 = 0; - uint32_t num_blocks = (N + BLOCK - 1) / BLOCK; + args.hidden_block_count_x = num_blocks; + args.hidden_block_count_y = 1; + args.hidden_block_count_z = 1; + args.hidden_group_size_x = BLOCK; + args.hidden_group_size_y = 1; + args.hidden_group_size_z = 1; + printf(" dispatch: %u blocks x %d threads\n", num_blocks, BLOCK); rc = bc_dispatch(&dev, &kern, num_blocks, 1, 1, BLOCK, 1, 1, diff --git a/src/amdgpu/isel.c b/src/amdgpu/isel.c index c0cceba..aec62ff 100644 --- a/src/amdgpu/isel.c +++ b/src/amdgpu/isel.c @@ -1839,13 +1839,20 @@ static void scan_kernel_needs(const bir_func_t *F) } if (S.max_dim > 2) S.max_dim = 2; /* clamp */ - /* SGPR layout — always: s[0:1]=kernarg, s2+=TGID. - * No dispatch_ptr; blockDim/gridDim read from hidden kernarg - * (same approach as hipcc — dispatch_ptr + multi-arg is cursed). */ - S.sgpr_dispatch = 0xFFFF; - S.sgpr_kernarg = 0; /* s[0:1] = kernarg ptr */ - S.sgpr_wg_base = 2; /* s2+ = workgroup IDs */ - S.kern_reserved = 2 + 1 + S.max_dim; /* after last TGID */ + /* SGPR layout depends on dispatch usage. + * With dispatch_ptr: s[0:1]=dispatch, s[2:3]=kernarg, s4+=TGID. + * Without dispatch_ptr: s[0:1]=kernarg, s2+=TGID. */ + if (S.needs_dispatch) { + S.sgpr_dispatch = 0; + S.sgpr_kernarg = 2; + S.sgpr_wg_base = 4; + S.kern_reserved = 4 + 1 + S.max_dim; + } else { + S.sgpr_dispatch = 0xFFFF; + S.sgpr_kernarg = 0; + S.sgpr_wg_base = 2; + S.kern_reserved = 2 + 1 + S.max_dim; + } /* Align reserved to even for SGPR pair loads */ if (S.kern_reserved & 1) S.kern_reserved++; }