Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions examples/launch_saxpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ int main(int argc, char *argv[])
return 1;
}

/* kernarg layout: {float *y, float *x, float a, int n} = 24 bytes */
/* kernarg layout follows AMDHSA alignment; keep n at byte offset 24. */
bc_kernel_t kern;
rc = bc_load_kernel(&dev, hsaco, "saxpy", &kern);
if (rc != BC_RT_OK) {
Expand Down Expand Up @@ -72,15 +72,31 @@ int main(int argc, char *argv[])
void *y;
void *x;
float a;
uint32_t _pad0;
uint32_t n;
uint32_t _pad1;
uint32_t hidden_block_count_x;
uint32_t hidden_block_count_y;
uint32_t hidden_block_count_z;
uint16_t hidden_group_size_x;
uint16_t hidden_group_size_y;
uint16_t hidden_group_size_z;
} args;

args.y = d_y;
args.x = d_x;
args.a = A_VAL;
args._pad0 = 0;
args.n = N;

args._pad1 = 0;
uint32_t num_blocks = (N + BLOCK - 1) / BLOCK;
args.hidden_block_count_x = num_blocks;
args.hidden_block_count_y = 1;
args.hidden_block_count_z = 1;
args.hidden_group_size_x = BLOCK;
args.hidden_group_size_y = 1;
args.hidden_group_size_z = 1;

printf(" dispatch: %u blocks x %d threads\n", num_blocks, BLOCK);

rc = bc_dispatch(&dev, &kern, num_blocks, 1, 1, BLOCK, 1, 1,
Expand Down
89 changes: 89 additions & 0 deletions flake.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
  description = "BarraCUDA flake (devShell sets LD_LIBRARY_PATH for libhsa-runtime64.so)";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs { inherit system; };
        lib = pkgs.lib;

        # Fail early with a readable message when the pinned nixpkgs has no
        # rocmPackages set at all.
        rocmPkgs =
          if pkgs ? rocmPackages then pkgs.rocmPackages
          else throw "This nixpkgs does not provide pkgs.rocmPackages; use a newer nixpkgs.";

        # Pick a package that provides libhsa-runtime64.so (ROCR / HSA runtime).
        # The candidate attribute names are probed in order and the first one
        # present in rocmPkgs wins.
        hsaRuntime =
          let
            candidates = [
              "hsa-rocr"
              "hsa-rocr-runtime"
              "rocm-runtime"
              "rocm-runtime-unwrapped"
            ];
            name = lib.findFirst (n: builtins.hasAttr n rocmPkgs) null candidates;
          in
            if name == null then
              throw "Could not find an ROCm HSA runtime in pkgs.rocmPackages (tried: ${builtins.toString candidates})."
            else
              rocmPkgs.${name};

        # The barracuda binary, built with the repository's Makefile.
        barracuda = pkgs.stdenv.mkDerivation {
          pname = "barracuda";
          version = "unstable";
          src = ./.;

          nativeBuildInputs = [ pkgs.gnumake pkgs.makeWrapper ];

          # An overridden phase must run its pre/post hooks itself; keep them
          # so overrideAttrs users and setup hooks still work, matching the
          # installPhase below.
          buildPhase = ''
            runHook preBuild
            make
            runHook postBuild
          '';

          installPhase = ''
            runHook preInstall
            install -Dm755 barracuda $out/bin/barracuda
            runHook postInstall
          '';

          # Convenience for `nix run .` so barracuda can dlopen() libhsa-runtime64.so.
          postFixup = ''
            wrapProgram $out/bin/barracuda \
              --prefix LD_LIBRARY_PATH : ${hsaRuntime}/lib:${hsaRuntime}/lib64
          '';

          meta = {
            homepage = "https://github.com/Zaneham/BarraCUDA";
            license = lib.licenses.asl20;
            mainProgram = "barracuda";
            platforms = lib.platforms.linux;
          };
        };
      in
      {
        packages = {
          default = barracuda;
          barracuda = barracuda;
          hsaRuntime = hsaRuntime;
        };

        apps.default = {
          type = "app";
          program = "${barracuda}/bin/barracuda";
        };

        # Development shell: compiler, make, and the HSA runtime on the
        # loader search path.
        devShells.default = pkgs.mkShell {
          packages = [
            pkgs.gcc
            pkgs.gnumake
            hsaRuntime
          ];

          shellHook = ''
            # bc_runtime.c dlopen("libhsa-runtime64.so") relies on the loader search path.
            export LD_LIBRARY_PATH=${hsaRuntime}/lib:${hsaRuntime}/lib64''${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
          '';
        };
      });
}
4 changes: 2 additions & 2 deletions src/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,8 @@ typedef struct {
uint32_t compute_pgm_rsrc3;
uint32_t compute_pgm_rsrc1;
uint32_t compute_pgm_rsrc2;
uint16_t kernel_code_properties;
uint8_t reserved2[6];
uint32_t kernel_code_properties;
uint8_t reserved2[4];
} amd_kernel_descriptor_t; /* 64 bytes */

/* ---- Module ---- */
Expand Down
54 changes: 31 additions & 23 deletions src/amdgpu/emit.c
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,17 @@ typedef struct {
uint64_t sh_entsize;
} elf64_shdr_t; /* 64 bytes */

/*
 * ELF64 program header entry. Field names and order follow Elf64_Phdr from
 * the ELF-64 object file format; the per-field notes below restate that
 * specification. Two 32-bit fields plus six 64-bit fields give 56 bytes
 * with no padding.
 */
typedef struct {
uint32_t p_type;   /* segment type */
uint32_t p_flags;  /* segment attribute flags */
uint64_t p_offset; /* offset of the segment in the file */
uint64_t p_vaddr;  /* virtual address of the segment in memory */
uint64_t p_paddr;  /* physical address field */
uint64_t p_filesz; /* size of the segment in the file */
uint64_t p_memsz;  /* size of the segment in memory */
uint64_t p_align;  /* segment alignment */
} elf64_phdr_t; /* 56 bytes */

typedef struct {
uint32_t st_name;
uint8_t st_info;
Expand All @@ -809,17 +820,6 @@ typedef struct {
uint32_t n_type;
} elf64_nhdr_t; /* 12 bytes */

/*
 * ELF64 program header entry, laid out like Elf64_Phdr in the ELF-64 object
 * file format (field notes below restate that specification).
 * Size: 2 x 4 bytes + 6 x 8 bytes = 56 bytes, no padding.
 */
typedef struct {
uint32_t p_type;   /* segment type */
uint32_t p_flags;  /* segment attribute flags */
uint64_t p_offset; /* offset of the segment in the file */
uint64_t p_vaddr;  /* virtual address of the segment in memory */
uint64_t p_paddr;  /* physical address field */
uint64_t p_filesz; /* size of the segment in the file */
uint64_t p_memsz;  /* size of the segment in memory */
uint64_t p_align;  /* segment alignment */
} elf64_phdr_t; /* 56 bytes */

typedef struct {
int64_t d_tag;
uint64_t d_val;
Expand Down Expand Up @@ -926,17 +926,21 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path)
(1u << 27); /* MEM_ORDERED (RDNA only) */
}

/* compute_pgm_rsrc2 — [0] SCRATCH_EN, [5:1] USER_SGPR_COUNT,
[7] TGID_X, [8] TGID_Y, [9] TGID_Z, [12:11] VGPR_WORKITEM_ID.
Layout matches what isel's scan_kernel_needs() decided. */
/* compute_pgm_rsrc2.
* CDNA/GFX9 uses TGID bits at 11/12/13. RDNA keeps 7/8/9 and
* VGPR_WORKITEM_ID at 12:11. */
{
uint32_t user_sgpr = 2u; /* s[0:1] = kernarg only */
uint32_t tgid_x = cdna ? 11u : 7u;
uint32_t tgid_y = cdna ? 12u : 8u;
uint32_t tgid_z = cdna ? 13u : 9u;
uint32_t user_sgpr = 4u; /* dispatch_ptr + kernarg_ptr */
uint32_t rsrc2 = ((F->scratch_bytes > 0) ? 1u : 0u) |
(user_sgpr << 1) |
(1u << 7); /* TGID_X always enabled */
if (F->max_dim >= 1) rsrc2 |= (1u << 8); /* TGID_Y */
if (F->max_dim >= 2) rsrc2 |= (1u << 9); /* TGID_Z */
rsrc2 |= ((uint32_t)F->max_dim << 11); /* VGPR_WORKITEM_ID */
(1u << tgid_x);
if (F->max_dim >= 1) rsrc2 |= (1u << tgid_y);
if (F->max_dim >= 2) rsrc2 |= (1u << tgid_z);
if (!cdna)
rsrc2 |= ((uint32_t)F->max_dim << 11);
kd.compute_pgm_rsrc2 = rsrc2;
}

Expand All @@ -950,9 +954,12 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path)
kd.compute_pgm_rsrc3 = accum_off & 0x3F;
}

/* kernel_code_properties — only KERNARG_PTR.
* No dispatch_ptr; blockDim/gridDim come from hidden kernarg. */
kd.kernel_code_properties = (1u << 3); /* ENABLE_SGPR_KERNARG_PTR */
/* kernel_code_properties (amd_hsa_kernel_code.h) */
kd.kernel_code_properties = (1u << 1) | /* ENABLE_SGPR_DISPATCH_PTR */
(1u << 3) | /* ENABLE_SGPR_KERNARG_PTR */
(1u << 19); /* IS_PTR64 */
if (!cdna)
kd.kernel_code_properties |= (1u << 10); /* WAVEFRONT_SIZE32 */

if (rodata_len + 64 <= sizeof(rodata)) {
memcpy(rodata + rodata_len, &kd, 64);
Expand Down Expand Up @@ -1337,10 +1344,11 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path)
ehdr.e_ident[5] = 1; /* ELFDATA2LSB */
ehdr.e_ident[6] = 1; /* EV_CURRENT */
ehdr.e_ident[7] = ELFOSABI_AMDGPU_HSA;
ehdr.e_ident[8] = 4; /* ABI version 4 — code object v6 */
ehdr.e_ident[8] = 1; /* ELFABIVERSION_AMDGPU_HSA_V3 */
ehdr.e_type = 3; /* ET_DYN */
ehdr.e_machine = EM_AMDGPU;
ehdr.e_version = 1;
ehdr.e_entry = 0;
ehdr.e_phoff = phdr_off;
ehdr.e_shoff = shdr_off;
ehdr.e_flags = A->elf_mach;
Expand Down
21 changes: 14 additions & 7 deletions src/amdgpu/isel.c
Original file line number Diff line number Diff line change
Expand Up @@ -1839,13 +1839,20 @@ static void scan_kernel_needs(const bir_func_t *F)
}
if (S.max_dim > 2) S.max_dim = 2; /* clamp */

/* SGPR layout — always: s[0:1]=kernarg, s2+=TGID.
* No dispatch_ptr; blockDim/gridDim read from hidden kernarg
* (same approach as hipcc — dispatch_ptr + multi-arg is cursed). */
S.sgpr_dispatch = 0xFFFF;
S.sgpr_kernarg = 0; /* s[0:1] = kernarg ptr */
S.sgpr_wg_base = 2; /* s2+ = workgroup IDs */
S.kern_reserved = 2 + 1 + S.max_dim; /* after last TGID */
/* SGPR layout depends on dispatch usage.
* With dispatch_ptr: s[0:1]=dispatch, s[2:3]=kernarg, s4+=TGID.
* Without dispatch_ptr: s[0:1]=kernarg, s2+=TGID. */
if (S.needs_dispatch) {
S.sgpr_dispatch = 0;
S.sgpr_kernarg = 2;
S.sgpr_wg_base = 4;
S.kern_reserved = 4 + 1 + S.max_dim;
} else {
S.sgpr_dispatch = 0xFFFF;
S.sgpr_kernarg = 0;
S.sgpr_wg_base = 2;
S.kern_reserved = 2 + 1 + S.max_dim;
}
/* Align reserved to even for SGPR pair loads */
if (S.kern_reserved & 1) S.kern_reserved++;
}
Expand Down