diff --git a/Bender.lock b/Bender.lock index d48a9c04d8..bc2a82c405 100644 --- a/Bender.lock +++ b/Bender.lock @@ -56,7 +56,7 @@ packages: Git: https://github.com/pulp-platform/common_verification.git dependencies: [] fpnew: - revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 + revision: c527911ae0fa394e37deb39fc1a20e92a7ea0b5b version: null source: Git: https://github.com/pulp-platform/cvfpu.git diff --git a/Bender.yml b/Bender.yml index 29fc783b54..6cdc7e2c19 100644 --- a/Bender.yml +++ b/Bender.yml @@ -22,7 +22,7 @@ dependencies: axi: { git: https://github.com/pulp-platform/axi, rev: 4e54ac6766b160217a83a74d5a23af9bbf59e6ee } axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 } common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.35.0 } - FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } + FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: shfl } register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 } tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index b415628afb..de1f34a608 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -839,6 +839,12 @@ package riscv_instr; localparam logic [31:0] VL2R_V = 32'b000001101000?????101?????0000111; localparam logic [31:0] VL4R_V = 32'b000011101000?????110?????0000111; localparam logic [31:0] VL8R_V = 32'b000111101000?????111?????0000111; + localparam logic [31:0] VFSHUFFLE_S = 32'b1011111??????????000?????0110011; + localparam logic [31:0] VFSHUFFLE_H = 32'b1011111??????????010?????0110011; + localparam logic [31:0] VFSHUFFLE_B = 32'b1011111??????????011?????0110011; + localparam logic [31:0] VFSHUFFLE2_S = 32'b1011111??????????100?????0110011; + 
localparam logic [31:0] VFSHUFFLE2_H = 32'b1011111??????????110?????0110011; + localparam logic [31:0] VFSHUFFLE2_B = 32'b1011111??????????111?????0110011; localparam logic [31:0] IMV_X_W = 32'b111000000000?????000?????1011011; localparam logic [31:0] IMV_W_X = 32'b111100000000?????000?????1011011; localparam logic [31:0] IADDI = 32'b?????????????????000?????1111011; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 59723954a3..1cc10b464c 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -1705,6 +1705,33 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( illegal_inst = 1'b1; end end + VFSHUFFLE_S, + VFSHUFFLE2_S: begin + if (FP_EN && XFVEC && FLEN >= 64) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end + VFSHUFFLE_H, + VFSHUFFLE2_H: begin + if (FP_EN && XFVEC && FLEN >= 32) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end + VFSHUFFLE_B, + VFSHUFFLE2_B: begin + if (FP_EN && XFVEC && FLEN >= 16) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end // Offload FP-Int Instructions - fire and forget // Double Precision Floating-Point FLE_D, diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index b3c35d9ace..d33cad29e1 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -144,7 +144,8 @@ package ${cfg['pkg_name']}; ${cfg['timing']['lat_sdotp']}, ${cfg['timing']['lat_sdotp']}, ${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}} // DOTP + ${cfg['timing']['lat_sdotp']}}, // DOTP + '{0, 0, 0, 0, 0, 0} // SHFL TODO: Make configurable }, UnitTypes: '{'{fpnew_pkg::MERGED, fpnew_pkg::MERGED, @@ -185,14 +186,29 @@ package ${cfg['pkg_name']}; fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, - 
fpnew_pkg::MERGED}}, // DOTP + fpnew_pkg::MERGED}, // DOTP % else: '{fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED}}, // DOTP + fpnew_pkg::DISABLED}, // DOTP +% endif +% if c["xfvec"]: + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // SHFL +% else: + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}}, // SHFL % endif PipeConfig: fpnew_pkg::${cfg['timing']['fpu_pipe_config']} }${',\n' if not loop.last else '\n'}\ diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index 65c3e86e4a..a38f31557a 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -528,6 +528,18 @@ module snitch_fp_ss import snitch_pkg::*; #( vectorial_op = 1'b1; set_dyn_rm = 1'b1; end + riscv_instr::VFSHUFFLE_S, + riscv_instr::VFSHUFFLE2_S: begin + fpu_op = fpnew_pkg::VFSHFL; + op_select[0] = RegA; + op_select[1] = RegB; + op_select[2] = RegDest; + src_fmt = fpnew_pkg::FP32; + dst_fmt = fpnew_pkg::FP32; + vectorial_op = 1'b1; + use_fpu = 1'b1; + if (acc_req_q.data_op inside {riscv_instr::VFSHUFFLE2_S}) op_mode = 1'b1; + end // Double Precision riscv_instr::FADD_D: begin fpu_op = fpnew_pkg::ADD; @@ -1096,6 +1108,18 @@ module snitch_fp_ss import snitch_pkg::*; #( set_dyn_rm = 1'b1; if (acc_req_q.data_op inside {riscv_instr::VFNSUMEX_S_H}) op_mode = 1'b1; end + riscv_instr::VFSHUFFLE_H, + riscv_instr::VFSHUFFLE2_H: begin + fpu_op = fpnew_pkg::VFSHFL; + op_select[0] = RegA; + op_select[1] = RegB; + op_select[2] = RegDest; + src_fmt = fpnew_pkg::FP16; + dst_fmt = fpnew_pkg::FP16; + vectorial_op = 1'b1; + use_fpu = 1'b1; + if (acc_req_q.data_op inside {riscv_instr::VFSHUFFLE2_H}) op_mode = 1'b1; + end // [Alternate] Quarter Precision riscv_instr::FADD_B: begin fpu_op = 
fpnew_pkg::ADD; @@ -1616,6 +1640,18 @@ module snitch_fp_ss import snitch_pkg::*; #( set_dyn_rm = 1'b1; if (acc_req_q.data_op inside {riscv_instr::VFNSUMEX_H_B}) op_mode = 1'b1; end + riscv_instr::VFSHUFFLE_B, + riscv_instr::VFSHUFFLE2_B: begin + fpu_op = fpnew_pkg::VFSHFL; + op_select[0] = RegA; + op_select[1] = RegB; + op_select[2] = RegDest; + src_fmt = fpnew_pkg::FP8; + dst_fmt = fpnew_pkg::FP8; + vectorial_op = 1'b1; + use_fpu = 1'b1; + if (acc_req_q.data_op inside {riscv_instr::VFSHUFFLE2_B}) op_mode = 1'b1; + end // ------------------- // From float to int // ------------------- diff --git a/sw/deps/riscv-opcodes b/sw/deps/riscv-opcodes index 94caf0e0fe..19132ec7f6 160000 --- a/sw/deps/riscv-opcodes +++ b/sw/deps/riscv-opcodes @@ -1 +1 @@ -Subproject commit 94caf0e0fefff1009ba144bccb6d8f7d425ea2f5 +Subproject commit 19132ec7f6f56e73aa828b184cbe769f0021dbc7 diff --git a/sw/dnn/conv2d/src/conv2d.h b/sw/dnn/conv2d/src/conv2d.h index 6dbb74ba10..80231b9e09 100644 --- a/sw/dnn/conv2d/src/conv2d.h +++ b/sw/dnn/conv2d/src/conv2d.h @@ -2457,7 +2457,7 @@ void conv2d_layer(const conv_layer *l) { l->FH * l->FW * l->TILE_CI + 1, 1, &ofmap[write_buf * ofmap_stride + compute_id * ofmap_co_stride], - 0, &alpha, setup_SSR); + 0, alpha, setup_SSR); } else { const uint32_t alpha = 1; @@ -2469,7 +2469,7 @@ void conv2d_layer(const conv_layer *l) { l->FH * l->FW * l->TILE_CI + 1, 1, &ofmap[write_buf * ofmap_stride + compute_id * ofmap_co_stride], - 0, &alpha, setup_SSR); + 0, alpha, setup_SSR); } } // Toggle read and write buffer diff --git a/sw/tests/shuffle.c b/sw/tests/shuffle.c new file mode 100644 index 0000000000..1f04df6b95 --- /dev/null +++ b/sw/tests/shuffle.c @@ -0,0 +1,306 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +#include <snrt.h> + +int main() { + if (snrt_is_compute_core() && (snrt_cluster_core_idx() == 0)) { + int errs = 0; // failed checks; returned as the exit code + + uint32_t a = 0x4048F5C3; // 3.14 0 + uint32_t an = 0xC048F5C3; // -3.14 + uint32_t b = 0x3FCF1AA0; // 1.618 2 + uint32_t bn = 0xBFCF1AA0; // -1.618 + uint32_t c = 0x4018FFEB; // 2.39062 + uint32_t cn = 0xC018FFEB; // -2.39062 + uint32_t d = 0x3E801FFB; // 0.250244 6 + uint32_t dn = 0xBE801FFB; // -0.250244 + uint32_t e = 0x3F000000; // 0.5 + uint32_t en = 0xBF000000; // -0.5 + uint32_t f = 0x42C83F36; // 100.123456789 10 + uint32_t fn = 0xC2C83F36; // -100.123456789 + uint32_t g = 0x40B80000; // 5.75 + uint32_t gn = 0xC0B80000; // -5.75 + uint32_t h = 0x410428F6; // 8.26 + uint32_t hn = 0xC10428F6; // -8.26 + + int res0 = 0; + int res1 = 0; + uint32_t mask_a, mask_b; + + // vfshuffle.s + mask_a = 0x10; // -> [vec[0][1], vec[0][0]] + mask_b = 0x98; // -> [vec[0][1], vec[0][0]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + // Pack input & solution vectors + "vfcpka.s.s ft2, ft0, ft1\n" // ft2 = [a, b] + // Load mask + "fmv.s.x ft3, %[mask_a]\n" // ft3 = mask_a + "fmv.s.x ft1, %[mask_b]\n" // ft1 = mask_b + // Shuffle input vectors with mask + "vfshuffle.s ft0, ft2, ft3\n" // ft0 = [a, b] + "vfshuffle.s ft1, ft2, ft1\n" // ft1 = [a, b] + // Compare + "vfeq.s %[res0], ft0, ft2\n" // res0 = (ft0 == ft2) = 0x3 + "vfeq.s %[res1], ft1, ft2\n" // res1 = (ft1 == ft2) = 0x3 + : [ res0 ] "+r"(res0), [ res1 ] "+r"(res1) + : [ a ] "r"(a), [ b ] "r"(b), [ mask_a ] "r"(mask_a), + [ mask_b ] "r"(mask_b) + : "ft0", "ft1", "ft2", "ft3"); + errs += (res0 != 0x3) + (res1 != 0x3); + mask_a = 0x01; // -> [vec[0][0], vec[0][1]] + mask_b = 0x89; // -> [vec[1][0], vec[1][1]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + // Pack input vectors + "vfcpka.s.s ft2, ft0, ft1\n" // ft2 = [a, b] + // Pack solution vectors + "vfcpka.s.s ft3, ft1, ft0\n" // ft3 = [b, a] + // Load mask + "fmv.s.x ft4, 
%[mask_a]\n" // ft4 = mask_a + "fmv.s.x ft5, %[mask_b]\n" // ft5 = mask_b + // Shuffle input vectors with mask + "vfshuffle.s ft6, ft2, ft4\n" // ft6 = [b, a] + "vfshuffle.s ft7, ft2, ft5\n" // ft7 = [b, a] + // Compare + "vfeq.s %[res0], ft6, ft3\n" // res0 = (ft6 == ft3) = 0x3 + "vfeq.s %[res1], ft7, ft3\n" // res1 = (ft7 == ft3) = 0x3 + : [ res0 ] "+r"(res0), [ res1 ] "+r"(res1) + : [ a ] "r"(a), [ b ] "r"(b), [ mask_a ] "r"(mask_a), + [ mask_b ] "r"(mask_b) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7"); + errs += (res0 != 0x3); + errs += (res1 != 0x3); + + // vfshuffle2.s + mask_a = 0x99; // -> [vec[1][1], vec[1][1]] + mask_b = 0x81; // -> [vec[1][0], vec[0][1]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + // Pack input vectors + "vfcpka.s.s ft4, ft0, ft1\n" // ft4 = [b, a] + "vfcpka.s.s ft5, ft2, ft3\n" // ft5 = [d, c] + // Pack solution vectors + "vfcpka.s.s ft6, ft1, ft1\n" // ft6 = [b, b] + // Load mask + "fmv.s.x ft7, %[mask_a]\n" // ft7 = mask_a + // Shuffle input vectors with mask + "vfshuffle2.s ft4, ft5, ft7\n" // ft4 = [b, b] + // Compare + "vfeq.s %[res0], ft4, ft6\n" // res0 = (ft4 == ft6) = 0x3 + : [ res0 ] "+r"(res0) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ mask_a ] "r"(mask_a) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7"); + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + // Pack input vectors + "vfcpka.s.s ft4, ft0, ft1\n" // ft4 = [b, a] + "vfcpka.s.s ft5, ft2, ft3\n" // ft5 = [d, c] + // Pack solution vectors + "vfcpka.s.s ft6, ft3, ft0\n" // ft6 = [a, d] + // Load mask + "fmv.s.x ft7, %[mask_b]\n" // ft7 = mask_b + // Shuffle input vectors with mask + "vfshuffle2.s ft4, ft5, ft7\n" // ft4 = [a, d] + // Compare + "vfeq.s %[res1], ft4, ft6\n" // res1 = (ft4 == ft6) = 0x3 + 
: [ res1 ] "+r"(res1) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ mask_b ] "r"(mask_b) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7"); + + errs += (res0 != 0x3); + errs += (res1 != 0x3); + + // vfshuffle.h + mask_a = 0x0000; // -> [vec[0][0], vec[0][0], vec[0][0], vec[0][0]] + mask_b = 0x0123; // -> [vec[0][0], vec[0][1], vec[0][2], vec[0][3]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + // Pack input vectors + "vfcpka.h.s ft4, ft0, ft1\n" // ft4 = [-, -, b, a] + "vfcpkb.h.s ft4, ft2, ft3\n" // ft4 = [d, c, b, a] + "vfcpka.h.s ft5, ft0, ft0\n" // ft5 = [-, -, a, a] + "vfcpkb.h.s ft5, ft0, ft0\n" // ft5 = [a, a, a, a] + // Pack solution vectors + "vfcpka.h.s ft6, ft3, ft2\n" // ft6 = [-, -, c, d] + "vfcpkb.h.s ft6, ft1, ft0\n" // ft6 = [a, b, c, d] + // Load mask + "fmv.s.x ft7, %[mask_a]\n" // ft7 = mask_a + "fmv.s.x ft8, %[mask_b]\n" // ft8 = mask_b + // Shuffle input vectors with mask + "vfshuffle.h ft9, ft4, ft7\n" // ft9 = [a, a, a, a] + "vfshuffle.h ft10, ft4, ft8\n" // ft10 = [a, b, c, d] + // Compare + "vfeq.h %[res0], ft5, ft9\n" // res0 = (ft5 == ft9) = 0xf + "vfeq.h %[res1], ft6, ft10\n" // res1 = (ft6 == ft10) = 0xf + : [ res0 ] "+r"(res0), [ res1 ] "+r"(res1) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ mask_a ] "r"(mask_a), [ mask_b ] "r"(mask_b) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "ft9", "ft10"); + + errs += (res0 != 0xf); + errs += (res1 != 0xf); + + // vfshuffle2.h + mask_a = 0x092b; // -> [vec[0][0], vec[1][1], vec[0][2], vec[1][3]] + mask_b = 0x8800; // -> [vec[1][0], vec[1][0], vec[0][0], vec[0][0]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + "fmv.s.x ft4, %[e]\n" // ft4 = e + "fmv.s.x ft5, %[f]\n" // ft5 = f + "fmv.s.x ft6, %[g]\n" // 
ft6 = g + "fmv.s.x ft7, %[h]\n" // ft7 = h + // Pack input vectors + "vfcpka.h.s ft8, ft0, ft1\n" // ft8 = [-, -, b, a] + "vfcpkb.h.s ft8, ft2, ft3\n" // ft8 = [d, c, b, a] + "vfcpka.h.s ft9, ft4, ft5\n" // ft9 = [-, -, f, e] + "vfcpkb.h.s ft9, ft6, ft7\n" // ft9 = [h, g, f, e] + // Copy second input vector + "fmv.d ft10, ft9\n" // ft10 = [h, g, f, e] + // Pack solution vectors + "vfcpka.h.s ft11, ft7, ft2\n" // ft11 = [-, -, c, h] + "vfcpkb.h.s ft11, ft5, ft0\n" // ft11 = [a, f, c, h] + "vfcpka.h.s fa0, ft0, ft0\n" // fa0 = [-, -, a, a] + "vfcpkb.h.s fa0, ft4, ft4\n" // fa0 = [e, e, a, a] + // Load mask + "fmv.s.x fa1, %[mask_a]\n" // fa1 = mask_a + "fmv.s.x fa2, %[mask_b]\n" // fa2 = mask_b + // Shuffle input vectors with mask + "vfshuffle2.h ft9, ft8, fa1\n" // ft9 = [a, f, c, h] + "vfshuffle2.h ft10, ft8, fa2\n" // ft10 = [e, e, a, a] + // Compare + "vfeq.h %[res0], ft9, ft11\n" // res0 = (ft9 == ft11) = 0xf + "vfeq.h %[res1], ft10, fa0\n" // res1 = (ft10 == fa0) = 0xf + : [ res0 ] "+r"(res0), [ res1 ] "+r"(res1) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ e ] "r"(e), [ f ] "r"(f), [ g ] "r"(g), [ h ] "r"(h), + [ mask_a ] "r"(mask_a), [ mask_b ] "r"(mask_b) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "ft9", "ft10", "ft11", "fa0", "fa1", "fa2"); + + errs += (res0 != 0xf); + errs += (res1 != 0xf); + + // vfshuffle.b + mask_a = 0x01234567; // -> [vec[0][0], vec[0][1], vec[0][2], vec[0][3], + // vec[0][4], vec[0][5], vec[0][6], vec[0][7]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + "fmv.s.x ft4, %[e]\n" // ft4 = e + "fmv.s.x ft5, %[f]\n" // ft5 = f + "fmv.s.x ft6, %[g]\n" // ft6 = g + "fmv.s.x ft7, %[h]\n" // ft7 = h + // Pack input vectors + "vfcpka.b.s ft8, ft0, ft1\n" // ft8 = [-, - ,-, -, -, -, b, a] + "vfcpkb.b.s ft8, ft2, ft3\n" // ft8 = [-, - ,-, -, d, c, b, a] + "vfcpkc.b.s ft8, ft4, ft5\n" // ft8 = [-, -, 
f, e, d, c, b, a] + "vfcpkd.b.s ft8, ft6, ft7\n" // ft8 = [h, g, f, e, d, c, b, a] + // Pack solution vectors + "vfcpka.b.s ft9, ft7, ft6\n" // ft9 = [-, -, -, -, -, -, g, h] + "vfcpkb.b.s ft9, ft5, ft4\n" // ft9 = [-, -, -, -, e, f, g, h] + "vfcpkc.b.s ft9, ft3, ft2\n" // ft9 = [-, -, c, d, e, f, g, h] + "vfcpkd.b.s ft9, ft1, ft0\n" // ft9 = [a, b, c, d, e, f, g, h] + // Load mask + "fmv.s.x ft10, %[mask_a]\n" // ft10 = mask_a + // Shuffle input vectors with mask + "vfshuffle.b ft11, ft8, ft10\n" // ft11 = [a, b, c, d, e, f, g, h] + // Compare + "vfeq.b %[res0], ft11, ft9\n" // res0 = (ft11 == ft9) = 0xff + : [ res0 ] "+r"(res0) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ e ] "r"(e), [ f ] "r"(f), [ g ] "r"(g), [ h ] "r"(h), + [ mask_a ] "r"(mask_a) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "ft9", "ft10", "ft11"); + + errs += (res0 != 0xff); + + // vfshuffle2.b + mask_a = 0x89ab4567; // -> [vec[1][0], vec[1][1], vec[1][2], vec[1][3], + // vec[0][4], vec[0][5], vec[0][6], vec[0][7]] + + asm volatile( + "fmv.s.x ft0, %[a]\n" // ft0 = a + "fmv.s.x ft1, %[b]\n" // ft1 = b + "fmv.s.x ft2, %[c]\n" // ft2 = c + "fmv.s.x ft3, %[d]\n" // ft3 = d + "fmv.s.x ft4, %[e]\n" // ft4 = e + "fmv.s.x ft5, %[f]\n" // ft5 = f + "fmv.s.x ft6, %[g]\n" // ft6 = g + "fmv.s.x ft7, %[h]\n" // ft7 = h + "fmv.s.x ft8, %[an]\n" // ft8 = an + "fmv.s.x ft9, %[bn]\n" // ft9 = bn + "fmv.s.x ft10, %[cn]\n" // ft10 = cn + "fmv.s.x ft11, %[dn]\n" // ft11 = dn + "fmv.s.x fa0, %[en]\n" // fa0 = en + "fmv.s.x fa1, %[fn]\n" // fa1 = fn + "fmv.s.x fa2, %[gn]\n" // fa2 = gn + "fmv.s.x fa3, %[hn]\n" // fa3 = hn + // Pack input vectors + "vfcpka.b.s fa4, ft0, ft1\n" // fa4 = [-, - ,-, -, -, -, b, a] + "vfcpkb.b.s fa4, ft2, ft3\n" // fa4 = [-, - ,-, -, d, c, b, a] + "vfcpkc.b.s fa4, ft4, ft5\n" // fa4 = [-, -, f, e, d, c, b, a] + "vfcpkd.b.s fa4, ft6, ft7\n" // fa4 = [h, g, f, e, d, c, b, a] + "vfcpka.b.s fa5, ft8, ft9\n" // fa5 = [-, - ,-, -, -, -, bn, an] + 
"vfcpkb.b.s fa5, ft10, ft11\n" // fa5 = [-, - ,-, -, dn, cn, bn, + // an] + "vfcpkc.b.s fa5, fa0, fa1\n" // fa5 = [-, -, fn, en, dn, cn, bn, + // an] + "vfcpkd.b.s fa5, fa2, fa3\n" // fa5 = [hn, gn, fn, en, dn, cn, bn, + // an] + // Pack solution vectors + "vfcpka.b.s fa6, ft7, ft6\n" // fa6 = [-, -, -, -, -, -, g, h] + "vfcpkb.b.s fa6, ft5, ft4\n" // fa6 = [-, -, -, -, e, f, g, h] + "vfcpkc.b.s fa6, ft11, ft10\n" // fa6 = [-, -, cn, dn, e, f, g, h] + "vfcpkd.b.s fa6, ft9, ft8\n" // fa6 = [an, bn, cn, dn, e, f, g, h] + // Load mask + "fmv.s.x fa7, %[mask_a]\n" // fa7 = mask_a + // Shuffle input vectors with mask + "vfshuffle2.b fa5, fa4, fa7\n" // fa5 = [an, bn, cn, dn, e, f, g, + // h] + // Compare + "vfeq.b %[res0], fa5, fa6\n" // res0 = (fa5 == fa6) = 0xff + : [ res0 ] "+r"(res0) + : [ a ] "r"(a), [ b ] "r"(b), [ c ] "r"(c), [ d ] "r"(d), + [ e ] "r"(e), [ f ] "r"(f), [ g ] "r"(g), [ h ] "r"(h), + [ an ] "r"(an), [ bn ] "r"(bn), [ cn ] "r"(cn), [ dn ] "r"(dn), + [ en ] "r"(en), [ fn ] "r"(fn), [ gn ] "r"(gn), [ hn ] "r"(hn), + [ mask_a ] "r"(mask_a) + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "ft9", "ft10", "ft11", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", + "fa6", "fa7"); + + errs += (res0 != 0xff); + + return errs; + } + return 0; +} diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index d9e2f8c2fe..aa40e573a2 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -55,6 +55,8 @@ runs: simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction # - elf: tests/build/fp64_conversions_scalar.elf # simulators: [vsim, vcs, verilator] + - elf: tests/build/shuffle.elf + simulators: [vsim, vcs, verilator] - elf: tests/build/interrupt_local.elf - elf: tests/build/multi_cluster.elf - elf: tests/build/openmp_parallel.elf diff --git a/target/snitch_cluster/sw/tests/tests.mk b/target/snitch_cluster/sw/tests/tests.mk index 35afa66ead..88a22cce3c 100644 --- 
a/target/snitch_cluster/sw/tests/tests.mk +++ b/target/snitch_cluster/sw/tests/tests.mk @@ -36,7 +36,7 @@ TEST_DWARFS = $(abspath $(addprefix $(TESTS_BUILDDIR)/,$(addsuffix .dwarf,$(TES TEST_OUTPUTS = $(TEST_ELFS) ifeq ($(DEBUG), ON) -TEST_OUTPUTS += $(DUMPS) $(DWARFS) +TEST_OUTPUTS += $(TEST_DUMPS) $(TEST_DWARFS) endif ######### diff --git a/util/generate-opcodes.sh b/util/generate-opcodes.sh index 82f35f391d..aad8b43b8f 100755 --- a/util/generate-opcodes.sh +++ b/util/generate-opcodes.sh @@ -8,7 +8,7 @@ set -e ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) RISCV_OPCODES=$ROOT/sw/deps/riscv-opcodes -OPCODES=(opcodes-pseudo opcodes-rv32i opcodes-rv64i opcodes-rv32m opcodes-rv64m opcodes-rv32a opcodes-rv64a opcodes-rv32h opcodes-rv64h opcodes-rv32f opcodes-rv64f opcodes-rv32d opcodes-rv64d opcodes-rv32q opcodes-rv64q opcodes-system opcodes-custom opcodes-rv32b_CUSTOM opcodes-dma_CUSTOM opcodes-frep_CUSTOM opcodes-ssr_CUSTOM opcodes-flt-occamy_CUSTOM opcodes-rvv-pseudo) +OPCODES=(opcodes-pseudo opcodes-rv32i opcodes-rv64i opcodes-rv32m opcodes-rv64m opcodes-rv32a opcodes-rv64a opcodes-rv32h opcodes-rv64h opcodes-rv32f opcodes-rv64f opcodes-rv32d opcodes-rv64d opcodes-rv32q opcodes-rv64q opcodes-system opcodes-custom opcodes-rv32b_CUSTOM opcodes-dma_CUSTOM opcodes-frep_CUSTOM opcodes-ssr_CUSTOM opcodes-flt-occamy_CUSTOM opcodes-rvv-pseudo opcodes-shuffling) ####### # RTL # diff --git a/util/trace/opcodes-flt-occamy_CUSTOM.csv b/util/trace/opcodes-flt-occamy_CUSTOM.csv index 00766424ae..b3f514de62 100644 --- a/util/trace/opcodes-flt-occamy_CUSTOM.csv +++ b/util/trace/opcodes-flt-occamy_CUSTOM.csv @@ -553,3 +553,9 @@ vfdotpex.ah.ab,8,8,16,4 vfdotpex.ah.r.ab,8,8,16,4 vfndotpex.ah.ab,8,8,16,4 vfndotpex.ah.r.ab,8,8,16,4 +vfshuffle.s,32,2,32,2 +vfshuffle.h,16,4,16,4 +vfshuffle.b,8,8,8,8 +vfshuffle2.s,32,2,32,2 +vfshuffle2.h,16,4,16,4 +vfshuffle2.b,8,8,8,8