Skip to content

Commit af6df0a

Browse files
committed
Cleanup DCA implementation
1 parent da71882 commit af6df0a

File tree

10 files changed

+389
-385
lines changed

10 files changed

+389
-385
lines changed

Bender.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export_include_dirs:
3333
- hw/reqrsp_interface/include
3434
- hw/mem_interface/include
3535
- hw/tcdm_interface/include
36+
- hw/dca_interface/include
3637
- hw/snitch/include
3738
- hw/snitch_ssr/include
3839
- hw/generated
@@ -90,6 +91,10 @@ sources:
9091
- hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv
9192
- hw/tcdm_interface/test/tcdm_mux_tb.sv
9293

94+
# dca_interface
95+
- files:
96+
- hw/dca_interface/src/dca_fork.sv
97+
9398
# snitch
9499
- files:
95100
# Level 0
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Type-definition macros for the Direct Compute Access (DCA) interface.
// Include guard: the macros below are defined at most once per compilation unit.
`ifndef DCA_INTERFACE_TYPEDEF_SVH_
2+
`define DCA_INTERFACE_TYPEDEF_SVH_
3+
4+
// Presumably provides the GENERIC_REQRSP_* macros used further down
// (it is the only header included here) -- TODO confirm.
`include "reqrsp_interface/typedef.svh"
5+
6+
// Anonymous packed struct describing the payload of the request channel:
// three operands of __data_width bits each, a rounding mode, an operation
// code, the dca_op_mode bit, source/destination/integer format selectors,
// the dca_vector_op bit and a tag of __tag_width bits.
// The enum types come from fpnew_pkg, which must be visible where the macro
// is expanded.
`define DCA_REQ_CHAN_STRUCT(__data_width, __tag_width) \
7+
struct packed { \
8+
logic [2:0][__data_width-1:0] dca_operands; \
9+
fpnew_pkg::roundmode_e dca_rnd_mode; \
10+
fpnew_pkg::operation_e dca_op_code; \
11+
logic dca_op_mode; \
12+
fpnew_pkg::fp_format_e dca_src_format; \
13+
fpnew_pkg::fp_format_e dca_dst_format; \
14+
fpnew_pkg::int_format_e dca_int_format; \
15+
logic dca_vector_op; \
16+
logic [__tag_width-1:0] dca_tag; \
17+
}
18+
19+
// Anonymous packed struct describing the payload of the response channel:
// a tag of __tag_width bits, an fpnew_pkg::status_t status word and a result
// of __data_width bits.
`define DCA_RSP_CHAN_STRUCT(__data_width, __tag_width) \
20+
struct packed { \
21+
logic [__tag_width-1:0] dca_tag; \
22+
fpnew_pkg::status_t dca_status; \
23+
logic [__data_width-1:0] dca_result; \
24+
}
25+
26+
// Full request struct: the request channel payload wrapped by
// GENERIC_REQRSP_REQ_STRUCT. Users in this commit access the result through
// the fields q, q_valid and p_ready.
`define DCA_REQ_STRUCT(__data_width, __tag_width) \
27+
`GENERIC_REQRSP_REQ_STRUCT(`DCA_REQ_CHAN_STRUCT(__data_width, __tag_width))
28+
29+
// Full response struct: the response channel payload wrapped by
// GENERIC_REQRSP_RSP_STRUCT. Users in this commit access the result through
// the fields p, p_valid and q_ready.
`define DCA_RSP_STRUCT(__data_width, __tag_width) \
30+
`GENERIC_REQRSP_RSP_STRUCT(`DCA_RSP_CHAN_STRUCT(__data_width, __tag_width))
31+
32+
// Declares the named type <__name>_req_chan_t for the request channel payload.
`define DCA_TYPEDEF_REQ_CHAN_T(__name, __data_width, __tag_width) \
33+
typedef `DCA_REQ_CHAN_STRUCT(__data_width, __tag_width) __name``_req_chan_t;
34+
35+
// Declares the named type <__name>_rsp_chan_t for the response channel payload.
`define DCA_TYPEDEF_RSP_CHAN_T(__name, __data_width, __tag_width) \
36+
typedef `DCA_RSP_CHAN_STRUCT(__data_width, __tag_width) __name``_rsp_chan_t;
37+
38+
// Convenience macro: declares both channel types for __name, then hands them
// to GENERIC_REQRSP_TYPEDEF_ALL (which presumably declares the matching
// request/response struct types -- TODO confirm against its definition).
`define DCA_TYPEDEF_ALL(__name, __data_width, __tag_width) \
39+
`DCA_TYPEDEF_REQ_CHAN_T(__name, __data_width, __tag_width) \
40+
`DCA_TYPEDEF_RSP_CHAN_T(__name, __data_width, __tag_width) \
41+
`GENERIC_REQRSP_TYPEDEF_ALL(__name, __name``_req_chan_t, __name``_rsp_chan_t)
42+
43+
`endif // DCA_INTERFACE_TYPEDEF_SVH_

hw/dca_interface/src/dca_fork.sv

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
3+
// SPDX-License-Identifier: SHL-0.51
4+
5+
// Author: Luca Colagrande <[email protected]>
6+
7+
`include "dca_interface/typedef.svh"
8+
9+
// Forks a wide Direct Compute Access (DCA) request to multiple lanes,
10+
// operating in SIMD fashion.
11+
module dca_fork #(
  /// Data width of a single lane, in bits.
  parameter int unsigned LaneDataWidth = 64,
  /// Number of lanes the wide request is split across.
  parameter int unsigned NumLanes      = 8,
  /// Width of the tag carried by every request and response.
  parameter int unsigned TagWidth      = 7,
  // Derived parameters
  /// Data width of the wide (slave-side) interface, in bits.
  localparam int unsigned DataWidth    = LaneDataWidth * NumLanes,
  localparam type dca_req_t            = `DCA_REQ_STRUCT(DataWidth, TagWidth),
  localparam type dca_rsp_t            = `DCA_RSP_STRUCT(DataWidth, TagWidth),
  localparam type dca_lane_req_t       = `DCA_REQ_STRUCT(LaneDataWidth, TagWidth),
  localparam type dca_lane_rsp_t       = `DCA_RSP_STRUCT(LaneDataWidth, TagWidth)
) (
  input  logic                         clk_i,
  input  logic                         rst_ni,
  /// Wide request in / wide response out.
  input  dca_req_t                     slv_req_i,
  output dca_rsp_t                     slv_rsp_o,
  /// One narrow request out / response in per lane.
  output dca_lane_req_t [NumLanes-1:0] mst_req_o,
  input  dca_lane_rsp_t [NumLanes-1:0] mst_rsp_i
);

  // Per-lane handshake signals, flattened into vectors so that they can be
  // connected to the stream fork/join primitives.
  logic [NumLanes-1:0] flat_q_valids;
  logic [NumLanes-1:0] flat_q_readies;
  logic [NumLanes-1:0] flat_p_valids;
  logic [NumLanes-1:0] flat_p_readies;

  // Fork the DCA request handshake to all lanes
  stream_fork #(
    .N_OUP(NumLanes)
  ) i_dca_fork_fpu (
    .clk_i  (clk_i),
    .rst_ni (rst_ni),
    .valid_i(slv_req_i.q_valid),
    .ready_o(slv_rsp_o.q_ready),
    .valid_o(flat_q_valids),
    .ready_i(flat_q_readies)
  );

  // Join the DCA response handshakes from all lanes
  stream_join #(
    .N_INP(NumLanes)
  ) i_dca_join_fpu (
    .inp_valid_i(flat_p_valids),
    .inp_ready_o(flat_p_readies),
    .oup_valid_o(slv_rsp_o.p_valid),
    .oup_ready_i(slv_req_i.p_ready)
  );

  for (genvar i = 0; i < NumLanes; i++) begin : gen_lane
    // The same operation flags are sent to all lanes
    assign mst_req_o[i].q.dca_rnd_mode   = slv_req_i.q.dca_rnd_mode;
    assign mst_req_o[i].q.dca_op_code    = slv_req_i.q.dca_op_code;
    assign mst_req_o[i].q.dca_op_mode    = slv_req_i.q.dca_op_mode;
    assign mst_req_o[i].q.dca_src_format = slv_req_i.q.dca_src_format;
    assign mst_req_o[i].q.dca_dst_format = slv_req_i.q.dca_dst_format;
    assign mst_req_o[i].q.dca_int_format = slv_req_i.q.dca_int_format;
    assign mst_req_o[i].q.dca_vector_op  = slv_req_i.q.dca_vector_op;
    assign mst_req_o[i].q.dca_tag        = slv_req_i.q.dca_tag;

    // Data is split across lanes, to perform SIMD operation (both operands
    // and result). Lane i owns bits [LaneDataWidth*i +: LaneDataWidth] of
    // every wide operand and of the wide result.
    assign mst_req_o[i].q.dca_operands[2][LaneDataWidth-1:0] =
        slv_req_i.q.dca_operands[2][LaneDataWidth*i+:LaneDataWidth];
    assign mst_req_o[i].q.dca_operands[1][LaneDataWidth-1:0] =
        slv_req_i.q.dca_operands[1][LaneDataWidth*i+:LaneDataWidth];
    assign mst_req_o[i].q.dca_operands[0][LaneDataWidth-1:0] =
        slv_req_i.q.dca_operands[0][LaneDataWidth*i+:LaneDataWidth];
    assign slv_rsp_o.p.dca_result[LaneDataWidth*i+:LaneDataWidth] =
        mst_rsp_i[i].p.dca_result[LaneDataWidth-1:0];

    // Connect the handshake signals
    assign mst_req_o[i].q_valid = flat_q_valids[i];
    assign mst_req_o[i].p_ready = flat_p_readies[i];
    assign flat_q_readies[i]    = mst_rsp_i[i].q_ready;
    assign flat_p_valids[i]     = mst_rsp_i[i].p_valid;
  end

  // Every lane receives the same request tag, so the response tag is simply
  // taken from the first lane.
  assign slv_rsp_o.p.dca_tag = mst_rsp_i[0].p.dca_tag;

  // OR-reduce the status bits from all lanes (including the last one)
  // TODO(colluca): double-check that this is actually a bitwise OR
  always_comb begin
    slv_rsp_o.p.dca_status = '0;
    for (int unsigned i = 0; i < NumLanes; i++) begin
      slv_rsp_o.p.dca_status |= mst_rsp_i[i].p.dca_status;
    end
  end

endmodule

hw/snitch_cluster/src/snitch_cc.sv

Lines changed: 46 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
`include "common_cells/assertions.svh"
88
`include "common_cells/registers.svh"
99
`include "snitch_vm/typedef.svh"
10+
`include "reqrsp_interface/typedef.svh"
1011

1112
/// Snitch Core Complex (CC)
1213
/// Contains the Snitch Integer Core + FPU + Private Accelerators
@@ -44,8 +45,8 @@ module snitch_cc #(
4445
parameter type hive_rsp_t = logic,
4546
parameter type acc_req_t = logic,
4647
parameter type acc_resp_t = logic,
47-
parameter type dca_req_t = logic,
48-
parameter type dca_resp_t = logic,
48+
parameter type dca_req_chan_t = logic,
49+
parameter type dca_rsp_chan_t = logic,
4950
parameter type dma_events_t = logic,
5051
parameter fpnew_pkg::fpu_implementation_t FPUImplementation = '0,
5152
/// Boot address of core.
@@ -68,16 +69,12 @@ module snitch_cc #(
6869
parameter bit Xfrep = 1,
6970
/// Has `SSR` support.
7071
parameter bit Xssr = 1,
71-
/// Has `DCA` support.
72-
parameter bit Xdca = 0,
7372
/// Has `COPIFT` support.
7473
parameter bit Xcopift = 1,
7574
/// Has `IPU` support.
7675
parameter bit Xipu = 1,
7776
/// Has virtual memory support.
7877
parameter bit VMSupport = 1,
79-
/// Width of the collective operation field
80-
parameter int unsigned CollectiveWidth = 1,
8178
parameter int unsigned NumIntOutstandingLoads = 0,
8279
parameter int unsigned NumIntOutstandingMem = 0,
8380
parameter int unsigned NumFPOutstandingLoads = 0,
@@ -110,10 +107,10 @@ module snitch_cc #(
110107
parameter bit RegisterFPUIn = 0,
111108
/// Insert Pipeline registers immediately after FPU datapath
112109
parameter bit RegisterFPUOut = 0,
113-
/// Insert Pipeline register between DCA from Router and FPU
114-
parameter bit RegisterDCAIn = 0,
115-
/// Insert Pipeline register between DCA from FPU and Router
116-
parameter bit RegisterDCAOut = 0,
110+
/// Cut DCA request to FPU
111+
parameter bit RegisterDcaReq = 0,
112+
/// Cut DCA response from FPU
113+
parameter bit RegisterDcaRsp = 0,
117114
parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0},
118115
/// Consistency Address Queue (CAQ) parameters.
119116
parameter int unsigned CaqDepth = 0,
@@ -123,10 +120,16 @@ module snitch_cc #(
123120
/// Optional fixed TCDM alias.
124121
parameter bit TCDMAliasEnable = 1'b0,
125122
parameter logic [AddrWidth-1:0] TCDMAliasStart = '0,
123+
/// Width of the collective operation field
124+
parameter int unsigned CollectiveWidth = 1,
125+
/// Enable direct compute access (DCA).
126+
parameter bit EnableDca = 0,
126127
/// Derived parameter *Do not override*
127128
parameter int unsigned TCDMPorts = (NumSsrs > 1 ? NumSsrs : 1),
128129
parameter type addr_t = logic [AddrWidth-1:0],
129-
parameter type data_t = logic [DataWidth-1:0]
130+
parameter type data_t = logic [DataWidth-1:0],
131+
parameter type dca_req_t = `GENERIC_REQRSP_REQ_STRUCT(dca_req_chan_t),
132+
parameter type dca_rsp_t = `GENERIC_REQRSP_RSP_STRUCT(dca_rsp_chan_t)
130133
) (
131134
input logic clk_i,
132135
input logic clk_d2_i,
@@ -156,12 +159,8 @@ module snitch_cc #(
156159
output logic barrier_o,
157160
input logic barrier_i,
158161
// Direct Compute Access (DCA) interface
159-
input dca_req_t dca_req_i,
160-
input logic dca_req_valid_i,
161-
output logic dca_req_ready_o,
162-
output dca_resp_t dca_resp_o,
163-
output logic dca_resp_valid_o,
164-
input logic dca_resp_ready_i
162+
input dca_req_t dca_req_i,
163+
output dca_rsp_t dca_rsp_o
165164
);
166165

167166
// FMA architecture is "merged" -> mulexp and macexp instructions are supported
@@ -501,60 +500,29 @@ module snitch_cc #(
501500
logic ssr_streamctl_valid;
502501
logic ssr_streamctl_ready;
503502

504-
// Signals for the DCA
505-
dca_req_t dca_req_q; // Delayed Request by the (optional) Spill Register
506-
logic dca_req_valid_q;
507-
logic dca_req_ready_q;
508-
dca_resp_t dca_resp; // Response from the FPU in front of the (optional) Spill Register
509-
logic dca_resp_valid;
510-
logic dca_resp_ready;
511-
512-
// Cut off-DCA Interface Request
513-
if(Xdca) begin : gen_spill_register
514-
spill_register #(
515-
.T (dca_req_t),
516-
.Bypass (~RegisterDCAIn)
517-
) i_spill_reg_dca_req (
518-
.clk_i (clk_i),
519-
.rst_ni (rst_ni),
520-
.valid_i (dca_req_valid_i),
521-
.ready_o (dca_req_ready_o),
522-
.data_i (dca_req_i),
523-
.valid_o (dca_req_valid_q),
524-
.ready_i (dca_req_ready_q),
525-
.data_o (dca_req_q)
526-
);
527-
528-
// Cut off-DCA Interface Response
529-
spill_register #(
530-
.T (dca_resp_t),
531-
.Bypass (~RegisterDCAOut)
532-
) i_spill_reg_dca_resp (
533-
.clk_i (clk_i),
534-
.rst_ni (rst_ni),
535-
.valid_i (dca_resp_valid),
536-
.ready_o (dca_resp_ready),
537-
.data_i (dca_resp),
538-
.valid_o (dca_resp_valid_o),
539-
.ready_i (dca_resp_ready_i),
540-
.data_o (dca_resp_o)
541-
);
542-
end else begin
543-
assign dca_req_ready_o = 1'b0;
544-
assign dca_req_valid_q = 1'b0;
545-
assign dca_req_q = '0;
546-
547-
assign dca_resp_ready = 1'b0;
548-
assign dca_resp_valid_o = 1'b0;
549-
assign dca_resp_o = '0;
550-
end
551-
552503
if (FPEn) begin : gen_fpu
553504
snitch_pkg::core_events_t fp_ss_core_events;
554505

555506
dreq_t fpu_dreq;
556507
drsp_t fpu_drsp;
557508

509+
dca_req_t dca_req;
510+
dca_rsp_t dca_rsp;
511+
512+
generic_reqrsp_cut #(
513+
.req_chan_t(dca_req_chan_t),
514+
.rsp_chan_t(dca_rsp_chan_t),
515+
.BypassReq(!EnableDca || !RegisterDcaReq),
516+
.BypassRsp(!EnableDca || !RegisterDcaRsp)
517+
) i_dca_cut (
518+
.clk_i(clk_i),
519+
.rst_ni(rst_ni),
520+
.slv_req_i(dca_req_i),
521+
.slv_rsp_o(dca_rsp_o),
522+
.mst_req_o(dca_req),
523+
.mst_rsp_i(dca_rsp)
524+
);
525+
558526
snitch_fp_ss #(
559527
.AddrWidth (AddrWidth),
560528
.DataWidth (DataWidth),
@@ -570,13 +538,13 @@ module snitch_cc #(
570538
.acc_req_t (acc_req_t),
571539
.acc_resp_t (acc_resp_t),
572540
.dca_req_t (dca_req_t),
573-
.dca_resp_t (dca_resp_t),
541+
.dca_rsp_t (dca_rsp_t),
574542
.RegisterSequencer (RegisterSequencer),
575543
.RegisterFPUIn (RegisterFPUIn),
576544
.RegisterFPUOut (RegisterFPUOut),
577545
.Xfrep (Xfrep),
578546
.Xssr (Xssr),
579-
.Xdca (Xdca),
547+
.Xdca (EnableDca),
580548
.Xcopift (Xcopift),
581549
.RVF (RVF),
582550
.RVD (RVD),
@@ -621,12 +589,12 @@ module snitch_cc #(
621589
.streamctl_valid_i ( ssr_streamctl_valid ),
622590
.streamctl_ready_o ( ssr_streamctl_ready ),
623591
.core_events_o ( fp_ss_core_events ),
624-
.dca_req_i ( dca_req_q ),
625-
.dca_req_valid_i ( dca_req_valid_q ),
626-
.dca_req_ready_o ( dca_req_ready_q ),
627-
.dca_resp_o ( dca_resp ),
628-
.dca_resp_valid_o ( dca_resp_valid ),
629-
.dca_resp_ready_i ( dca_resp_ready )
592+
// DCA handshake: the ready of the request channel is returned in the
// response struct (q_ready), while the ready of the response channel
// arrives in the request struct (p_ready).
.dca_req_i         ( dca_req.q       ),
.dca_req_valid_i   ( dca_req.q_valid ),
.dca_req_ready_o   ( dca_rsp.q_ready ),
.dca_rsp_o         ( dca_rsp.p       ),
.dca_rsp_valid_o   ( dca_rsp.p_valid ),
.dca_rsp_ready_i   ( dca_req.p_ready )
630598
);
631599

632600
reqrsp_mux #(
@@ -1032,11 +1000,6 @@ module snitch_cc #(
10321000
end
10331001
end
10341002

1035-
// If dca enabled then forward the trace port
1036-
if(Xdca) begin
1037-
extras_dca = dca_trace;
1038-
end
1039-
10401003
cycle++;
10411004
// Trace snitch iff:
10421005
// we are not stalled <==> we have issued and processed an instruction (including offloads)
@@ -1072,10 +1035,11 @@ module snitch_cc #(
10721035
end
10731036
end
10741037
end
1075-
if(Xdca) begin
1038+
if (EnableDca) begin
1039+
extras_dca = dca_trace;
10761040
// Trace DCA iff
10771041
// When either an input or output handshake occurs
1078-
if(extras_dca.dca_in_hs || extras_dca.dca_out_hs) begin
1042+
if (extras_dca.dca_in_hs || extras_dca.dca_out_hs) begin
10791043
$sformat(trace_entry, "%t %1d %8d 0x%h DASM(%h) #; %s\n",
10801044
$time, cycle, i_snitch.priv_lvl_q, 32'hz, extras_dca.dca_in_op_code,
10811045
snitch_pkg::print_dca_trace(extras_dca));
@@ -1095,7 +1059,7 @@ module snitch_cc #(
10951059

10961060
`ASSERT_INIT(BootAddrAligned, BootAddr[1:0] == 2'b00)
10971061

1098-
// For the DCA Extension the is is required that each core has the FPU D-ext loaded
1099-
`ASSERT_INIT(DCACoreConfiguration, (~Xdca) || RVD)
1062+
// For the DCA extension it is required that each core has the FPU D-ext enabled
1063+
`ASSERT_INIT(DcaCoreConfiguration, (!EnableDca) || RVD)
11001064

11011065
endmodule

0 commit comments

Comments
 (0)