@@ -56,6 +56,82 @@ extern "C" void cgra_matmul(float* a_allocated, float* a_aligned, int64_t a_offs
*/
}

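+ // Trailing-underscore alias of cgra_matmul(); it simply forwards all arguments.
+ // (Presumably provided so callers that emit the symbol with a trailing
+ // underscore resolve to the same kernel.)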
+ extern " C" void cgra_matmul_ (float * a_allocated, float * a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_stride0, int64_t a_stride1,
60
+ float * b_allocated, float * b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_stride0, int64_t b_stride1,
61
+ float * c_allocated, float * c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_stride0, int64_t c_stride1) {
62
+ cgra_matmul (a_allocated, a_aligned, a_offset, a_size0, a_size1, a_stride0, a_stride1,
63
+ b_allocated, b_aligned, b_offset, b_size0, b_size1, b_stride0, b_stride1,
64
+ c_allocated, c_aligned, c_offset, c_size0, c_size1, c_stride0, c_stride1);
65
+
66
+ }
67
+
68
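+ // The parameter list below is the flattened form of a rank-3 memref descriptor
+ // (allocated pointer, aligned pointer, offset, sizes[3], strides[3]), matching
+ // the convention MLIR uses when lowering memref arguments to plain C calls.
+ // For example, a contiguous memref<2x3x4xf32> would arrive with sizes {2, 3, 4},
+ // strides {12, 4, 1}, and offset 0.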
+ extern " C" void cgra_batch_matmul (float * a_allocated, float * a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_size2, int64_t a_stride0, int64_t a_stride1, int64_t a_stride2,
69
+ float * b_allocated, float * b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_size2, int64_t b_stride0, int64_t b_stride1, int64_t b_stride2,
70
+ float * c_allocated, float * c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_size2, int64_t c_stride0, int64_t c_stride1, int64_t c_stride2) {
71
+
72
+ // prepare inputs
73
+ vector<int64_t > a_sizes = {a_size0, a_size1, a_size2};
74
+ vector<int64_t > a_strides = {a_stride0, a_stride1, a_stride2};
75
+ MemRef memRef0 (a_allocated, a_aligned, a_offset, a_sizes, a_strides, 3 );
76
+
77
+ vector<int64_t > b_sizes = {b_size0, b_size1, b_size2};
78
+ vector<int64_t > b_strides = {b_stride0, b_stride1, b_stride2};
79
+ MemRef memRef1 (b_allocated, b_aligned, b_offset, b_sizes, b_strides, 3 );
80
+
81
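+   // bundle both operand MemRefs into a single read request for the CGRA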
+   DataReq input;
+   input.assembleReq(memRef0);
+   input.assembleReq(memRef1);
+
+   // prepare outputs
+   vector<int64_t> c_sizes = {c_size0, c_size1, c_size2};
+   vector<int64_t> c_strides = {c_stride0, c_stride1, c_stride2};
+   MemRef memRef2(c_allocated, c_aligned, c_offset, c_sizes, c_strides, 3);
+
+   DataReq output;
+   output.assembleReq(memRef2);
+
+   // issue READ/EXECUTE/WRITE requests for simulation
+   cgra->issueRD(input);
+   cgra->issueEX("batch_matmul");
+   cgra->issueWR(output, true);
+ }
+
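+ // Trailing-underscore variant of cgra_batch_matmul(). It repeats the same
+ // request assembly inline and additionally prints the received descriptor
+ // fields and the simulated cycle count.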
+ extern " C" void cgra_batch_matmul_ (float * a_allocated, float * a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_size2, int64_t a_stride0, int64_t a_stride1, int64_t a_stride2,
100
+ float * b_allocated, float * b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_size2, int64_t b_stride0, int64_t b_stride1, int64_t b_stride2,
101
+ float * c_allocated, float * c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_size2, int64_t c_stride0, int64_t c_stride1, int64_t c_stride2) {
102
+
103
+ // prepare inputs
104
+ vector<int64_t > a_sizes = {a_size0, a_size1, a_size2};
105
+ vector<int64_t > a_strides = {a_stride0, a_stride1, a_stride2};
106
+ MemRef memRef0 (a_allocated, a_aligned, a_offset, a_sizes, a_strides, 3 );
107
+
108
+ vector<int64_t > b_sizes = {b_size0, b_size1, b_size2};
109
+ vector<int64_t > b_strides = {b_stride0, b_stride1, b_stride2};
110
+ MemRef memRef1 (b_allocated, b_aligned, b_offset, b_sizes, b_strides, 3 );
111
+
112
+ DataReq input;
113
+ input.assembleReq (memRef0);
114
+ input.assembleReq (memRef1);
115
+
116
+ // prepare outputs
117
+ vector<int64_t > c_sizes = {c_size0, c_size1, c_size2};
118
+ vector<int64_t > c_strides = {c_stride0, c_stride1, c_stride2};
119
+ MemRef memRef2 (c_allocated, c_aligned, c_offset, c_sizes, c_strides, 3 );
120
+
121
+ DataReq output;
122
+ output.assembleReq (memRef2);
123
+
124
+ // issue READ/EXECUTE/WRITE requests for simulation
125
+ cgra->issueRD (input);
126
+ cgra->issueEX (" batch_matmul" );
127
+ cgra->issueWR (output, true );
128
+
129
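+   // debug output: dump the received descriptor fields and the simulated cycle count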
+ cout<<" calculated output for cgra_batch_matmul() a_alloc: " <<a_allocated<<" ; a_aligned: " <<a_aligned<<" ; a_offset: " <<a_offset<<" ; a_size0: " <<a_size0<<" ; a_size1: " <<a_size1<<" ; a_size2: " <<a_size2<<" ; a_stride0: " <<a_stride0<<" ; a_stride1: " <<a_stride1<<" ; a_stride2: " <<a_stride2<<endl;
130
+ cout<<" calculated output for cgra_batch_matmul() b_alloc: " <<b_allocated<<" ; b_aligned: " <<b_aligned<<" ; b_offset: " <<b_offset<<" ; b_size0: " <<b_size0<<" ; b_size1: " <<b_size1<<" ; b_size2: " <<b_size2<<" ; b_stride0: " <<b_stride0<<" ; b_stride1: " <<b_stride1<<" ; b_stride2: " <<b_stride2<<endl;
131
+ cout<<" calculated output for cgra_batch_matmul() c_alloc: " <<c_allocated<<" ; c_aligned: " <<c_aligned<<" ; c_offset: " <<c_offset<<" ; c_size0: " <<c_size0<<" ; c_size1: " <<c_size1<<" ; c_size2: " <<c_size2<<" ; c_stride0: " <<c_stride0<<" ; c_stride1: " <<c_stride1<<" ; c_stride2: " <<c_stride2<<endl;
132
+ cout<<" check total cycles: " <<cgra->getTotalCycles ()<<endl;
133
+ }
134
+
// This fusion is an example of add+max+add. A robust fusion call should
// be able to figure out what type of operation chain is targeted.
extern "C" void cgra_fusion_add_max_add(float* a_allocated, float* a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_stride0, int64_t a_stride1,