@@ -62,6 +62,16 @@ constexpr std::array<LayerAttentionType, kNum> FixedLayerConfig(
  return config;
}

+template <size_t kNum>
+constexpr std::array<size_t, kNum> FixedAttentionWindowSizes(
+    size_t window_size) {
+  std::array<size_t, kNum> window_size_configs = {};
+  for (size_t& l : window_size_configs) {
+    l = window_size;
+  }
+  return window_size_configs;
+}
+
template <size_t kNumLayers>
constexpr size_t NumLayersOfTypeBefore(
    const std::array<LayerAttentionType, kNumLayers>& layers,
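The helper above fills every element of the returned array with the same window size at compile time. A minimal sketch of what a call would produce (the four-layer instantiation below is hypothetical, chosen only for illustration):

  // All four layers get the same 4096-token attention window.
  constexpr std::array<size_t, 4> kWindows = FixedAttentionWindowSizes<4>(4096);
  static_assert(kWindows[0] == 4096 && kWindows[3] == 4096, "uniform windows");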
@@ -114,10 +124,16 @@ template <typename TWeight>
struct ConfigGemma27B : public ConfigCapNoSSM {
  using Weight = TWeight;  // make accessible where we only have a TConfig

-  static constexpr int kSeqLen = gcpp::kSeqLen;
+  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = 256000;
  static constexpr std::array<LayerAttentionType, 46> kLayerConfig =
      FixedLayerConfig<46>(LayerAttentionType::kGemma);
+  static constexpr std::array<size_t, 46> kAttentionWindowSizes = {
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen};
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 4608;
@@ -134,10 +150,16 @@ template <typename TWeight>
struct ConfigGemma9B : public ConfigCapNoSSM {
  using Weight = TWeight;  // make accessible where we only have a TConfig

-  static constexpr int kSeqLen = gcpp::kSeqLen;
+  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = 256000;
  static constexpr std::array<LayerAttentionType, 42> kLayerConfig =
      FixedLayerConfig<42>(LayerAttentionType::kGemma);
+  static constexpr std::array<size_t, 42> kAttentionWindowSizes = {
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen, 4096, kSeqLen,
+      4096, kSeqLen};
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3584;
@@ -158,6 +180,8 @@ struct ConfigGemma7B : public ConfigNoCapNoSSM {
  static constexpr int kVocabSize = 256000;
  static constexpr std::array<LayerAttentionType, 28> kLayerConfig =
      FixedLayerConfig<28>(LayerAttentionType::kGemma);
+  static constexpr std::array<size_t, 28> kAttentionWindowSizes =
+      FixedAttentionWindowSizes<28>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3072;
@@ -178,6 +202,8 @@ struct ConfigGemma2B : public ConfigNoCapNoSSM {
  static constexpr int kVocabSize = 256000;
  static constexpr std::array<LayerAttentionType, 18> kLayerConfig =
      FixedLayerConfig<18>(LayerAttentionType::kGemma);
+  static constexpr std::array<size_t, 18> kAttentionWindowSizes =
+      FixedAttentionWindowSizes<18>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 2048;
@@ -198,6 +224,8 @@ struct ConfigGemmaTiny : public ConfigNoSSM {
  static constexpr int kVocabSize = 64;
  static constexpr std::array<LayerAttentionType, 3> kLayerConfig =
      FixedLayerConfig<3>(LayerAttentionType::kGemma);
+  static constexpr std::array<size_t, 3> kAttentionWindowSizes =
+      FixedAttentionWindowSizes<3>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 128;
@@ -250,6 +278,8 @@ struct ConfigGriffin2B {
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
  };
+  static constexpr std::array<size_t, 26> kAttentionWindowSizes =
+      FixedAttentionWindowSizes<26>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers =
      NumLayersOfTypeBefore(kLayerConfig, LayerAttentionType::kGemma, kLayers);
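Net effect: ConfigGemma27B and ConfigGemma9B alternate a 4096-token sliding window with the full kSeqLen (8192) on every other layer, while the remaining configs keep the full sequence length on every layer via FixedAttentionWindowSizes. A minimal sketch of how a per-layer window size could be consumed when bounding how far back attention reaches, assuming a TConfig that carries the new kAttentionWindowSizes member (the StartPos helper below is illustrative, not this repository's attention code):

  // First KV-cache position a token at `pos` may attend to in `layer`;
  // 0 means the whole prefix is visible (full attention).
  template <class TConfig>
  constexpr size_t StartPos(size_t pos, size_t layer) {
    const size_t window_size = TConfig::kAttentionWindowSizes[layer];
    return pos >= window_size ? pos - (window_size - 1) : 0;
  }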