forked from fpgaminer/GPTQ-triton
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: table.html
105 lines (105 loc) · 19.3 KB
/
table.html
<!-- (line-number gutter 1–105 from the rendered source view removed: extraction artifact, not file content) -->
<style>
  .tg {border-collapse:collapse;border-spacing:0;}
  .tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;overflow:hidden;padding:10px 5px;word-break:normal;}
  .tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
  .tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
</style>
<!--
  Per-tensor dump of a 4-bit GPTQ-quantized 32-layer transformer
  (dim=4096, intermediate_dim=11008, vocab_size=32000, group_size=128,
  recorded at bsz=32, seqlen=128). qweight rows are dim/8 because eight
  4-bit values are packed into each int32.
  NOTE(review): the "dim" in self_attn_sqrt(dim) presumably refers to
  head_dim for attention scaling — confirm against the dumping code.
-->
<table class="tg">
  <caption>Tensor names, symbolic shapes, concrete shapes, and dtypes (bsz=32, seqlen=128)</caption>
  <thead>
    <tr>
      <th class="tg-c3ow" scope="col">name</th>
      <th class="tg-c3ow" scope="col">shape (symbolic)</th>
      <th class="tg-c3ow" scope="col">shape (concrete)</th>
      <th class="tg-c3ow" scope="col">type</th>
    </tr>
  </thead>
  <tbody>
    <tr><th class="tg-c3ow" scope="row">model.embed_tokens_input</th><td class="tg-c3ow">(bsz, seqlen)</td><td class="tg-c3ow">(32, 128)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.embed_tokens_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.embed_tokens_weight</th><td class="tg-c3ow">(vocab_size, dim)</td><td class="tg-c3ow">(32000, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.input_layernorm_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.input_layernorm_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.input_layernorm_weight</th><td class="tg-c3ow">(dim,)</td><td class="tg-c3ow">(4096,)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_qweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_kweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_vweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_qzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_kzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_vzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_qscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_kscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_vscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_q_before_rope</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_k_before_rope</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_Q</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_K</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_V</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_QK^T</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_sqrt(dim)</th><td class="tg-c3ow">(1,)</td><td class="tg-c3ow">(1,)</td><td class="tg-c3ow">torch.float32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_QK^T|sqrt(dim)</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_QK^T|sqrt(dim)_aftermask</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_softmax(QK^T|sqrt(dim)_aftermask)</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_qkv_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_oweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_ozeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_oscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.self_attn_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.post_attention_layernorm_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.post_attention_layernorm_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.post_attention_layernorm_weight</th><td class="tg-c3ow">(dim,)</td><td class="tg-c3ow">(4096,)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_gateweight</th><td class="tg-c3ow">(dim / 8, intermediate_dim)</td><td class="tg-c3ow">(512, 11008)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_gatezeros</th><td class="tg-c3ow">(dim / group_size, intermediate_dim / 8)</td><td class="tg-c3ow">(32, 1376)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_gatescales</th><td class="tg-c3ow">(dim / group_size, intermediate_dim)</td><td class="tg-c3ow">(32, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_upweight</th><td class="tg-c3ow">(dim / 8, intermediate_dim)</td><td class="tg-c3ow">(512, 11008)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_upzeros</th><td class="tg-c3ow">(dim / group_size, intermediate_dim / 8)</td><td class="tg-c3ow">(32, 1376)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_upscales</th><td class="tg-c3ow">(dim / group_size, intermediate_dim)</td><td class="tg-c3ow">(32, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_downweight</th><td class="tg-c3ow">(intermediate_dim / 8, dim)</td><td class="tg-c3ow">(1376, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_downzeros</th><td class="tg-c3ow">(intermediate_dim / group_size, dim / 8)</td><td class="tg-c3ow">(86, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_downscales</th><td class="tg-c3ow">(intermediate_dim / group_size, dim)</td><td class="tg-c3ow">(86, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_gate(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_silu(gate(x))</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_up(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_silu(gate(x))*up(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.0.mlp_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.input_layernorm_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.input_layernorm_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.input_layernorm_weight</th><td class="tg-c3ow">(dim,)</td><td class="tg-c3ow">(4096,)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_qweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_kweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_vweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_qzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_kzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_vzeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_qscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_kscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_vscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_q_before_rope</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_k_before_rope</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_Q</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_K</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_V</th><td class="tg-c3ow">(bsz, n_heads, seqlen, head_dim)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_QK^T</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_sqrt(dim)</th><td class="tg-c3ow">(1,)</td><td class="tg-c3ow">(1,)</td><td class="tg-c3ow">torch.float32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_QK^T|sqrt(dim)</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_QK^T|sqrt(dim)_aftermask</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_softmax(QK^T|sqrt(dim)_aftermask)</th><td class="tg-c3ow">(bsz, n_heads, seqlen, seqlen)</td><td class="tg-c3ow">(32, 32, 128, 128)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_qkv_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_oweight</th><td class="tg-c3ow">(dim / 8, dim)</td><td class="tg-c3ow">(512, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_ozeros</th><td class="tg-c3ow">(dim / group_size, dim / 8)</td><td class="tg-c3ow">(32, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_oscales</th><td class="tg-c3ow">(dim / group_size, dim)</td><td class="tg-c3ow">(32, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.self_attn_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.post_attention_layernorm_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.post_attention_layernorm_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.post_attention_layernorm_weight</th><td class="tg-c3ow">(dim,)</td><td class="tg-c3ow">(4096,)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_gateweight</th><td class="tg-c3ow">(dim / 8, intermediate_dim)</td><td class="tg-c3ow">(512, 11008)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_gatezeros</th><td class="tg-c3ow">(dim / group_size, intermediate_dim / 8)</td><td class="tg-c3ow">(32, 1376)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_gatescales</th><td class="tg-c3ow">(dim / group_size, intermediate_dim)</td><td class="tg-c3ow">(32, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_upweight</th><td class="tg-c3ow">(dim / 8, intermediate_dim)</td><td class="tg-c3ow">(512, 11008)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_upzeros</th><td class="tg-c3ow">(dim / group_size, intermediate_dim / 8)</td><td class="tg-c3ow">(32, 1376)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_upscales</th><td class="tg-c3ow">(dim / group_size, intermediate_dim)</td><td class="tg-c3ow">(32, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_downweight</th><td class="tg-c3ow">(intermediate_dim / 8, dim)</td><td class="tg-c3ow">(1376, 4096)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_downzeros</th><td class="tg-c3ow">(intermediate_dim / group_size, dim / 8)</td><td class="tg-c3ow">(86, 512)</td><td class="tg-c3ow">torch.int32</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_downscales</th><td class="tg-c3ow">(intermediate_dim / group_size, dim)</td><td class="tg-c3ow">(86, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_gate(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_silu(gate(x))</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_up(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_silu(gate(x))*up(x)</th><td class="tg-c3ow">(bsz, seqlen, intermediate_dim)</td><td class="tg-c3ow">(32, 128, 11008)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.layers.31.mlp_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.norm_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.norm_output</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">model.norm_weight</th><td class="tg-c3ow">(dim,)</td><td class="tg-c3ow">(4096,)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">lm_head_input</th><td class="tg-c3ow">(bsz, seqlen, dim)</td><td class="tg-c3ow">(32, 128, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">lm_head_output</th><td class="tg-c3ow">(bsz, seqlen, vocab_size)</td><td class="tg-c3ow">(32, 128, 32000)</td><td class="tg-c3ow">torch.float16</td></tr>
    <tr><th class="tg-c3ow" scope="row">lm_head_weight</th><td class="tg-c3ow">(vocab_size, dim)</td><td class="tg-c3ow">(32000, 4096)</td><td class="tg-c3ow">torch.float16</td></tr>
  </tbody>
</table>