# quantize.py (forked from NVIDIA/TensorRT-LLM)
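"""Post-training quantization entry point for TensorRT-LLM.

Quantizes either a HuggingFace model (--model_dir) or a NeMo checkpoint
(--nemo_ckpt_path), runs calibration, and exports the result as a
TensorRT-LLM checkpoint. This docstring is surfaced as the --help
description via ``argparse.ArgumentParser(description=__doc__)`` below.

Example invocation (paths and values are illustrative, not prescriptive):

    python quantize.py --model_dir ./llama-hf --qformat fp8 \
        --kv_cache_dtype fp8 --output_dir ./llama-fp8
"""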
import argparse

import torch.multiprocessing as mp

from tensorrt_llm.quantization import (quantize_and_export,
                                       quantize_nemo_and_export)
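
# Calibration may use multiprocessing; CUDA cannot be re-initialized in a
# forked child process, so force the "spawn" start method up front.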
mp.set_start_method("spawn", force=True)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)

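    # Model source: provide exactly one of --model_dir (HuggingFace) or
    # --nemo_ckpt_path (NeMo).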
parser.add_argument("--model_dir",
help="Specify where the HuggingFace model is",
default=None)
parser.add_argument('--nemo_ckpt_path',
help="Specify where the NeMo checkpoint is",
default=None)
parser.add_argument(
'--decoder_type',
type=str,
default='gptnext',
choices=['gptnext', 'llama'],
help="Decoder type; effective for NeMo checkpoint only.")
    parser.add_argument(
        '--device',
        help=
        "The device to run calibration on; effective for HuggingFace models only.",
        default='cuda',
        choices=['cuda', 'cpu'])
    parser.add_argument(
        "--device_map",
        help="How to map the model onto the available devices",
        default="auto",
        choices=["auto", "sequential", "cpu", "gpu"],
    )
    parser.add_argument(
        '--calib_dataset',
        type=str,
        default='cnn_dailymail',
        help=
        "The HuggingFace dataset name or the local directory of the dataset used for calibration."
    )
    parser.add_argument(
        '--calib_tp_size',
        type=int,
        default=1,
        help=
        "Tensor parallel size for calibration; effective for NeMo checkpoint only."
    )
    parser.add_argument(
        '--calib_pp_size',
        type=int,
        default=1,
        help=
        "Pipeline parallel size for calibration; effective for NeMo checkpoint only."
    )
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'float16', 'bfloat16', 'float32'],
        help=
        "The data type for the model weights and activations of the non-quantized part, e.g., embedding and lm_head. "
        "If 'auto', the data type is automatically inferred from the source model; "
        "however, if the source dtype is float32, it is converted to float16.")
    parser.add_argument(
        "--qformat",
        help="Quantization format.",
        default="full_prec",
        choices=[
            "nvfp4",
            "fp8",
            "int8_sq",
            "int4_awq",
            "w4a8_awq",
            "int8_wo",
            "int4_wo",
            "full_prec",
        ],
    )
    parser.add_argument(
        "--seed",
        help="Seed for the random number generators; the value is used to call "
        "random.seed(value) and numpy.random.seed(value)",
        type=int,
        default=1234)
parser.add_argument("--tokenizer_max_seq_length",
help="Max sequence length to init the tokenizers",
type=int,
default=2048)
parser.add_argument("--batch_size",
help="Batch size for calibration.",
type=int,
default=1)
parser.add_argument("--calib_size",
help="Number of samples for calibration.",
type=int,
default=512)
parser.add_argument("--calib_max_seq_length",
help="Max sequence length for calibration",
type=int,
default=512)
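    # Sharding of the exported checkpoint: tensor, pipeline, and context
    # parallel sizes.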
parser.add_argument("--output_dir", default="exported_model")
parser.add_argument("--tp_size", type=int, default=1)
parser.add_argument("--pp_size", type=int, default=1)
parser.add_argument("--cp_size", type=int, default=1)
parser.add_argument("--awq_block_size", type=int, default=128)
parser.add_argument("--kv_cache_dtype",
help="KV Cache dtype.",
default=None,
choices=["int8", "fp8", None])
parser.add_argument("--quantize_lm_head",
action='store_true',
default=False)
    # Medusa (speculative decoding) options
    parser.add_argument('--num_medusa_heads', type=int, default=4)
    parser.add_argument('--num_medusa_layers', type=int, default=1)
    parser.add_argument('--max_draft_len', type=int, default=63)
    parser.add_argument('--medusa_hidden_act', type=str, default="silu")
    parser.add_argument('--medusa_model_dir', type=str, default=None)
    parser.add_argument('--quant_medusa_head',
                        default=False,
                        action='store_true',
                        help="Whether to quantize the weights of the Medusa heads.")
    # auto quantization
    parser.add_argument(
        '--autoq_format',
        default=None,
        type=str,
        help=
        "Quantization algorithms to search over during auto quantization. "
        "Each algorithm must be one of ['fp8', 'int4_awq', 'w4a8_awq', 'int8_sq']. "
        "Use ',' to separate multiple algorithms (e.g. --autoq_format fp8,int4_awq,w4a8_awq). "
        "Note: fp8 and int8_sq cannot be used at the same time.")
    parser.add_argument(
        '--auto_quantize_bits',
        type=float,
        default=None,
        help="Effective bits constraint for auto quantization. If not set, "
        "regular quantization without the auto quantization search is applied. "
        "This value cannot be lower than the num_bits of the most aggressive "
        "quantization format; for example, if 'int4_awq' is in autoq_format, "
        "it cannot be lower than 4.0.")

    args = parser.parse_args()

    # auto_quantize_bits check: the lower bound tracks the most aggressive
    # format requested (4 if any 4-bit format appears in autoq_format,
    # otherwise 8); 16 effective bits is the upper bound.
    if args.autoq_format:
        lower_bound, upper_bound = 4 if '4' in args.autoq_format else 8, 16
        if args.auto_quantize_bits is None or args.auto_quantize_bits < lower_bound or args.auto_quantize_bits > upper_bound:
            print(
                f"Invalid auto_quantize_bits value; falling back to {lower_bound}."
            )
            args.auto_quantize_bits = lower_bound
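
    # Dispatch on the checkpoint source; the HuggingFace path takes precedence
    # when both --model_dir and --nemo_ckpt_path are supplied.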
    if args.model_dir is not None:
        quantize_and_export(
            model_dir=args.model_dir,
            device=args.device,
            calib_dataset=args.calib_dataset,
            dtype=args.dtype,
            qformat=args.qformat
            if args.auto_quantize_bits is None else args.autoq_format,
            kv_cache_dtype=args.kv_cache_dtype,
            calib_size=args.calib_size,
            batch_size=args.batch_size,
            calib_max_seq_length=args.calib_max_seq_length,
            awq_block_size=args.awq_block_size,
            output_dir=args.output_dir,
            tp_size=args.tp_size,
            pp_size=args.pp_size,
            cp_size=args.cp_size,
            seed=args.seed,
            tokenizer_max_seq_length=args.tokenizer_max_seq_length,
            num_medusa_heads=args.num_medusa_heads,
            num_medusa_layers=args.num_medusa_layers,
            max_draft_len=args.max_draft_len,
            medusa_hidden_act=args.medusa_hidden_act,
            medusa_model_dir=args.medusa_model_dir,
            quant_medusa_head=args.quant_medusa_head,
            auto_quantize_bits=args.auto_quantize_bits,
            device_map=args.device_map,
            quantize_lm_head=args.quantize_lm_head)
    elif args.nemo_ckpt_path is not None:
        quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path,
                                 decoder_type=args.decoder_type,
                                 calib_dataset=args.calib_dataset,
                                 calib_tp_size=args.calib_tp_size,
                                 calib_pp_size=args.calib_pp_size,
                                 dtype=args.dtype,
                                 qformat=args.qformat,
                                 kv_cache_dtype=args.kv_cache_dtype,
                                 calib_size=args.calib_size,
                                 batch_size=args.batch_size,
                                 calib_max_seq_length=args.calib_max_seq_length,
                                 awq_block_size=args.awq_block_size,
                                 output_dir=args.output_dir,
                                 tp_size=args.tp_size,
                                 pp_size=args.pp_size,
                                 cp_size=args.cp_size,
                                 seed=args.seed)
    else:
        raise ValueError(
            "One source checkpoint (--model_dir or --nemo_ckpt_path) must be "
            "specified.")
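
# Example NeMo invocation (values are illustrative, not prescriptive):
#   python quantize.py --nemo_ckpt_path ./model.nemo --decoder_type llama \
#       --qformat int4_awq --calib_tp_size 2 --output_dir ./exported_model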