Skip to content

Commit

Permalink
Add initial support for compressed-tensors checkpoints
Browse files Browse the repository at this point in the history
compressed-tensors is a safetensors extension for sparse, quantized
tensors. The format is more powerful than the earlier AWQ/GPTQ/FP8
quantization formats because:

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight
  quantizers.
- Exclusions from quantization are configurable.

This change adds a dependency on the `compressed-tensors` package for
its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
  • Loading branch information
danieldk committed Nov 7, 2024
1 parent b1f9044 commit f4fd8d1
Show file tree
Hide file tree
Showing 21 changed files with 2,021 additions and 63 deletions.
7 changes: 4 additions & 3 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25073242,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.921875,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027918816,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.38891602,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 5380,
"logprob": -0.23840332,
"special": false,
"text": "?\n"
},
{
"id": 34564,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 6975,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 11,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 1101,
"logprob": -1.2011719,
"special": false,
"text": " also"
},
{
"id": 3967,
"logprob": 0.0,
"special": false,
"text": " known"
},
{
"id": 439,
"logprob": 0.0,
"special": false,
"text": " as"
},
{
"id": 30828,
"logprob": 0.0,
"special": false,
"text": " neural"
},
{
"id": 4009,
"logprob": -0.6777344,
"special": false,
"text": " network"
},
{
"id": 477,
"logprob": 0.0,
"special": false,
"text": " or"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}
Loading

0 comments on commit f4fd8d1

Please sign in to comment.