♻️ use fp8 model for testing SB + CB #359
Collaborator (Author):
I did a major refactoring on this file just because we were using the same logic for the tests but different parameters. Do double-check and let me know if I missed something!
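To make the consolidation concrete before reading the diff: the pattern is a single test parametrized over a `cb` flag, where `pytest.param(..., marks=...)` attaches a selection mark to just that case, so `pytest -m cb` runs only the continuous-batching variant. A minimal, self-contained sketch of the pattern in isolation (the test name and body here are illustrative, not the project's code):

```python
import pytest


@pytest.mark.parametrize(
    "cb",
    # `id="cb"` names the case in the test ID; the `cb` mark makes it
    # selectable with `pytest -m cb` (or excludable with `-m "not cb"`).
    [pytest.param(1, marks=pytest.mark.cb, id="cb"), 0],
)
def test_serving(cb):
    # Shared checks run for both static batching (cb=0) and
    # continuous batching (cb=1).
    assert cb in (0, 1)
    if cb:
        return  # the checks below only apply to static batching
    # static-batching-only checks would go here
```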
```diff
@@ -4,19 +4,39 @@
 @pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("tp_size", [
-    pytest.param(1, marks=pytest.mark.basic),
-    pytest.param(2, marks=pytest.mark.multi),
-    pytest.param(4, marks=pytest.mark.multi),
-    pytest.param(8, marks=pytest.mark.multi),
-],
-                         ids=lambda val: f"TP({val})")
+@pytest.mark.parametrize(
+    "tp_size",
+    [
+        pytest.param(1, marks=pytest.mark.basic),
+        pytest.param(2, marks=pytest.mark.multi),
+        pytest.param(4, marks=pytest.mark.multi),
+        pytest.param(8, marks=pytest.mark.multi),
+    ],
+    ids=lambda val: f"TP({val})",
+)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[
-    (64, 20, 1),
-]])
-def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        tp_size):
+@pytest.mark.parametrize(
+    "warmup_shape",
+    [[
+        (64, 20, 1),
+    ]],
+)
+@pytest.mark.parametrize("cb",
+                         [pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
+@pytest.mark.parametrize("max_num_seqs", [2],
+                         ids=lambda val: f"max_num_seqs({val})")
+@pytest.mark.parametrize("max_model_len", [256],
+                         ids=lambda val: f"max_model_len({val})")
+def test_openai_serving(
+    remote_openai_server,
+    model,
+    warmup_shape,
+    backend,
+    tp_size,
+    cb,
+    max_num_seqs,
+    max_model_len,
+):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
```
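Aside: `basic`, `multi`, and `cb` are custom marks, so the project presumably registers them already (in `pyproject.toml` or an existing `conftest.py`). If it did not, unknown-mark warnings could be silenced with a registration hook like this hypothetical sketch:

```python
# conftest.py -- hypothetical sketch; the project likely registers
# these marks already, so this is for orientation only.
def pytest_configure(config):
    config.addinivalue_line("markers", "basic: small single-device tests")
    config.addinivalue_line("markers", "multi: multi-device / tensor-parallel tests")
    config.addinivalue_line("markers", "cb: continuous batching tests")
```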
```diff
@@ -35,6 +55,9 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
     assert len(completion.choices) == 2
     assert len(completion.choices[0].text) > 0
 
+    # rest are SB tests
+    if cb:
+        return
     # Check some basic error handling as well. This is all done in one test
     # now to avoid server boot-up overhead to test each case.
     # To change this we'll need:
```

Collaborator (Author), on `# rest are SB tests`:
I think?
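One consequence of gating with a bare `return`: the CB variants report the remaining static-batching-only checks as passed rather than skipped. A hypothetical alternative (not what this PR does) would surface the gating in the test report with an explicit skip:

```python
import pytest


def guard_sb_only(cb: int) -> None:
    # Hypothetical helper: calling pytest.skip() mid-test marks the whole
    # test as skipped, distinguishing "did not apply" from "passed".
    if cb:
        pytest.skip("error-handling checks only apply to static batching")
```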
```diff
@@ -55,83 +78,3 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
                                             max_tokens=25)
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
-
-
-@pytest.mark.skip(reason="Test disabled until a model is available")
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="gptq"))
-@pytest.mark.parametrize("backend", ["sendnn"])
-@pytest.mark.parametrize("quantization", ["gptq"])
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_gptq(remote_openai_server, model, backend,
-                             warmup_shape, quantization):
-    """Test online serving a GPTQ model with the sendnn backend only"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="fp8"))
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_fp8(remote_openai_server, model, backend,
-                            warmup_shape):
-    """Test online serving an FP8 model"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.basic
-@pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("cb",
-                         [pytest.param(1, marks=pytest.mark.cb, id="cb")])
-@pytest.mark.parametrize("max_num_seqs", [2],
-                         ids=lambda val: f"max_num_seqs({val})")
-@pytest.mark.parametrize("max_model_len", [256],
-                         ids=lambda val: f"max_model_len({val})")
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-def test_openai_serving_cb(remote_openai_server, model, backend, cb,
-                           max_num_seqs, max_model_len):
-    """Test online serving with CB using the `vllm serve` CLI"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
```
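All three removed tests repeated the same client-side smoke check, which now lives once in the parametrized `test_openai_serving`. For reference, a standalone sketch of that check against any OpenAI-compatible endpoint (the base URL, API key, and model name are placeholders, not project configuration):

```python
import openai

# Placeholders: point at wherever `vllm serve` is listening.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Greedy decoding: exactly one choice with non-empty text.
completion = client.completions.create(model="my-model",
                                       prompt="Hello World!",
                                       max_tokens=5,
                                       temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

# Sampling with n=2: two choices for the same prompt.
completion = client.completions.create(model="my-model",
                                       prompt="Hello World!",
                                       max_tokens=5,
                                       temperature=1.0,
                                       n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0
```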