♻️ use fp8 model for testing SB + CB #359
In the CI workflow, the "quantized" test suite now reuses the standard decoder test markers instead of a dedicated `quantized` marker (the FP8 model is selected through an environment variable further down):

```diff
@@ -42,7 +42,7 @@ jobs:
             markers: "cpu and decoder and not cb"
             flags: "--timeout=300"
           - name: "quantized"
-            markers: "cpu and quantized"
+            markers: "cpu and decoder"
             flags: "--timeout=300"
           - name: "embedding"
             markers: "cpu and embedding"
```
When that suite runs, the step exports the FP8 checkpoint before invoking the shared pytest command:

```diff
@@ -179,5 +179,9 @@ jobs:
           # re-install the vllm_sypre package from source
           source .venv/bin/activate
 
+          if [ "${{ matrix.test_suite.name }}" == "quantized" ]; then
+            export VLLM_SPYRE_TEST_MODEL_LIST="ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          fi
+
           python3 -m pytest ${{ matrix.test_suite.flags }} \
             tests -v -m "${{ matrix.test_suite.markers }}"
```

**Collaborator (author)**, on the new export: I guess a better way to do this would be to select the quantized model when using …

**Collaborator (author)**: I did a major refactoring on this file just because we were using the same logic for the tests but different parameters. Do double-check and let me know if I missed something!
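The diff does not show how `get_spyre_model_list()` resolves models, so the following is only a minimal sketch of how such a helper could honor the `VLLM_SPYRE_TEST_MODEL_LIST` override set by the CI step above; the default model and the parsing logic are assumptions, not the repo's actual code:

```python
# Hypothetical sketch only: how a helper like get_spyre_model_list() could pick
# up the VLLM_SPYRE_TEST_MODEL_LIST override exported by CI. The real helper in
# this repo may work differently.
import os

# Assumed default; the actual default model list is not shown in this diff.
_DEFAULT_MODELS = ["ibm-ai-platform/micro-g3.3-8b-instruct-1b"]


def get_spyre_model_list(quantized: str | None = None) -> list[str]:
    override = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST")
    if override:
        # CI sets this to the FP8 checkpoint for the "quantized" suite;
        # treat it as a comma-separated list of model IDs.
        return [m.strip() for m in override.split(",") if m.strip()]
    # Fall back to the default list (or a quantized variant, if requested).
    return list(_DEFAULT_MODELS)
```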
In the online-serving tests, `test_openai_serving` is reformatted and now also parametrized over continuous batching (`cb`), `max_num_seqs`, and `max_model_len`:

```diff
@@ -4,19 +4,39 @@
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("tp_size", [
-    pytest.param(1, marks=pytest.mark.basic),
-    pytest.param(2, marks=pytest.mark.multi),
-    pytest.param(4, marks=pytest.mark.multi),
-    pytest.param(8, marks=pytest.mark.multi),
-],
-                         ids=lambda val: f"TP({val})")
+@pytest.mark.parametrize(
+    "tp_size",
+    [
+        pytest.param(1, marks=pytest.mark.basic),
+        pytest.param(2, marks=pytest.mark.multi),
+        pytest.param(4, marks=pytest.mark.multi),
+        pytest.param(8, marks=pytest.mark.multi),
+    ],
+    ids=lambda val: f"TP({val})",
+)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[
-    (64, 20, 1),
-]])
-def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        tp_size):
+@pytest.mark.parametrize(
+    "warmup_shape",
+    [[
+        (64, 20, 1),
+    ]],
+)
+@pytest.mark.parametrize("cb",
+                         [pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
+@pytest.mark.parametrize("max_num_seqs", [2],
+                         ids=lambda val: f"max_num_seqs({val})")
+@pytest.mark.parametrize("max_model_len", [256],
+                         ids=lambda val: f"max_model_len({val})")
+def test_openai_serving(
+    remote_openai_server,
+    model,
+    warmup_shape,
+    backend,
+    tp_size,
+    cb,
+    max_num_seqs,
+    max_model_len,
+):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
```
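The `remote_openai_server` fixture itself is not part of this diff, so the following is only a rough, hypothetical sketch of how the new parameters might map onto a `vllm serve` launch. The `--max-num-seqs` and `--max-model-len` flags are standard vLLM server options, but the `VLLM_SPYRE_USE_CB` toggle and the fixture internals are assumptions:

```python
# Hypothetical sketch of how a fixture like remote_openai_server might map the
# new test parameters onto a `vllm serve` launch; the real fixture may differ.
import os
import subprocess


def launch_openai_server(model: str, cb: int, max_num_seqs: int,
                         max_model_len: int) -> subprocess.Popen:
    env = dict(os.environ)
    if cb:
        # Assumed env toggle for continuous batching on the Spyre backend.
        env["VLLM_SPYRE_USE_CB"] = "1"
    args = [
        "vllm", "serve", model,
        "--max-num-seqs", str(max_num_seqs),
        "--max-model-len", str(max_model_len),
    ]
    return subprocess.Popen(args, env=env)
```

For the `cb=0` case the fixture presumably derives the static-batching warmup configuration from the `warmup_shape` parameter instead.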
After the shared completion checks, CB runs exit early; the remaining error-handling assertions apply only to static batching:

```diff
@@ -35,6 +55,9 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
     assert len(completion.choices) == 2
     assert len(completion.choices[0].text) > 0
 
+    # rest are SB tests
+    if cb:
+        return
     # Check some basic error handling as well. This is all done in one test
     # now to avoid server boot-up overhead to test each case.
     # To change this we'll need:
```

**Collaborator (author)**, on the `# rest are SB tests` comment: I think?
The dedicated GPTQ, FP8, and CB serving tests are removed; their coverage is now provided by the parametrized `test_openai_serving` above, with the model selected through `VLLM_SPYRE_TEST_MODEL_LIST` in CI:

```diff
@@ -55,83 +78,3 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
                                        max_tokens=25)
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
-
-
-@pytest.mark.skip(reason="Test disabled until a model is available")
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="gptq"))
-@pytest.mark.parametrize("backend", ["sendnn"])
-@pytest.mark.parametrize("quantization", ["gptq"])
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_gptq(remote_openai_server, model, backend,
-                             warmup_shape, quantization):
-    """Test online serving a GPTQ model with the sendnn backend only"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="fp8"))
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_fp8(remote_openai_server, model, backend,
-                            warmup_shape):
-    """Test online serving an FP8 model"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.basic
-@pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("cb",
-                         [pytest.param(1, marks=pytest.mark.cb, id="cb")])
-@pytest.mark.parametrize("max_num_seqs", [2],
-                         ids=lambda val: f"max_num_seqs({val})")
-@pytest.mark.parametrize("max_model_len", [256],
-                         ids=lambda val: f"max_model_len({val})")
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-def test_openai_serving_cb(remote_openai_server, model, backend, cb,
-                           max_num_seqs, max_model_len):
-    """Test online serving with CB using the `vllm serve` CLI"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
```
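To mirror the CI "quantized" suite locally, the workflow's two ingredients can be reused directly: export `VLLM_SPYRE_TEST_MODEL_LIST="ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"` and run `python3 -m pytest tests -v -m "cpu and decoder" --timeout=300` (the markers and flags that suite now uses), which is how the FP8 checkpoint gets exercised for both SB and CB serving.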