
Commit 183b125

Commit message: Update 2025-05-02 05:50:06
Parent: bec895a

40 files changed: +7708 -7772 lines

Files changed (large diffs are not rendered by default):

- _sources/backend/function_calling.ipynb (+177/-188)
- _sources/backend/lora.ipynb (+255/-259)
- _sources/backend/native_api.ipynb (+273/-323)
- _sources/backend/offline_engine_api.ipynb (+428/-455)
- _sources/backend/openai_api_completions.ipynb (+283/-187)
- _sources/backend/openai_api_embeddings.ipynb (+62/-85)
- _sources/backend/openai_api_vision.ipynb (+81/-93)
- _sources/backend/send_request.ipynb (+97/-94)
- _sources/backend/separate_reasoning.ipynb (+113/-136)
- _sources/backend/speculative_decoding.ipynb (+289/-394)
- _sources/backend/structured_outputs.ipynb (+157/-155)
- _sources/backend/structured_outputs_for_reasoning_models.ipynb (+533/-476)
- _sources/frontend/frontend.ipynb (+234/-228)
- backend/function_calling.html (+103/-102)
- backend/function_calling.ipynb (+177/-188)
- backend/lora.html (+207/-205)
- backend/lora.ipynb (+255/-259)
- backend/native_api.html (+191/-175)
- backend/native_api.ipynb (+273/-323)
- backend/offline_engine_api.html (+36/-57)
- backend/offline_engine_api.ipynb (+428/-455)
- backend/openai_api_completions.html (+165/-121)
- backend/openai_api_completions.ipynb (+283/-187)

backend/openai_api_embeddings.html (+38/-37), diff shown below:
@@ -507,35 +507,35 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:19] server_args=ServerArgs(model_path=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, tokenizer_path=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, tokenizer_mode=&#39;auto&#39;, skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format=&#39;auto&#39;, trust_remote_code=False, dtype=&#39;auto&#39;, kv_cache_dtype=&#39;auto&#39;, quantization=None, quantization_param_path=None, context_length=None, device=&#39;cuda&#39;, served_model_name=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, chat_template=None, completion_template=None, is_embedding=True, revision=None, host=&#39;0.0.0.0&#39;, port=32531, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy=&#39;fcfs&#39;, schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=162562981, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level=&#39;info&#39;, log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path=&#39;sglang_storage&#39;, enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method=&#39;round_robin&#39;, ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args=&#39;{}&#39;, lora_paths=None, max_loras_per_batch=8, lora_backend=&#39;triton&#39;, attention_backend=None, sampling_backend=&#39;flashinfer&#39;, grammar_backend=&#39;xgrammar&#39;, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type=&#39;qk&#39;, ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode=&#39;auto&#39;, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config=&#39;&#39;, enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy=&#39;write_through_selective&#39;, flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode=&#39;null&#39;, disaggregation_bootstrap_port=8998, disaggregation_transfer_backend=&#39;mooncake&#39;, disaggregation_ib_device=None)
-[2025-05-02 05:09:19] Downcasting torch.float32 to torch.float16.
-[2025-05-02 05:09:32] Downcasting torch.float32 to torch.float16.
-[2025-05-02 05:09:33] Overlap scheduler is disabled for embedding models.
-[2025-05-02 05:09:33] Downcasting torch.float32 to torch.float16.
-[2025-05-02 05:09:33] Attention backend not set. Use fa3 backend by default.
-[2025-05-02 05:09:33] Init torch distributed begin.
-[2025-05-02 05:09:34] Init torch distributed ends. mem usage=0.00 GB
-[2025-05-02 05:09:34] Load weight begin. avail mem=78.60 GB
-[2025-05-02 05:09:34] Ignore import error when loading sglang.srt.models.llama4.
-[2025-05-02 05:09:35] Using model weights format [&#39;*.safetensors&#39;]
+[2025-05-02 05:43:58] server_args=ServerArgs(model_path=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, tokenizer_path=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, tokenizer_mode=&#39;auto&#39;, skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format=&#39;auto&#39;, trust_remote_code=False, dtype=&#39;auto&#39;, kv_cache_dtype=&#39;auto&#39;, quantization=None, quantization_param_path=None, context_length=None, device=&#39;cuda&#39;, served_model_name=&#39;Alibaba-NLP/gte-Qwen2-1.5B-instruct&#39;, chat_template=None, completion_template=None, is_embedding=True, revision=None, host=&#39;0.0.0.0&#39;, port=34798, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy=&#39;fcfs&#39;, schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=91062660, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level=&#39;info&#39;, log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path=&#39;sglang_storage&#39;, enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method=&#39;round_robin&#39;, ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args=&#39;{}&#39;, lora_paths=None, max_loras_per_batch=8, lora_backend=&#39;triton&#39;, attention_backend=None, sampling_backend=&#39;flashinfer&#39;, grammar_backend=&#39;xgrammar&#39;, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type=&#39;qk&#39;, ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode=&#39;auto&#39;, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config=&#39;&#39;, enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy=&#39;write_through_selective&#39;, flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode=&#39;null&#39;, disaggregation_bootstrap_port=8998, disaggregation_transfer_backend=&#39;mooncake&#39;, disaggregation_ib_device=None)
+[2025-05-02 05:43:58] Downcasting torch.float32 to torch.float16.
+[2025-05-02 05:44:07] Downcasting torch.float32 to torch.float16.
+[2025-05-02 05:44:08] Overlap scheduler is disabled for embedding models.
+[2025-05-02 05:44:08] Downcasting torch.float32 to torch.float16.
+[2025-05-02 05:44:08] Attention backend not set. Use fa3 backend by default.
+[2025-05-02 05:44:08] Init torch distributed begin.
+[2025-05-02 05:44:08] Init torch distributed ends. mem usage=0.00 GB
+[2025-05-02 05:44:08] Load weight begin. avail mem=58.38 GB
+[2025-05-02 05:44:08] Ignore import error when loading sglang.srt.models.llama4.
+[2025-05-02 05:44:09] Using model weights format [&#39;*.safetensors&#39;]
 Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00&lt;?, ?it/s]
-Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01&lt;00:01, 1.26s/it]
-Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07&lt;00:00, 4.47s/it]
-Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07&lt;00:00, 3.99s/it]
-
-[2025-05-02 05:09:44] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=60.05 GB, mem usage=18.54 GB.
-[2025-05-02 05:09:44] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB
-[2025-05-02 05:09:44] Memory pool end. avail mem=59.23 GB
-[2025-05-02 05:09:44] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
-[2025-05-02 05:09:45] INFO: Started server process [2407028]
-[2025-05-02 05:09:45] INFO: Waiting for application startup.
-[2025-05-02 05:09:45] INFO: Application startup complete.
-[2025-05-02 05:09:45] INFO: Uvicorn running on http://0.0.0.0:32531 (Press CTRL+C to quit)
-[2025-05-02 05:09:46] INFO: 127.0.0.1:57576 - &#34;GET /v1/models HTTP/1.1&#34; 200 OK
-[2025-05-02 05:09:46] INFO: 127.0.0.1:57578 - &#34;GET /get_model_info HTTP/1.1&#34; 200 OK
-[2025-05-02 05:09:46] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
-[2025-05-02 05:09:47] INFO: 127.0.0.1:57580 - &#34;POST /encode HTTP/1.1&#34; 200 OK
-[2025-05-02 05:09:47] The server is fired up and ready to roll!
+Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01&lt;00:01, 1.53s/it]
+Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02&lt;00:00, 1.04s/it]
+Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02&lt;00:00, 1.11s/it]
+
+[2025-05-02 05:44:11] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=54.75 GB, mem usage=3.63 GB.
+[2025-05-02 05:44:11] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB
+[2025-05-02 05:44:11] Memory pool end. avail mem=53.93 GB
+[2025-05-02 05:44:11] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
+[2025-05-02 05:44:12] INFO: Started server process [1705597]
+[2025-05-02 05:44:12] INFO: Waiting for application startup.
+[2025-05-02 05:44:12] INFO: Application startup complete.
+[2025-05-02 05:44:12] INFO: Uvicorn running on http://0.0.0.0:34798 (Press CTRL+C to quit)
+[2025-05-02 05:44:12] INFO: 127.0.0.1:47564 - &#34;GET /v1/models HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:13] INFO: 127.0.0.1:47576 - &#34;GET /get_model_info HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:13] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-05-02 05:44:14] INFO: 127.0.0.1:47588 - &#34;POST /encode HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:14] The server is fired up and ready to roll!
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -571,8 +571,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:51] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
-[2025-05-02 05:09:51] INFO: 127.0.0.1:57588 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:17] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-05-02 05:44:17] INFO: 127.0.0.1:47600 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -608,8 +608,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:51] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
-[2025-05-02 05:09:51] INFO: 127.0.0.1:57596 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:17] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-05-02 05:44:17] INFO: 127.0.0.1:47612 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -645,8 +645,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:51] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
-[2025-05-02 05:09:51] INFO: 127.0.0.1:57598 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:17] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-05-02 05:44:17] INFO: 127.0.0.1:47614 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -688,8 +688,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:52] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
-[2025-05-02 05:09:52] INFO: 127.0.0.1:57606 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-05-02 05:44:18] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-05-02 05:44:18] INFO: 127.0.0.1:47620 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -711,7 +711,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-05-02 05:09:52] Child process unexpectedly failed with an exit code 9. pid=2407173
+[2025-05-02 05:44:18] Child process unexpectedly failed with an exit code 9. pid=1706224
+[2025-05-02 05:44:18] Child process unexpectedly failed with an exit code 9. pid=1706107
 </pre></div></div>
 </div>
 </section>
