|
57 | 57 | <link rel="prev" title="OpenAI APIs - Vision" href="openai_api_vision.html" />
|
58 | 58 | <meta name="viewport" content="width=device-width, initial-scale=1"/>
|
59 | 59 | <meta name="docsearch:language" content="en"/>
|
60 | | - <meta name="docbuild:last-update" content="May 02, 2025"/> |
| 60 | + <meta name="docbuild:last-update" content="May 03, 2025"/> |
61 | 61 | </head>
|
62 | 62 |
|
63 | 63 |
|
@@ -507,35 +507,35 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
|
507 | 507 | </div>
|
508 | 508 | <div class="output_area docutils container">
|
509 | 509 | <div class="highlight"><pre>
|
510 | | -[2025-05-02 16:37:09] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-1.5B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=36432, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=377892282, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', disaggregation_ib_device=None) |
511 | | -[2025-05-02 16:37:09] Downcasting torch.float32 to torch.float16. |
512 | | -[2025-05-02 16:37:22] Downcasting torch.float32 to torch.float16. |
513 | | -[2025-05-02 16:37:23] Overlap scheduler is disabled for embedding models. |
514 | | -[2025-05-02 16:37:23] Downcasting torch.float32 to torch.float16. |
515 | | -[2025-05-02 16:37:24] Attention backend not set. Use fa3 backend by default. |
516 | | -[2025-05-02 16:37:24] Init torch distributed begin. |
517 | | -[2025-05-02 16:37:24] Init torch distributed ends. mem usage=0.00 GB |
518 | | -[2025-05-02 16:37:24] Load weight begin. avail mem=78.60 GB |
519 | | -[2025-05-02 16:37:24] Ignore import error when loading sglang.srt.models.llama4. |
520 | | -[2025-05-02 16:37:26] Using model weights format ['*.safetensors'] |
| 510 | +[2025-05-03 07:35:23] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-1.5B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=39508, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=682895549, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', disaggregation_ib_device=None) |
| 511 | +[2025-05-03 07:35:23] Downcasting torch.float32 to torch.float16. |
| 512 | +[2025-05-03 07:35:33] Downcasting torch.float32 to torch.float16. |
| 513 | +[2025-05-03 07:35:33] Overlap scheduler is disabled for embedding models. |
| 514 | +[2025-05-03 07:35:33] Downcasting torch.float32 to torch.float16. |
| 515 | +[2025-05-03 07:35:33] Attention backend not set. Use fa3 backend by default. |
| 516 | +[2025-05-03 07:35:33] Init torch distributed begin. |
| 517 | +[2025-05-03 07:35:33] Init torch distributed ends. mem usage=0.00 GB |
| 518 | +[2025-05-03 07:35:33] Load weight begin. avail mem=76.35 GB |
| 519 | +[2025-05-03 07:35:34] Ignore import error when loading sglang.srt.models.llama4. |
| 520 | +[2025-05-03 07:35:34] Using model weights format ['*.safetensors'] |
521 | 521 | Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
|
522 | | -Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:03<00:03, 3.51s/it] |
523 | | -Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:11<00:00, 6.43s/it] |
524 | | -Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:11<00:00, 5.99s/it] |
525 | | - |
526 | | -[2025-05-02 16:37:39] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=58.59 GB, mem usage=20.01 GB. |
527 | | -[2025-05-02 16:37:39] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB |
528 | | -[2025-05-02 16:37:39] Memory pool end. avail mem=57.76 GB |
529 | | -[2025-05-02 16:37:40] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072 |
530 | | -[2025-05-02 16:37:40] INFO: Started server process [2682437] |
531 | | -[2025-05-02 16:37:40] INFO: Waiting for application startup. |
532 | | -[2025-05-02 16:37:40] INFO: Application startup complete. |
533 | | -[2025-05-02 16:37:40] INFO: Uvicorn running on http://0.0.0.0:36432 (Press CTRL+C to quit) |
534 | | -[2025-05-02 16:37:41] INFO: 127.0.0.1:52148 - "GET /v1/models HTTP/1.1" 200 OK |
535 | | -[2025-05-02 16:37:41] INFO: 127.0.0.1:52158 - "GET /get_model_info HTTP/1.1" 200 OK |
536 | | -[2025-05-02 16:37:41] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0 |
537 | | -[2025-05-02 16:37:42] INFO: 127.0.0.1:52172 - "POST /encode HTTP/1.1" 200 OK |
538 | | -[2025-05-02 16:37:42] The server is fired up and ready to roll! |
| 522 | +Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01<00:01, 1.44s/it] |
| 523 | +Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.04it/s] |
| 524 | +Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.03s/it] |
| 525 | + |
| 526 | +[2025-05-03 07:35:36] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=45.43 GB, mem usage=30.93 GB. |
| 527 | +[2025-05-03 07:35:36] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB |
| 528 | +[2025-05-03 07:35:36] Memory pool end. avail mem=44.60 GB |
| 529 | +[2025-05-03 07:35:37] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072 |
| 530 | +[2025-05-03 07:35:37] INFO: Started server process [2131176] |
| 531 | +[2025-05-03 07:35:37] INFO: Waiting for application startup. |
| 532 | +[2025-05-03 07:35:37] INFO: Application startup complete. |
| 533 | +[2025-05-03 07:35:37] INFO: Uvicorn running on http://0.0.0.0:39508 (Press CTRL+C to quit) |
| 534 | +[2025-05-03 07:35:38] INFO: 127.0.0.1:36736 - "GET /v1/models HTTP/1.1" 200 OK |
| 535 | +[2025-05-03 07:35:38] INFO: 127.0.0.1:36748 - "GET /get_model_info HTTP/1.1" 200 OK |
| 536 | +[2025-05-03 07:35:38] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0 |
| 537 | +[2025-05-03 07:35:39] INFO: 127.0.0.1:36758 - "POST /encode HTTP/1.1" 200 OK |
| 538 | +[2025-05-03 07:35:39] The server is fired up and ready to roll! |
539 | 539 | </pre></div></div>
|
540 | 540 | </div>
|
541 | 541 | <div class="nboutput nblast docutils container">
|
@@ -571,8 +571,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
|
571 | 571 | </div>
|
572 | 572 | <div class="output_area docutils container">
|
573 | 573 | <div class="highlight"><pre>
|
574 | | -[2025-05-02 16:37:46] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0 |
575 | | -[2025-05-02 16:37:46] INFO: 127.0.0.1:40698 - "POST /v1/embeddings HTTP/1.1" 200 OK |
| 574 | +[2025-05-03 07:35:43] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0 |
| 575 | +[2025-05-03 07:35:43] INFO: 127.0.0.1:36774 - "POST /v1/embeddings HTTP/1.1" 200 OK |
576 | 576 | </pre></div></div>
|
577 | 577 | </div>
|
578 | 578 | <div class="nboutput nblast docutils container">
|
@@ -608,8 +608,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
|
608 | 608 | </div>
|
609 | 609 | <div class="output_area docutils container">
|
610 | 610 | <div class="highlight"><pre>
|
611 | | -[2025-05-02 16:37:46] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
612 | | -[2025-05-02 16:37:46] INFO: 127.0.0.1:40712 - "POST /v1/embeddings HTTP/1.1" 200 OK |
| 611 | +[2025-05-03 07:35:43] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
| 612 | +[2025-05-03 07:35:43] INFO: 127.0.0.1:36778 - "POST /v1/embeddings HTTP/1.1" 200 OK |
613 | 613 | </pre></div></div>
|
614 | 614 | </div>
|
615 | 615 | <div class="nboutput nblast docutils container">
|
@@ -645,8 +645,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
|
645 | 645 | </div>
|
646 | 646 | <div class="output_area docutils container">
|
647 | 647 | <div class="highlight"><pre>
|
648 | | -[2025-05-02 16:37:46] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
649 | | -[2025-05-02 16:37:46] INFO: 127.0.0.1:40718 - "POST /v1/embeddings HTTP/1.1" 200 OK |
| 648 | +[2025-05-03 07:35:43] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
| 649 | +[2025-05-03 07:35:43] INFO: 127.0.0.1:36782 - "POST /v1/embeddings HTTP/1.1" 200 OK |
650 | 650 | </pre></div></div>
|
651 | 651 | </div>
|
652 | 652 | <div class="nboutput nblast docutils container">
|
@@ -688,8 +688,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
|
688 | 688 | </div>
|
689 | 689 | <div class="output_area docutils container">
|
690 | 690 | <div class="highlight"><pre>
|
691 | | -[2025-05-02 16:37:46] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
692 | | -[2025-05-02 16:37:46] INFO: 127.0.0.1:40722 - "POST /v1/embeddings HTTP/1.1" 200 OK |
| 691 | +[2025-05-03 07:35:44] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0 |
| 692 | +[2025-05-03 07:35:44] INFO: 127.0.0.1:36792 - "POST /v1/embeddings HTTP/1.1" 200 OK |
693 | 693 | </pre></div></div>
|
694 | 694 | </div>
|
695 | 695 | <div class="nboutput nblast docutils container">
|
@@ -792,7 +792,7 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
|
792 | 792 |
|
793 | 793 | <div class="footer-item">
|
794 | 794 | <p class="last-updated">
|
795 | | - Last updated on May 02, 2025. |
| 795 | + Last updated on May 03, 2025. |
796 | 796 | <br/>
|
797 | 797 | </p>
|
798 | 798 | </div>
|