@@ -507,35 +507,35 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-05-01 00:53:02] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-1.5B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=34416, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=581794747, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', disaggregation_ib_device=None)
- [2025-05-01 00:53:02] Downcasting torch.float32 to torch.float16.
- [2025-05-01 00:53:11 TP0] Downcasting torch.float32 to torch.float16.
- [2025-05-01 00:53:11 TP0] Overlap scheduler is disabled for embedding models.
- [2025-05-01 00:53:11 TP0] Downcasting torch.float32 to torch.float16.
- [2025-05-01 00:53:12 TP0] Attention backend not set. Use fa3 backend by default.
- [2025-05-01 00:53:12 TP0] Init torch distributed begin.
- [2025-05-01 00:53:12 TP0] Init torch distributed ends. mem usage=0.00 GB
- [2025-05-01 00:53:12 TP0] Load weight begin. avail mem=60.58 GB
- [2025-05-01 00:53:12 TP0] Ignore import error when loading sglang.srt.models.llama4.
- [2025-05-01 00:53:13 TP0] Using model weights format ['*.safetensors']
+ [2025-05-01 02:09:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-1.5B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=38809, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=452911257, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_multimodal=None, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', disaggregation_ib_device=None)
+ [2025-05-01 02:09:37] Downcasting torch.float32 to torch.float16.
+ [2025-05-01 02:09:45] Downcasting torch.float32 to torch.float16.
+ [2025-05-01 02:09:46] Overlap scheduler is disabled for embedding models.
+ [2025-05-01 02:09:46] Downcasting torch.float32 to torch.float16.
+ [2025-05-01 02:09:46] Attention backend not set. Use fa3 backend by default.
+ [2025-05-01 02:09:46] Init torch distributed begin.
+ [2025-05-01 02:09:46] Init torch distributed ends. mem usage=0.00 GB
+ [2025-05-01 02:09:46] Load weight begin. avail mem=76.35 GB
+ [2025-05-01 02:09:46] Ignore import error when loading sglang.srt.models.llama4.
+ [2025-05-01 02:09:47] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
- Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01<00:01, 1.57s/it]
- Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.04s/it]
- Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.12s/it]
-
- [2025-05-01 00:53:15 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=56.99 GB, mem usage=3.60 GB.
- [2025-05-01 00:53:15 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB
- [2025-05-01 00:53:15 TP0] Memory pool end. avail mem=56.16 GB
- [2025-05-01 00:53:16 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
- [2025-05-01 00:53:16] INFO: Started server process [738890]
- [2025-05-01 00:53:16] INFO: Waiting for application startup.
- [2025-05-01 00:53:16] INFO: Application startup complete.
- [2025-05-01 00:53:16] INFO: Uvicorn running on http://0.0.0.0:34416 (Press CTRL+C to quit)
- [2025-05-01 00:53:17] INFO: 127.0.0.1:55278 - "GET /v1/models HTTP/1.1" 200 OK
- [2025-05-01 00:53:17] INFO: 127.0.0.1:55284 - "GET /get_model_info HTTP/1.1" 200 OK
- [2025-05-01 00:53:17 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
- [2025-05-01 00:53:18] INFO: 127.0.0.1:55300 - "POST /encode HTTP/1.1" 200 OK
- [2025-05-01 00:53:18] The server is fired up and ready to roll!
+ Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01<00:01, 1.56s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.11s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00, 1.18s/it]
+
+ [2025-05-01 02:09:49] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=72.75 GB, mem usage=3.60 GB.
+ [2025-05-01 02:09:49] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB
+ [2025-05-01 02:09:49] Memory pool end. avail mem=71.93 GB
+ [2025-05-01 02:09:50] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
+ [2025-05-01 02:09:50] INFO: Started server process [922395]
+ [2025-05-01 02:09:50] INFO: Waiting for application startup.
+ [2025-05-01 02:09:50] INFO: Application startup complete.
+ [2025-05-01 02:09:50] INFO: Uvicorn running on http://0.0.0.0:38809 (Press CTRL+C to quit)
+ [2025-05-01 02:09:51] INFO: 127.0.0.1:52664 - "GET /v1/models HTTP/1.1" 200 OK
+ [2025-05-01 02:09:51] INFO: 127.0.0.1:52678 - "GET /get_model_info HTTP/1.1" 200 OK
+ [2025-05-01 02:09:51] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
+ [2025-05-01 02:09:52] INFO: 127.0.0.1:52682 - "POST /encode HTTP/1.1" 200 OK
+ [2025-05-01 02:09:52] The server is fired up and ready to roll!
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
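For reference, the startup log above comes from launching SGLang with is_embedding=True. A minimal launch sketch in Python, assuming the sglang package is installed; the model path and host mirror the logged server_args, while the port here is a placeholder (the notebook picks a random one such as 38809):

    import subprocess

    # Start the SGLang server in embedding mode; flags mirror the logged server_args.
    server = subprocess.Popen([
        "python", "-m", "sglang.launch_server",
        "--model-path", "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        "--is-embedding",
        "--host", "0.0.0.0",
        "--port", "30000",
    ])
    # Ready once the log prints "The server is fired up and ready to roll!".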
@@ -571,8 +571,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-05-01 00:53:22 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
- [2025-05-01 00:53:22] INFO: 127.0.0.1:55316 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-05-01 02:09:56] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
+ [2025-05-01 02:09:56] INFO: 127.0.0.1:50838 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -608,8 +608,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-05-01 00:53:22 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
- [2025-05-01 00:53:22] INFO: 127.0.0.1:55318 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-05-01 02:09:56] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+ [2025-05-01 02:09:56] INFO: 127.0.0.1:50854 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
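The POST /v1/embeddings lines above show the OpenAI-compatible embeddings endpoint being exercised. A minimal sketch with the requests library, assuming the server from the log is reachable locally on its logged port (38809 here; substitute your own) and using placeholder input text:

    import requests

    # Query the OpenAI-compatible embeddings endpoint exposed by the SGLang server.
    resp = requests.post(
        "http://localhost:38809/v1/embeddings",
        json={"model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "input": "Once upon a time"},
    )
    resp.raise_for_status()
    embedding = resp.json()["data"][0]["embedding"]
    print(embedding[:10])  # first 10 dimensions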
@@ -645,8 +645,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-05-01 00:53:22 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
- [2025-05-01 00:53:22] INFO: 127.0.0.1:55326 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-05-01 02:09:56] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+ [2025-05-01 02:09:56] INFO: 127.0.0.1:50870 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
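The same endpoint also works through the official OpenAI Python client by pointing base_url at the local server. A minimal sketch, assuming openai>=1.0 is installed and the logged port; the API key is a dummy value since the server was started without one:

    import openai

    # Point the OpenAI client at the local SGLang server; no real API key is needed.
    client = openai.Client(base_url="http://localhost:38809/v1", api_key="None")
    response = client.embeddings.create(
        model="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        input="Once upon a time",
    )
    print(response.data[0].embedding[:10])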
@@ -688,8 +688,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-05-01 00:53:23 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
- [2025-05-01 00:53:23] INFO: 127.0.0.1:55332 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-05-01 02:09:57] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0
+ [2025-05-01 02:09:57] INFO: 127.0.0.1:50874 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -698,14 +698,22 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
<div class="output_area rendered_html docutils container">
<strong style='color: #00008B;'>Input IDs embedding (first 10): [-0.00021922588348388672, -0.049896240234375, -0.0032215118408203125, 0.011077880859375, -0.01406097412109375, 0.016021728515625, -0.01439666748046875, 0.00592803955078125, -0.0228271484375, 0.02734375]</strong></div>
</div>
- <div class="nbinput nblast docutils container">
+ <div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">terminate_process</span><span class="p">(</span><span class="n">embedding_process</span><span class="p">)</span>
</pre></div>
</div>
</div>
+ <div class="nboutput nblast docutils container">
+ <div class="prompt empty docutils container">
+ </div>
+ <div class="output_area docutils container">
+ <div class="highlight"><pre>
+ [2025-05-01 02:09:57] Child process unexpectedly failed with an exit code 9. pid=922625
+ </pre></div></div>
+ </div>
</section>
</section>
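The "Input IDs embedding" output above comes from sending pre-tokenized token IDs rather than raw text. A minimal sketch, assuming the endpoint accepts a list of token IDs as input and that the tokenizer matches the served model; the example text and port are placeholders taken from the log:

    import requests
    from transformers import AutoTokenizer

    # Tokenize locally, then send token IDs instead of a string.
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    input_ids = tokenizer.encode("Once upon a time")

    resp = requests.post(
        "http://localhost:38809/v1/embeddings",
        json={"model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "input": input_ids},
    )
    print(resp.json()["data"][0]["embedding"][:10])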