generated from oracle-quickstart/oci-quickstart-template
-
Notifications
You must be signed in to change notification settings.
Fork 1
/
Copy path: autoscaling_recipe.json
73 lines (73 loc) · 1.87 KB
/
autoscaling_recipe.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
{
"recipe_id": "llm_inference_nvidia",
"recipe_mode": "service",
"deployment_name": "autoscale_vllm_example",
"recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
"recipe_node_shape": "VM.GPU.A10.2",
"input_object_storage": [
{
"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/YlAYhysDWeVdSzDKyaA4r26UTiOYfr6mJuBK3Q0SC-wP5bkExO6w6lAXoipGHxYN/n/iduyx1qnmway/b/metallama321binstruct/o/",
"mount_location": "/models",
"volume_size_in_gbs": 100
}
],
"recipe_container_env": [
{
"key": "tensor_parallel_size",
"value": "1"
},
{
"key": "model_name",
"value": ""
},
{
"key": "Model_Path",
"value": "/models"
}
],
"recipe_replica_count": 1,
"recipe_container_port": "8000",
"recipe_nvidia_gpu_count": 1,
"recipe_container_command_args": [
"--model",
"$(Model_Path)",
"--tensor-parallel-size",
"$(tensor_parallel_size)",
"--gpu-memory-utilization",
"0.99",
"--max-model-len",
"1024"
],
"recipe_ephemeral_storage_size": 200,
"recipe_node_boot_volume_size_in_gbs": 300,
"recipe_node_pool_size": 1,
"recipe_shared_memory_volume_size_limit_in_mb": 200,
"recipe_startup_probe_params": {
"failure_threshold": 30,
"endpoint_path": "/health",
"port": 8000,
"scheme": "HTTP",
"initial_delay_seconds": 60,
"period_seconds": 2,
"success_threshold": 1,
"timeout_seconds": 10
},
"recipe_liveness_probe_params": {
"failure_threshold": 3,
"endpoint_path": "/health",
"port": 8000,
"scheme": "HTTP",
"initial_delay_seconds": 65,
"period_seconds": 600,
"success_threshold": 1,
"timeout_seconds": 10
},
"recipe_node_autoscaling_params": {
"min_nodes": 1,
"max_nodes": 2
},
"recipe_pod_autoscaling_params": {
"min_replicas": 1,
"max_replicas": 4
}
}