docs/sample_recipes/autoscaling_recipe.json

{
  "recipe_id": "llm_inference_nvidia",
  "recipe_mode": "service",
  "deployment_name": "autoscale_vllm_example",
  "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
  "recipe_node_shape": "VM.GPU.A10.2",
  "input_object_storage": [
    {
      "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/YlAYhysDWeVdSzDKyaA4r26UTiOYfr6mJuBK3Q0SC-wP5bkExO6w6lAXoipGHxYN/n/iduyx1qnmway/b/metallama321binstruct/o/",
      "mount_location": "/models",
      "volume_size_in_gbs": 100
    }
  ],
  "recipe_container_env": [
    {
      "key": "tensor_parallel_size",
      "value": "1"
    },
    {
      "key": "model_name",
      "value": ""
    },
    {
      "key": "Model_Path",
      "value": "/models"
    }
  ],
  "recipe_replica_count": 1,
  "recipe_container_port": "8000",
  "recipe_nvidia_gpu_count": 1,
  "recipe_container_command_args": [
    "--model",
    "$(Model_Path)",
    "--tensor-parallel-size",
    "$(tensor_parallel_size)",
    "--gpu-memory-utilization",
    "0.99",
    "--max-model-len",
    "1024"
  ],
  "recipe_ephemeral_storage_size": 200,
  "recipe_node_boot_volume_size_in_gbs": 300,
  "recipe_node_pool_size": 1,
  "recipe_shared_memory_volume_size_limit_in_mb": 200,
  "recipe_startup_probe_params": {
    "failure_threshold": 30,
    "endpoint_path": "/health",
    "port": 8000,
    "scheme": "HTTP",
    "initial_delay_seconds": 60,
    "period_seconds": 2,
    "success_threshold": 1,
    "timeout_seconds": 10
  },
  "recipe_liveness_probe_params": {
    "failure_threshold": 3,
    "endpoint_path": "/health",
    "port": 8000,
    "scheme": "HTTP",
    "initial_delay_seconds": 65,
    "period_seconds": 600,
    "success_threshold": 1,
    "timeout_seconds": 10
  },
  "recipe_node_autoscaling_params": {
    "min_nodes": 1,
    "max_nodes": 2
  },
  "recipe_pod_autoscaling_params": {
    "min_replicas": 1,
    "max_replicas": 4
  }
}