Add support for kserve
Signed-off-by: Daniel J Walsh <[email protected]>
rhatdan committed Feb 24, 2025
1 parent 00839ee commit 775232c
Showing 4 changed files with 155 additions and 13 deletions.
34 changes: 32 additions & 2 deletions docs/ramalama-serve.1.md
@@ -60,8 +60,9 @@ Generate specified configuration format for running the AI Model as a service

| Key | Description |
| ------------ | -------------------------------------------------------------------------|
| kserve | Kserve YAML definition for running the AI Model as a kserve service in Kubernetes |
| kube | Kubernetes YAML definition for running the AI Model as a service |
| quadlet | Podman supported container definition for running AI Model under systemd |
| quadlet/kube | Kubernetes YAML definition for running the AI Model as a service and Podman supported container definition for running the Kube YAML specified pod under systemd|

#### **--help**, **-h**
@@ -119,7 +120,7 @@ llama.cpp explains this as:

The higher the number, the more creative the response, but also the more likely it is to hallucinate when set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.
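
For example, assuming the option described here is `--temp` (illustrative invocations only):

```
$ ramalama serve --temp 0.1 granite   # near-deterministic answers for an assistant
$ ramalama serve --temp 1.2 granite   # more creative output for roleplay or story editing
```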

#### **--tls-verify**=*true*
require HTTPS and verify certificates when contacting OCI registries
@@ -140,6 +141,35 @@ CONTAINER ID IMAGE COMMAND CREATED
3f64927f11a5 quay.io/ramalama/ramalama:latest /usr/bin/ramalama... 17 seconds ago Up 17 seconds 0.0.0.0:8082->8082/tcp ramalama_YMPQvJxN97
```

### Generate kserve service off of OCI Model car quay.io/ramalama/granite:1.0
```
$ ramalama serve --generate kserve oci://quay.io/ramalama/granite:1.0
Generating kserve runtime file: granite-1.0-kserve-runtime.yaml
Generating kserve file: granite-1.0-kserve.yaml
$ cat granite-1.0-kserve.yaml
# RamaLama quay.io/ramalama/granite:1.0 AI Model Service
# kubectl create -f to import this kserve file into Kubernetes.
#
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-quay.io/ramalama/granite:1.0
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      storageUri: "oci://quay.io/ramalama/granite:1.0"
      resources:
        limits:
          cpu: "6"
          memory: 24Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "6"
          memory: 24Gi
          nvidia.com/gpu: "1"
```
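
The generated file can then be imported into a cluster, as its header comment suggests. A minimal illustration, assuming `kubectl` is pointed at a cluster with KServe installed:

```
$ kubectl create -f granite-1.0-kserve.yaml
```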

### Generate quadlet service off of HuggingFace granite Model
```
$ ramalama serve --name MyGraniteServer --generate=quadlet granite
```
2 changes: 1 addition & 1 deletion ramalama/cli.py
@@ -859,7 +859,7 @@ def serve_parser(subparsers):
    parser.add_argument("--host", default=config.get('host', "0.0.0.0"), help="IP address to listen")
    parser.add_argument(
        "--generate",
        choices=["kserve", "kube", "quadlet", "quadlet/kube"],
        help="generate specified configuration format for running the AI Model as a service",
    )
    parser.add_argument(
105 changes: 105 additions & 0 deletions ramalama/kserve.py
@@ -0,0 +1,105 @@
import os

from ramalama.common import get_env_vars


class Kserve:
    def __init__(self, model, image, args, exec_args):
        self.ai_image = model
        if hasattr(args, "MODEL"):
            self.ai_image = args.MODEL
        self.ai_image = self.ai_image.removeprefix("oci://")
        if args.name:
            self.name = args.name
        else:
            self.name = os.path.basename(self.ai_image)

        self.model = model.removeprefix("oci://")
        self.args = args
        self.exec_args = exec_args
        self.image = image
        self.runtime = args.runtime

    def generate(self):
        env_var_string = ""
        for k, v in get_env_vars().items():
            env_var_string += f"Environment={k}={v}\n"

        # Request a GPU resource only when the corresponding vendor
        # environment variable is set and non-empty.
        _gpu = ""
        if os.getenv("CUDA_VISIBLE_DEVICES"):
            _gpu = 'nvidia.com/gpu'
        elif os.getenv("HIP_VISIBLE_DEVICES"):
            _gpu = 'amd.com/gpu'
        # Default to no GPU entry so the templates below stay valid when
        # no accelerator is detected.
        gpu = ""
        if _gpu != "":
            gpu = f'\n          {_gpu}: "1"'

        outfile = self.name + "-kserve-runtime.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve runtime file: {outfile}")
        with open(outfile, 'w') as c:
            c.write(
                f"""\
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: {self.runtime}-runtime
  annotations:
    openshift.io/display-name: KServe ServingRuntime for {self.model}
    opendatahub.io/recommended-accelerators: '["{_gpu}"]'
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    prometheus.io/port: '{self.args.port}'
    prometheus.io/path: '/metrics'
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM
  containers:
    - name: kserve-container
      image: {self.image}
      command:
        - python
        - -m
        - vllm.entrypoints.openai.api_server
      args:
        - "--port={self.args.port}"
        - "--model=/mnt/models"
        - "--served-model-name={{.Name}}"
      env:
        - name: HF_HOME
          value: /tmp/hf_home
      ports:
        - containerPort: {self.args.port}
          protocol: TCP
""")

        outfile = self.name + "-kserve.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve file: {outfile}")
        with open(outfile, 'w') as c:
            c.write(
                f"""\
# RamaLama {self.model} AI Model Service
# kubectl create -f to import this kserve file into Kubernetes.
#
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-{self.model}
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      storageUri: "oci://{self.model}"
      resources:
        limits:
          cpu: "6"
          memory: 24Gi{gpu}
        requests:
          cpu: "6"
          memory: 24Gi{gpu}
"""
            )
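
A minimal sketch of how this class might be driven directly (hypothetical values; in the CLI flow, `ramalama serve --generate kserve` builds the real argparse namespace):

```python
from argparse import Namespace

from ramalama.kserve import Kserve

# Hypothetical argument namespace: only the fields Kserve actually reads are set.
args = Namespace(
    MODEL="oci://quay.io/ramalama/granite:1.0",
    name=None,        # falls back to the basename of the model reference
    runtime="vllm",
    port=8080,
)

kserve = Kserve("oci://quay.io/ramalama/granite:1.0",
                "quay.io/ramalama/ramalama:latest", args, exec_args=None)
kserve.generate()  # writes granite-1.0-kserve-runtime.yaml and granite-1.0-kserve.yaml
```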
27 changes: 17 additions & 10 deletions ramalama/model.py
@@ -17,6 +17,7 @@
from ramalama.kube import Kube
from ramalama.model_inspect import GGUFModelInfo, ModelInfoBase
from ramalama.quadlet import Quadlet
from ramalama.kserve import Kserve
from ramalama.version import version

MODEL_TYPES = ["file", "https", "http", "oci", "huggingface", "hf", "ollama"]
@@ -360,7 +361,6 @@ def get_model_path(self, args):

        if args.dryrun:
            return "/path/to/model"
        model_path = self.pull(args)

        return model_path
Expand Down Expand Up @@ -486,16 +486,15 @@ def handle_runtime(self, args, exec_args, exec_model_path):

    def generate_container_config(self, model_path, args, exec_args):
        self.image = self._image(args)
        if args.generate == "kserve":
            return self.kserve(model_path, args, exec_args)
        if args.generate == "quadlet":
            return self.quadlet(model_path, args, exec_args)
        if args.generate == "kube":
            return self.kube(model_path, args, exec_args)
        if args.generate == "quadlet/kube":
            return self.quadlet_kube(model_path, args, exec_args)
        return False

    def execute_command(self, model_path, exec_args, args):
        try:
Expand Down Expand Up @@ -526,19 +525,27 @@ def serve(self, args):

        self.execute_command(model_path, exec_args, args)

    def kserve(self, model, args, exec_args):
        kserve = Kserve(model, self.image, args, exec_args)
        kserve.generate()
        return True

    def quadlet(self, model, args, exec_args):
        quadlet = Quadlet(model, self.image, args, exec_args)
        quadlet.generate()
        return True

    def quadlet_kube(self, model, args, exec_args):
        kube = Kube(model, self.image, args, exec_args)
        kube.generate()
        quadlet = Quadlet(model, self.image, args, exec_args)
        quadlet.kube()
        return True

    def kube(self, model, args, exec_args):
        kube = Kube(model, self.image, args, exec_args)
        kube.generate()
        return True

    def path(self, args):
        return self.model_path(args)
