From 40cd706d7c5aa31dacea714ceefc99229cf161b9 Mon Sep 17 00:00:00 2001
From: Daniel J Walsh
Date: Mon, 24 Feb 2025 10:32:21 -0500
Subject: [PATCH] Add support for kserve

Signed-off-by: Daniel J Walsh
---
 docs/ramalama-serve.1.md |  72 ++++++++++++++++++++++++++-
 ramalama/cli.py          |   2 +-
 ramalama/kserve.py       | 105 +++++++++++++++++++++++++++++++++++++++
 ramalama/model.py        |  27 ++++++----
 4 files changed, 193 insertions(+), 13 deletions(-)
 create mode 100644 ramalama/kserve.py

diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 099b9bee..3fe2d1ea 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -60,8 +60,9 @@ Generate specified configuration format for running the AI Model as a service
 
 | Key          | Description                                                               |
 | ------------ | --------------------------------------------------------------------------|
-| quadlet      | Podman supported container definition for running AI Model under systemd |
+| kserve       | KServe YAML definition for running the AI Model as a KServe service in Kubernetes |
 | kube         | Kubernetes YAML definition for running the AI Model as a service         |
+| quadlet      | Podman supported container definition for running AI Model under systemd |
 | quadlet/kube | Kubernetes YAML definition for running the AI Model as a service and Podman supported container definition for running the Kube YAML specified pod under systemd|
 
 #### **--help**, **-h**
@@ -119,7 +120,7 @@ llama.cpp explains this as:
 
     The higher the number is the more creative the response is, but more likely to hallucinate when set too high.
 
-    Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories 
+    Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
 #### **--tls-verify**=*true*
 require HTTPS and verify certificates when contacting OCI registries
@@ -140,6 +141,73 @@ CONTAINER ID  IMAGE                             COMMAND               CREATED
 3f64927f11a5  quay.io/ramalama/ramalama:latest  /usr/bin/ramalama...  17 seconds ago  Up 17 seconds  0.0.0.0:8082->8082/tcp  ramalama_YMPQvJxN97
 ```
 
+### Generate kserve service off of OCI Model car quay.io/ramalama/granite:1.0
+```
+$ ramalama serve --port 8081 --generate kserve oci://quay.io/ramalama/granite:1.0
+Generating kserve runtime file: granite-1.0-kserve-runtime.yaml
+Generating kserve file: granite-1.0-kserve.yaml
+
+$ cat granite-1.0-kserve-runtime.yaml
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  name: llama.cpp-runtime
+  annotations:
+    openshift.io/display-name: KServe ServingRuntime for quay.io/ramalama/granite:1.0
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  annotations:
+    prometheus.io/port: '8081'
+    prometheus.io/path: '/metrics'
+  multiModel: false
+  supportedModelFormats:
+    - autoSelect: true
+      name: vLLM
+  containers:
+    - name: kserve-container
+      image: quay.io/ramalama/ramalama:latest
+      command:
+        - python
+        - -m
+        - vllm.entrypoints.openai.api_server
+      args:
+        - "--port=8081"
+        - "--model=/mnt/models"
+        - "--served-model-name={.Name}"
+      env:
+        - name: HF_HOME
+          value: /tmp/hf_home
+      ports:
+        - containerPort: 8081
+          protocol: TCP
+
+$ cat granite-1.0-kserve.yaml
+# RamaLama quay.io/ramalama/granite:1.0 AI Model Service
+# kubectl create -f to import this kserve file into Kubernetes.
+#
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: huggingface-quay.io/ramalama/granite:1.0
+spec:
+  predictor:
+    model:
+      modelFormat:
+        name: vLLM
+      storageUri: "oci://quay.io/ramalama/granite:1.0"
+      resources:
+        limits:
+          cpu: "6"
+          memory: 24Gi
+          nvidia.com/gpu: "1"
+        requests:
+          cpu: "6"
+          memory: 24Gi
+          nvidia.com/gpu: "1"
+```
+
 ### Generate quadlet service off of HuggingFace granite Model
 ```
 $ ramalama serve --name MyGraniteServer --generate=quadlet granite
diff --git a/ramalama/cli.py b/ramalama/cli.py
index d6ffe8ec..2dae24c2 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -859,7 +859,7 @@ def serve_parser(subparsers):
     parser.add_argument("--host", default=config.get('host', "0.0.0.0"), help="IP address to listen")
     parser.add_argument(
         "--generate",
-        choices=["quadlet", "kube", "quadlet/kube"],
+        choices=["kserve", "kube", "quadlet", "quadlet/kube"],
         help="generate specified configuration format for running the AI Model as a service",
     )
     parser.add_argument(
diff --git a/ramalama/kserve.py b/ramalama/kserve.py
new file mode 100644
index 00000000..39abab46
--- /dev/null
+++ b/ramalama/kserve.py
@@ -0,0 +1,105 @@
+import os
+
+from ramalama.common import get_env_vars
+
+
+class Kserve:
+    def __init__(self, model, image, args, exec_args):
+        self.ai_image = model
+        if hasattr(args, "MODEL"):
+            self.ai_image = args.MODEL
+        self.ai_image = self.ai_image.removeprefix("oci://")
+        if args.name:
+            self.name = args.name
+        else:
+            self.name = os.path.basename(self.ai_image)
+
+        self.model = model.removeprefix("oci://")
+        self.args = args
+        self.exec_args = exec_args
+        self.image = image
+        self.runtime = args.runtime
+
+    def generate(self):
+        env_var_string = ""
+        for k, v in get_env_vars().items():
+            env_var_string += f"Environment={k}={v}\n"
+
+        gpu = _gpu = ""
+        if os.getenv("CUDA_VISIBLE_DEVICES"):
+            _gpu = 'nvidia.com/gpu'
+        elif os.getenv("HIP_VISIBLE_DEVICES"):
+            _gpu = 'amd.com/gpu'
+        if _gpu != "":
+            gpu = f'\n          {_gpu}: "1"'
+
+        outfile = self.name + "-kserve-runtime.yaml"
+        outfile = outfile.replace(":", "-")
+        print(f"Generating kserve runtime file: {outfile}")
+        with open(outfile, 'w') as c:
+            c.write(
+                f"""\
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  name: {self.runtime}-runtime
+  annotations:
+    openshift.io/display-name: KServe ServingRuntime for {self.model}
+    opendatahub.io/recommended-accelerators: '["{_gpu}"]'
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  annotations:
+    prometheus.io/port: '{self.args.port}'
+    prometheus.io/path: '/metrics'
+  multiModel: false
+  supportedModelFormats:
+    - autoSelect: true
+      name: vLLM
+  containers:
+    - name: kserve-container
+      image: {self.image}
+      command:
+        - python
+        - -m
+        - vllm.entrypoints.openai.api_server
+      args:
+        - "--port={self.args.port}"
+        - "--model=/mnt/models"
+        - "--served-model-name={{.Name}}"
+      env:
+        - name: HF_HOME
+          value: /tmp/hf_home
+      ports:
+        - containerPort: {self.args.port}
+          protocol: TCP
+""")
+
+        outfile = self.name + "-kserve.yaml"
+        outfile = outfile.replace(":", "-")
+        print(f"Generating kserve file: {outfile}")
+        with open(outfile, 'w') as c:
+            c.write(
+                f"""\
+# RamaLama {self.model} AI Model Service
+# kubectl create -f to import this kserve file into Kubernetes.
+#
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: huggingface-{self.model}
+spec:
+  predictor:
+    model:
+      modelFormat:
+        name: vLLM
+      storageUri: "oci://{self.model}"
+      resources:
+        limits:
+          cpu: "6"
+          memory: 24Gi{gpu}
+        requests:
+          cpu: "6"
+          memory: 24Gi{gpu}
+"""
+        )
diff --git a/ramalama/model.py b/ramalama/model.py
index 1463268e..3daf07a2 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -17,6 +17,7 @@
 from ramalama.kube import Kube
 from ramalama.model_inspect import GGUFModelInfo, ModelInfoBase
 from ramalama.quadlet import Quadlet
+from ramalama.kserve import Kserve
 from ramalama.version import version
 
 MODEL_TYPES = ["file", "https", "http", "oci", "huggingface", "hf", "ollama"]
@@ -360,7 +361,6 @@ def get_model_path(self, args):
         if args.dryrun:
             return "/path/to/model"
 
-
         model_path = self.pull(args)
         return model_path
 
@@ -486,16 +486,15 @@ def handle_runtime(self, args, exec_args, exec_model_path):
 
     def generate_container_config(self, model_path, args, exec_args):
         self.image = self._image(args)
+        if args.generate == "kserve":
+            return self.kserve(model_path, args, exec_args)
         if args.generate == "quadlet":
-            self.quadlet(model_path, args, exec_args)
-        elif args.generate == "kube":
-            self.kube(model_path, args, exec_args)
-        elif args.generate == "quadlet/kube":
-            self.quadlet_kube(model_path, args, exec_args)
-        else:
-            return False
-
-        return True
+            return self.quadlet(model_path, args, exec_args)
+        if args.generate == "kube":
+            return self.kube(model_path, args, exec_args)
+        if args.generate == "quadlet/kube":
+            return self.quadlet_kube(model_path, args, exec_args)
+        return False
 
     def execute_command(self, model_path, exec_args, args):
         try:
@@ -526,19 +525,27 @@ def serve(self, args):
 
         self.execute_command(model_path, exec_args, args)
 
+    def kserve(self, model, args, exec_args):
+        kserve = Kserve(model, self.image, args, exec_args)
+        kserve.generate()
+        return True
+
     def quadlet(self, model, args, exec_args):
         quadlet = Quadlet(model, self.image, args, exec_args)
         quadlet.generate()
+        return True
 
     def quadlet_kube(self, model, args, exec_args):
         kube = Kube(model, self.image, args, exec_args)
         kube.generate()
         quadlet = Quadlet(model, self.image, args, exec_args)
         quadlet.kube()
+        return True
 
     def kube(self, model, args, exec_args):
         kube = Kube(model, self.image, args, exec_args)
         kube.generate()
+        return True
 
     def path(self, args):
         return self.model_path(args)
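
For reviewers who want to exercise the new generator without going through the full CLI, the sketch below drives `Kserve` from `ramalama/kserve.py` directly. It is illustrative only and assumes the patched ramalama tree is importable: `SimpleNamespace` stands in for the `argparse.Namespace` that `serve_parser()` would produce, and only the attributes the class actually reads (`MODEL`, `name`, `runtime`, `port`) are filled in with example values.

```
# Hypothetical standalone driver for ramalama/kserve.py (not part of the patch).
from types import SimpleNamespace

from ramalama.kserve import Kserve

# Stand-in for the argparse.Namespace that `ramalama serve` passes down;
# only the attributes read by Kserve.__init__() and generate() are set.
args = SimpleNamespace(
    MODEL="oci://quay.io/ramalama/granite:1.0",
    name=None,              # fall back to the basename of the image
    runtime="llama.cpp",    # becomes the ServingRuntime name prefix
    port=8081,
)

kserve = Kserve(
    "oci://quay.io/ramalama/granite:1.0",   # model
    "quay.io/ramalama/ramalama:latest",     # container image for the runtime
    args,
    exec_args=None,
)

# Writes granite-1.0-kserve-runtime.yaml and granite-1.0-kserve.yaml,
# which can then be imported with `kubectl create -f <file>`.
kserve.generate()
```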