feat: replace llama-cpp-python with llama.cpp (#83)
quitrk authored Jun 3, 2024
1 parent 476145b commit 338ef4a
Showing 18 changed files with 731 additions and 800 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "llama.cpp"]
path = llama.cpp
url = https://github.com/ggerganov/llama.cpp
24 changes: 18 additions & 6 deletions Dockerfile
@@ -15,16 +15,27 @@ COPY docker/rootfs/ /
RUN \
apt-dpkg-wrap apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 && \
apt-dpkg-wrap apt-get update && \
apt-dpkg-wrap apt-get install -y build-essential python3.11 python3.11-venv && \
apt-dpkg-wrap apt-get install -y wget build-essential python3.11 python3.11-venv && \
apt-cleanup

RUN \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.sh && \
sh cmake.sh --skip-license --prefix=/usr/local && \
rm cmake.sh

ENV LLAMA_CPP_RELEASE=b3070
COPY llama.cpp llama.cpp
RUN \
cd llama.cpp && \
rm -rf build && \
cmake -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUDA=ON -DLLAMA_NATIVE=OFF && \
cmake --build build --target server -j`getconf _NPROCESSORS_ONLN`

COPY requirements.txt /app/

WORKDIR /app

ENV \
CMAKE_ARGS="-DLLAMA_CUBLAS=ON -DLLAMA_NATIVE=OFF" \
FORCE_CMAKE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=on
ENV PIP_DISABLE_PIP_VERSION_CHECK=on

RUN \
python3.11 -m venv .venv && \
@@ -56,6 +67,7 @@ RUN \

# Copy virtual environment
COPY --chown=jitsi:jitsi --from=builder /app/.venv /app/.venv
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin/server /app/llama.cpp/server

# Copy application files
COPY --chown=jitsi:jitsi /skynet /app/skynet/
@@ -66,7 +78,7 @@ ENV \
# https://docs.python.org/3/using/cmdline.html#envvar-PYTHONDONTWRITEBYTECODE
PYTHONDONTWRITEBYTECODE=1 \
PYTHONPATH=/app \
LLAMA_PATH="/models/llama-2-7b-chat.Q4_K_M.gguf"
LLAMA_PATH="/models/llama-3-8b-instruct-Q8_0.gguf"

VOLUME [ "/models" ]

16 changes: 16 additions & 0 deletions README.md
@@ -16,12 +16,16 @@ It is comprised of specialized modules which can be enabled or disabled as needed
## Summaries Quickstart

```bash
# Init and update submodules if you haven't already; this pulls in llama.cpp, which provides the OpenAI-compatible API server
git submodule update --init

# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/server"

# start Redis
docker run -d --rm -p 6379:6379 redis
@@ -47,6 +51,18 @@ export WHISPER_MODEL_PATH="$HOME/models/streaming-whisper"
poetry install
./run.sh
```

## Testing docker changes
```bash
docker compose -f compose-dev.yaml up --build
docker cp $HOME/models/llama-3-8b-instruct-Q8_0.gguf skynet-web-1:/models
docker restart skynet-web-1

# localhost:8000 for Skynet APIs
# localhost:8001/metrics for Prometheus metrics
# localhost:8002 for llama.cpp web server GUI
```
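
Once the stack is up, the llama.cpp server on port 8002 exposes an OpenAI-compatible API, so a quick smoke test from Python might look like the sketch below. The `model` and `api_key` values are placeholders — the local server answers with whatever GGUF it loaded and does not validate the key:

```python
# Smoke test against the local llama.cpp OpenAI-compatible endpoint.
# Assumes the compose stack above is running with a model copied into /models.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:8002/v1', api_key='unused')
response = client.chat.completions.create(
    model='llama-3',  # placeholder; the server serves its loaded model
    messages=[{'role': 'user', 'content': 'Say hello in one sentence.'}],
)
print(response.choices[0].message.content)
```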

### Test it from GitHub Pages
Go to [Streaming Whisper Demo](https://jitsi.github.io/skynet/) to test your deployment from a browser

14 changes: 14 additions & 0 deletions compose-dev.yaml
@@ -0,0 +1,14 @@
services:
web:
build: .
environment:
- BYPASS_AUTHORIZATION=true
- REDIS_HOST=redis
platform: linux/amd64
ports:
- "8000:8000"
- "8001:8001"
- "8002:8002"
redis:
image: "redis:alpine"
platform: linux/amd64
1 change: 1 addition & 0 deletions llama.cpp
Submodule llama.cpp added at 549279
1,306 changes: 599 additions & 707 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -24,7 +24,6 @@ fastapi = "0.109"
fastapi-versionizer = "3.0.4"
faster-whisper = "0.10.1"
langchain = "0.1.7"
llama-cpp-python = {version = "0.2.74", extras = ["server"]}
prometheus-client = "0.19.0"
prometheus-fastapi-instrumentator = "6.1.0"
pydantic = "2.5.3"
66 changes: 30 additions & 36 deletions requirements.txt
@@ -1,41 +1,40 @@
aiofiles==23.2.1 ; python_version >= "3.11" and python_version < "3.12"
aiohttp==3.9.5 ; python_version >= "3.11" and python_version < "3.12"
aiosignal==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
annotated-types==0.6.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.3.0 ; python_version >= "3.11" and python_version < "3.12"
annotated-types==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.4.0 ; python_version >= "3.11" and python_version < "3.12"
async-lru==2.0.4 ; python_version >= "3.11" and python_version < "3.12"
async-timeout==4.0.3 ; python_version >= "3.11" and python_full_version <= "3.11.2"
attrs==23.2.0 ; python_version >= "3.11" and python_version < "3.12"
av==10.0.0 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.55 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.55 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.2.2 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.16.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
charset-normalizer==3.3.2 ; python_version >= "3.11" and python_version < "3.12"
click==8.1.7 ; python_version >= "3.11" and python_version < "3.12"
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
coloredlogs==15.0.1 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.5 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.7 ; python_version >= "3.11" and python_version < "3.12"
ctranslate2==3.24.0 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.4 ; python_version >= "3.11" and python_version < "3.12"
diskcache==5.6.3 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.6 ; python_version >= "3.11" and python_version < "3.12"
distro==1.9.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi-versionizer==3.0.4 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.109.0 ; python_version >= "3.11" and python_version < "3.12"
faster-whisper==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.13.1 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==23.5.26 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.14.0 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==24.3.25 ; python_version >= "3.11" and python_version < "3.12"
frozenlist==1.4.1 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.2.0 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.5.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version >= "3.11" and python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
h11==0.14.0 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.4 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.5 ; python_version >= "3.11" and python_version < "3.12"
httptools==0.6.1 ; python_version >= "3.11" and python_version < "3.12"
httpx==0.27.0 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.21.3 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.23.2 ; python_version >= "3.11" and python_version < "3.12"
humanfriendly==10.0 ; python_version >= "3.11" and python_version < "3.12"
idna==3.6 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.3 ; python_version >= "3.11" and python_version < "3.12"
idna==3.7 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
jsonpatch==1.33 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==2.4 ; python_version >= "3.11" and python_version < "3.12"
@@ -44,55 +43,50 @@ langchain-core==0.1.23 ; python_version >= "3.11" and python_version < "3.12"
langchain-openai==0.0.6 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.1.7 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.0.87 ; python_version >= "3.11" and python_version < "3.12"
llama-cpp-python[server]==0.2.74 ; python_version >= "3.11" and python_version < "3.12"
markupsafe==2.1.5 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.1 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.2 ; python_version >= "3.11" and python_version < "3.12"
mpmath==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
multidict==6.0.5 ; python_version >= "3.11" and python_version < "3.12"
mypy-extensions==1.0.0 ; python_version >= "3.11" and python_version < "3.12"
natsort==8.4.0 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.2.1 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.3 ; python_version >= "3.11" and python_version < "3.12"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.17.1 ; python_version >= "3.11" and python_version < "3.12"
openai==1.13.3 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.0 ; python_version >= "3.11" and python_version < "3.12"
openai==1.30.3 ; python_version >= "3.11" and python_version < "3.12"
packaging==23.2 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==4.25.3 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.21 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
protobuf==5.27.0 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
pydantic-core==2.14.6 ; python_version >= "3.11" and python_version < "3.12"
pydantic-settings==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
pydantic==2.5.3 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.8.0 ; python_version >= "3.11" and python_version < "3.12"
pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.11" and python_version < "3.12"
python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "3.12"
python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.1 ; python_version >= "3.11" and python_version < "3.12"
redis==5.0.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2023.12.25 ; python_version >= "3.11" and python_version < "3.12"
requests==2.31.0 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.0 ; python_version >= "3.11" and python_version < "3.12"
setuptools==69.1.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.5.15 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.2 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
setuptools==70.0.0 ; python_version >= "3.11" and python_version < "3.12"
six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.28 ; python_version >= "3.11" and python_version < "3.12"
sse-starlette==2.0.0 ; python_version >= "3.11" and python_version < "3.12"
starlette-context==0.3.6 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.30 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.12 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.2.3 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.6.0 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.3.0 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
tokenizers==0.15.2 ; python_version >= "3.11" and python_version < "3.12"
torch==2.0.1 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.0.2 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.2 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.10.0 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.4 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.0 ; python_version >= "3.11" and python_version < "3.12"
typing-inspect==0.9.0 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.0.7 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.1.12 ; python_version >= "3.11" and python_version < "3.12"
uvicorn==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvicorn[standard]==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvloop==0.19.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_version >= "3.11" and python_version < "3.12"
watchfiles==0.21.0 ; python_version >= "3.11" and python_version < "3.12"
watchfiles==0.22.0 ; python_version >= "3.11" and python_version < "3.12"
websockets==12.0 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.9.4 ; python_version >= "3.11" and python_version < "3.12"
5 changes: 4 additions & 1 deletion run.sh
@@ -1,3 +1,6 @@
#!/bin/sh
cd llama.cpp
make server
cd ..

exec poetry run python skynet/main.py
poetry run python -m uvicorn skynet.main:app --reload
2 changes: 1 addition & 1 deletion skynet/auth/openai.py
@@ -40,4 +40,4 @@ async def setup_credentials():


def get_credentials(customer_id):
return credentials.get(customer_id, {})
return credentials.get(customer_id, {}) or {}
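
The trailing `or {}` matters because `dict.get` only falls back to its default when the key is missing, not when the stored value is `None`. A small illustration of the difference:

```python
# dict.get's default does not apply when the key exists with a falsy value.
credentials = {'customer-a': None}

credentials.get('customer-a', {})        # -> None (key present, default unused)
credentials.get('customer-a', {}) or {}  # -> {}   (falsy value coerced to a dict)
```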
9 changes: 4 additions & 5 deletions skynet/env.py
@@ -19,22 +19,21 @@ def tobool(val: str | None):

# general
log_level = os.environ.get('LOG_LEVEL', 'DEBUG').strip().upper()
supported_modules = {'summaries:dispatcher', 'summaries:executor', 'openai-api', 'streaming_whisper'}
supported_modules = {'summaries:dispatcher', 'summaries:executor', 'streaming_whisper'}
enabled_modules = set(os.environ.get('ENABLED_MODULES', 'summaries:dispatcher,summaries:executor').split(','))
modules = supported_modules.intersection(enabled_modules)
file_refresh_interval = int(os.environ.get('FILE_REFRESH_INTERVAL', 30))

# models

# Some formats are auto-detected: https://github.com/abetlen/llama-cpp-python/blob/c50d3300d2a09c98765be7f2c05b7e4fd0b4232e/llama_cpp/llama_chat_format.py#L724
model_chat_format = os.environ.get('MODEL_CHAT_FORMAT')
llama_path = os.environ.get('LLAMA_PATH')
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 8192))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 40))
llama_n_batch = int(os.environ.get('LLAMA_N_BATCH', 512))

# openai api
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', 'http://localhost:8000/openai-api/v1')
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/server')
openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8002))
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1')

# openai
openai_credentials_file = os.environ.get('SKYNET_CREDENTIALS_PATH')
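
Note how the new defaults compose: `openai_api_base_url` is derived from `openai_api_server_port`, so overriding just the port also moves the default base URL. A small sketch of the resolution order, using the variable names from the diff above:

```python
# Overriding only the port shifts the derived base URL; setting
# OPENAI_API_BASE_URL explicitly always wins.
import os

os.environ['OPENAI_API_SERVER_PORT'] = '9100'

openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8002))
openai_api_base_url = os.environ.get(
    'OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1'
)
print(openai_api_base_url)  # http://localhost:9100/v1
```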
4 changes: 0 additions & 4 deletions skynet/index.html
@@ -4,10 +4,6 @@
</head>
<body>
<h1>Skynet</h1>
<ul>
<li>
<a href="/openai-api">OpenAI compatible API</a>
</li>
<li>
<a href="/summaries/docs">Summaries API</a>
</li>
7 changes: 0 additions & 7 deletions skynet/main.py
@@ -37,17 +37,10 @@ async def lifespan(main_app: FastAPI):
await summaries_startup()

if 'summaries:executor' in modules:
from skynet.modules.ttt.openai_api.app import app as openai_api_app
from skynet.modules.ttt.summaries.app import executor_startup as executor_startup

main_app.mount('/openai-api', openai_api_app)
await executor_startup()

if 'openai-api' in modules: # init this one last in order to not wait for the model to load if setup fails
from skynet.modules.ttt.openai_api.app import app as openai_api_app

main_app.mount('/openai-api', openai_api_app)

yield

log.info('Skynet is shutting down')
8 changes: 0 additions & 8 deletions skynet/metrics.py
@@ -5,7 +5,6 @@
from skynet.modules.monitoring import (
instrumentator,
PROMETHEUS_NAMESPACE,
PROMETHEUS_OPENAI_API_SUBSYSTEM,
PROMETHEUS_STREAMING_WHISPER_SUBSYSTEM,
PROMETHEUS_SUMMARIES_SUBSYSTEM,
)
@@ -34,13 +33,6 @@ async def autoscaler_metrics():
queue_size = await db.llen(PENDING_JOBS_KEY)
return {'queueSize': queue_size}

if 'openai-api' in modules:
from skynet.modules.ttt.openai_api.app import app as openai_api_app

instrumentator.instrument(
openai_api_app, metric_namespace=PROMETHEUS_NAMESPACE, metric_subsystem=PROMETHEUS_OPENAI_API_SUBSYSTEM
).expose(metrics)

if 'summaries:dispatcher' in modules:
from skynet.modules.ttt.summaries.app import app as summaries_app

1 change: 0 additions & 1 deletion skynet/modules/monitoring.py
@@ -3,7 +3,6 @@

PROMETHEUS_NAMESPACE = 'Skynet'
PROMETHEUS_SUMMARIES_SUBSYSTEM = 'Summaries'
PROMETHEUS_OPENAI_API_SUBSYSTEM = 'OpenAI_API'
PROMETHEUS_STREAMING_WHISPER_SUBSYSTEM = 'Streaming_Whisper'

REDIS_CONNECTION_STATUS = Gauge(