feat: replace llama-cpp-python with llama.cpp (#83)
quitrk authored Jun 3, 2024
1 parent 476145b commit 338ef4a
Showing 18 changed files with 731 additions and 800 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "llama.cpp"]
path = llama.cpp
url = https://github.com/ggerganov/llama.cpp
24 changes: 18 additions & 6 deletions Dockerfile
@@ -15,16 +15,27 @@ COPY docker/rootfs/ /
RUN \
apt-dpkg-wrap apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 && \
apt-dpkg-wrap apt-get update && \
apt-dpkg-wrap apt-get install -y build-essential python3.11 python3.11-venv && \
apt-dpkg-wrap apt-get install -y wget build-essential python3.11 python3.11-venv && \
apt-cleanup

RUN \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.sh && \
sh cmake.sh --skip-license --prefix=/usr/local && \
rm cmake.sh

ENV LLAMA_CPP_RELEASE=b3070
COPY llama.cpp llama.cpp
RUN \
cd llama.cpp && \
rm -rf build && \
cmake -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUDA=ON -DLLAMA_NATIVE=OFF && \
cmake --build build --target server -j`getconf _NPROCESSORS_ONLN`

COPY requirements.txt /app/

WORKDIR /app

ENV \
CMAKE_ARGS="-DLLAMA_CUBLAS=ON -DLLAMA_NATIVE=OFF" \
FORCE_CMAKE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=on
ENV PIP_DISABLE_PIP_VERSION_CHECK=on

RUN \
python3.11 -m venv .venv && \
@@ -56,6 +67,7 @@ RUN \

# Copy virtual environment
COPY --chown=jitsi:jitsi --from=builder /app/.venv /app/.venv
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin/server /app/llama.cpp/server

# Copy application files
COPY --chown=jitsi:jitsi /skynet /app/skynet/
@@ -66,7 +78,7 @@ ENV \
# https://docs.python.org/3/using/cmdline.html#envvar-PYTHONDONTWRITEBYTECODE
PYTHONDONTWRITEBYTECODE=1 \
PYTHONPATH=/app \
LLAMA_PATH="/models/llama-2-7b-chat.Q4_K_M.gguf"
LLAMA_PATH="/models/llama-3-8b-instruct-Q8_0.gguf"

VOLUME [ "/models" ]

16 changes: 16 additions & 0 deletions README.md
@@ -16,12 +16,16 @@ It is comprised of specialized modules which can be enabled or disabled as needed
## Summaries Quickstart

```bash
# Init and update submodules if you haven't already; this pulls in llama.cpp, which provides the OpenAI-compatible API server
git submodule update --init

# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/server"

# start Redis
docker run -d --rm -p 6379:6379 redis
@@ -47,6 +51,18 @@ export WHISPER_MODEL_PATH="$HOME/models/streaming-whisper"
poetry install
./run.sh
```

## Testing docker changes
```bash
docker compose -f compose-dev.yaml up --build
docker cp $HOME/models/llama-3-8b-instruct-Q8_0.gguf skynet-web-1:/models
docker restart skynet-web-1

# localhost:8000 for Skynet APIs
# localhost:8001/metrics for Prometheus metrics
# localhost:8002 for llama.cpp web server GUI
```
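
Once the stack is up, the llama.cpp server on port 8002 exposes an OpenAI-compatible API, so a quick smoke test from Python might look like the sketch below. The `model` and `api_key` values are placeholders — the local server answers with whatever GGUF it loaded and does not validate the key:

```python
# Smoke test against the local llama.cpp OpenAI-compatible endpoint.
# Assumes the compose stack above is running with a model copied into /models.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:8002/v1', api_key='unused')
response = client.chat.completions.create(
    model='llama-3',  # placeholder; the server serves its loaded model
    messages=[{'role': 'user', 'content': 'Say hello in one sentence.'}],
)
print(response.choices[0].message.content)
```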

### Test it from GitHub Pages
Go to [Streaming Whisper Demo](https://jitsi.github.io/skynet/) to test your deployment from a browser

14 changes: 14 additions & 0 deletions compose-dev.yaml
@@ -0,0 +1,14 @@
services:
web:
build: .
environment:
- BYPASS_AUTHORIZATION=true
- REDIS_HOST=redis
platform: linux/amd64
ports:
- "8000:8000"
- "8001:8001"
- "8002:8002"
redis:
image: "redis:alpine"
platform: linux/amd64
1 change: 1 addition & 0 deletions llama.cpp
Submodule llama.cpp added at 549279
1,306 changes: 599 additions & 707 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -24,7 +24,6 @@ fastapi = "0.109"
fastapi-versionizer = "3.0.4"
faster-whisper = "0.10.1"
langchain = "0.1.7"
llama-cpp-python = {version = "0.2.74", extras = ["server"]}
prometheus-client = "0.19.0"
prometheus-fastapi-instrumentator = "6.1.0"
pydantic = "2.5.3"
66 changes: 30 additions & 36 deletions requirements.txt
@@ -1,41 +1,40 @@
aiofiles==23.2.1 ; python_version >= "3.11" and python_version < "3.12"
aiohttp==3.9.5 ; python_version >= "3.11" and python_version < "3.12"
aiosignal==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
annotated-types==0.6.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.3.0 ; python_version >= "3.11" and python_version < "3.12"
annotated-types==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.4.0 ; python_version >= "3.11" and python_version < "3.12"
async-lru==2.0.4 ; python_version >= "3.11" and python_version < "3.12"
async-timeout==4.0.3 ; python_version >= "3.11" and python_full_version <= "3.11.2"
attrs==23.2.0 ; python_version >= "3.11" and python_version < "3.12"
av==10.0.0 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.55 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.55 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.2.2 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.16.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
charset-normalizer==3.3.2 ; python_version >= "3.11" and python_version < "3.12"
click==8.1.7 ; python_version >= "3.11" and python_version < "3.12"
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
coloredlogs==15.0.1 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.5 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.7 ; python_version >= "3.11" and python_version < "3.12"
ctranslate2==3.24.0 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.4 ; python_version >= "3.11" and python_version < "3.12"
diskcache==5.6.3 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.6 ; python_version >= "3.11" and python_version < "3.12"
distro==1.9.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi-versionizer==3.0.4 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.109.0 ; python_version >= "3.11" and python_version < "3.12"
faster-whisper==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.13.1 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==23.5.26 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.14.0 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==24.3.25 ; python_version >= "3.11" and python_version < "3.12"
frozenlist==1.4.1 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.2.0 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.5.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version >= "3.11" and python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
h11==0.14.0 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.4 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.5 ; python_version >= "3.11" and python_version < "3.12"
httptools==0.6.1 ; python_version >= "3.11" and python_version < "3.12"
httpx==0.27.0 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.21.3 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.23.2 ; python_version >= "3.11" and python_version < "3.12"
humanfriendly==10.0 ; python_version >= "3.11" and python_version < "3.12"
idna==3.6 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.3 ; python_version >= "3.11" and python_version < "3.12"
idna==3.7 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
jsonpatch==1.33 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==2.4 ; python_version >= "3.11" and python_version < "3.12"
@@ -44,55 +43,50 @@ langchain-core==0.1.23 ; python_version >= "3.11" and python_version < "3.12"
langchain-openai==0.0.6 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.1.7 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.0.87 ; python_version >= "3.11" and python_version < "3.12"
llama-cpp-python[server]==0.2.74 ; python_version >= "3.11" and python_version < "3.12"
markupsafe==2.1.5 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.1 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.2 ; python_version >= "3.11" and python_version < "3.12"
mpmath==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
multidict==6.0.5 ; python_version >= "3.11" and python_version < "3.12"
mypy-extensions==1.0.0 ; python_version >= "3.11" and python_version < "3.12"
natsort==8.4.0 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.2.1 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.3 ; python_version >= "3.11" and python_version < "3.12"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.17.1 ; python_version >= "3.11" and python_version < "3.12"
openai==1.13.3 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.0 ; python_version >= "3.11" and python_version < "3.12"
openai==1.30.3 ; python_version >= "3.11" and python_version < "3.12"
packaging==23.2 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==4.25.3 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.21 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
protobuf==5.27.0 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
pydantic-core==2.14.6 ; python_version >= "3.11" and python_version < "3.12"
pydantic-settings==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
pydantic==2.5.3 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.8.0 ; python_version >= "3.11" and python_version < "3.12"
pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.11" and python_version < "3.12"
python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "3.12"
python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.1 ; python_version >= "3.11" and python_version < "3.12"
redis==5.0.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2023.12.25 ; python_version >= "3.11" and python_version < "3.12"
requests==2.31.0 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.0 ; python_version >= "3.11" and python_version < "3.12"
setuptools==69.1.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.5.15 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.2 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
setuptools==70.0.0 ; python_version >= "3.11" and python_version < "3.12"
six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.28 ; python_version >= "3.11" and python_version < "3.12"
sse-starlette==2.0.0 ; python_version >= "3.11" and python_version < "3.12"
starlette-context==0.3.6 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.30 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.12 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.2.3 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.6.0 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.3.0 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
tokenizers==0.15.2 ; python_version >= "3.11" and python_version < "3.12"
torch==2.0.1 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.0.2 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.2 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.10.0 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.4 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.0 ; python_version >= "3.11" and python_version < "3.12"
typing-inspect==0.9.0 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.0.7 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.1.12 ; python_version >= "3.11" and python_version < "3.12"
uvicorn==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvicorn[standard]==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvloop==0.19.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_version >= "3.11" and python_version < "3.12"
watchfiles==0.21.0 ; python_version >= "3.11" and python_version < "3.12"
watchfiles==0.22.0 ; python_version >= "3.11" and python_version < "3.12"
websockets==12.0 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.9.4 ; python_version >= "3.11" and python_version < "3.12"
5 changes: 4 additions & 1 deletion run.sh
@@ -1,3 +1,6 @@
#!/bin/sh
cd llama.cpp
make server
cd ..

exec poetry run python skynet/main.py
poetry run python -m uvicorn skynet.main:app --reload
2 changes: 1 addition & 1 deletion skynet/auth/openai.py
@@ -40,4 +40,4 @@ async def setup_credentials():


def get_credentials(customer_id):
return credentials.get(customer_id, {})
return credentials.get(customer_id, {}) or {}
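
The trailing `or {}` matters because `dict.get` only falls back to its default when the key is missing, not when the stored value is `None`. A small illustration of the difference:

```python
# dict.get's default does not apply when the key exists with a falsy value.
credentials = {'customer-a': None}

credentials.get('customer-a', {})        # -> None (key present, default unused)
credentials.get('customer-a', {}) or {}  # -> {}   (falsy value coerced to a dict)
```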
9 changes: 4 additions & 5 deletions skynet/env.py
@@ -19,22 +19,21 @@ def tobool(val: str | None):

# general
log_level = os.environ.get('LOG_LEVEL', 'DEBUG').strip().upper()
supported_modules = {'summaries:dispatcher', 'summaries:executor', 'openai-api', 'streaming_whisper'}
supported_modules = {'summaries:dispatcher', 'summaries:executor', 'streaming_whisper'}
enabled_modules = set(os.environ.get('ENABLED_MODULES', 'summaries:dispatcher,summaries:executor').split(','))
modules = supported_modules.intersection(enabled_modules)
file_refresh_interval = int(os.environ.get('FILE_REFRESH_INTERVAL', 30))

# models

# Some formats are auto-detected: https://github.com/abetlen/llama-cpp-python/blob/c50d3300d2a09c98765be7f2c05b7e4fd0b4232e/llama_cpp/llama_chat_format.py#L724
model_chat_format = os.environ.get('MODEL_CHAT_FORMAT')
llama_path = os.environ.get('LLAMA_PATH')
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 8192))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 40))
llama_n_batch = int(os.environ.get('LLAMA_N_BATCH', 512))

# openai api
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', 'http://localhost:8000/openai-api/v1')
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/server')
openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8002))
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1')

# openai
openai_credentials_file = os.environ.get('SKYNET_CREDENTIALS_PATH')
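
Note how the new defaults compose: `openai_api_base_url` is derived from `openai_api_server_port`, so overriding just the port also moves the default base URL. A small sketch of the resolution order, using the variable names from the diff above:

```python
# Overriding only the port shifts the derived base URL; setting
# OPENAI_API_BASE_URL explicitly always wins.
import os

os.environ['OPENAI_API_SERVER_PORT'] = '9100'

openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8002))
openai_api_base_url = os.environ.get(
    'OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1'
)
print(openai_api_base_url)  # http://localhost:9100/v1
```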
4 changes: 0 additions & 4 deletions skynet/index.html
@@ -4,10 +4,6 @@
</head>
<body>
<h1>Skynet</h1>
<ul>
<li>
<a href="/openai-api">OpenAI compatible API</a>
</li>
<li>
<a href="/summaries/docs">Summaries API</a>
</li>
7 changes: 0 additions & 7 deletions skynet/main.py
@@ -37,17 +37,10 @@ async def lifespan(main_app: FastAPI):
await summaries_startup()

if 'summaries:executor' in modules:
from skynet.modules.ttt.openai_api.app import app as openai_api_app
from skynet.modules.ttt.summaries.app import executor_startup as executor_startup

main_app.mount('/openai-api', openai_api_app)
await executor_startup()

if 'openai-api' in modules: # init this one last in order to not wait for the model to load if setup fails
from skynet.modules.ttt.openai_api.app import app as openai_api_app

main_app.mount('/openai-api', openai_api_app)

yield

log.info('Skynet is shutting down')
8 changes: 0 additions & 8 deletions skynet/metrics.py
@@ -5,7 +5,6 @@
from skynet.modules.monitoring import (
instrumentator,
PROMETHEUS_NAMESPACE,
PROMETHEUS_OPENAI_API_SUBSYSTEM,
PROMETHEUS_STREAMING_WHISPER_SUBSYSTEM,
PROMETHEUS_SUMMARIES_SUBSYSTEM,
)
@@ -34,13 +33,6 @@ async def autoscaler_metrics():
queue_size = await db.llen(PENDING_JOBS_KEY)
return {'queueSize': queue_size}

if 'openai-api' in modules:
from skynet.modules.ttt.openai_api.app import app as openai_api_app

instrumentator.instrument(
openai_api_app, metric_namespace=PROMETHEUS_NAMESPACE, metric_subsystem=PROMETHEUS_OPENAI_API_SUBSYSTEM
).expose(metrics)

if 'summaries:dispatcher' in modules:
from skynet.modules.ttt.summaries.app import app as summaries_app

1 change: 0 additions & 1 deletion skynet/modules/monitoring.py
@@ -3,7 +3,6 @@

PROMETHEUS_NAMESPACE = 'Skynet'
PROMETHEUS_SUMMARIES_SUBSYSTEM = 'Summaries'
PROMETHEUS_OPENAI_API_SUBSYSTEM = 'OpenAI_API'
PROMETHEUS_STREAMING_WHISPER_SUBSYSTEM = 'Streaming_Whisper'

REDIS_CONNECTION_STATUS = Gauge(