-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add diagnostic tools for deployments * add ci check for bash scripts * update top-level README to include diagnostics dir * remove redundant shell openers * clarify log warnings
- Loading branch information
Showing
5 changed files
with
339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Diagnostics

This directory contains a collection of tools and scripts designed to help validate, monitor, and troubleshoot the deployment of Deepgram's self-hosted product.

## Usage

For detailed usage instructions and features of each script, please refer to the header comments within the respective script files.

## Contents

### 1. [dg_validate_nvidia_setup.sh](./dg_validate_nvidia_setup.sh)

This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman.

### 2. [dg_log_parser.sh](./dg_log_parser.sh)

This script analyzes log files from Deepgram self-hosted containers to identify common issues and provide troubleshooting suggestions.

Collecting log files for analysis will vary depending on your container orchestrator:

#### Docker
```bash
docker ps # Note the container ID of the relevant Deepgram container
docker logs <container_id> > dg_container.log 2>&1
```

#### Podman
```bash
podman ps # Note the container ID of the relevant Deepgram container
podman logs <container_id> > dg_container.log 2>&1
```

#### Kubernetes
```bash
kubectl get pods -n <namespace> # Note the name of the Pod containing the relevant Deepgram container
kubectl logs <pod_name> > dg_container.log 2>&1
```

## Getting Help

See the [Getting Help section](../README.md#getting-help) of the repo README.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
#!/bin/bash
#
# This script analyzes log files from Deepgram self-hosted containers to identify common
# issues and provide troubleshooting suggestions.
#
# ## Usage
# This script can analyze individual container logs by passing a single file as an argument.
# Additionally, it can analyze logs from containers deployed in the same environment
# by passing each log file as a separate argument. This can be useful for analyzing a
# paired API and Engine container.
#
# ```
# ./dg_log_parser.sh <logfile1> [logfile2] [logfile3] ...
# ```
#
# ## Supported Containers
# - API
# - Engine
# - License Proxy

set -euo pipefail

# ANSI color codes used to highlight findings in terminal output.
YELLOW='\033[0;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Print usage information and exit non-zero.
usage() {
  printf "Usage: %s <logfile1> [logfile2] [logfile3] ...\n" "$0"
  exit 1
}

# At least one log file must be supplied.
if [ $# -eq 0 ]; then
  usage
fi
|
||
# Scan a single log file for known Deepgram self-hosted error signatures and
# print a troubleshooting suggestion for each match.
#
# Globals:   YELLOW, RED, NC (read) - ANSI color codes
# Arguments: $1 - path to the log file to analyze
# Outputs:   human-readable warnings/errors with suggested fixes on stdout
# Returns:   0 if at least one problem was detected, 1 otherwise, so callers
#            can accumulate status with `if check_file_errors "$file"`.
check_file_errors() {
  local file="$1"
  local error_found=false
  local container_name="Deepgram"
  local bad_models

  # Identify which container produced this log from its module prefixes.
  if grep -q "stem::config:" "$file"; then
    container_name="API"
  elif grep -q "impeller::config:" "$file"; then
    container_name="Engine"
  elif grep -q "hermes::config:" "$file"; then
    container_name="Hermes"
  fi

  if grep -q "Configuration file not found at .* Falling back to default/bundled configuration" "$file"; then
    printf "%bWarning%b: Using default configuration for %s container.\n" "$YELLOW" "$NC" "$container_name"
    printf "If you intended to specify your own configuration file, ensure it is being properly mounted to the container.\n"
  fi

  if grep -q "Missing license configuration" "$file"; then
    printf "%bError%b: Missing API key for %s container.\n" "$RED" "$NC" "$container_name"
    printf "Suggested fix: Ensure that the environment variable \`DEEPGRAM_API_KEY\` is set within the container (usually via your Compose file or Helm chart).\n"
    error_found=true
  fi

  if grep -qE "Aegis request to .* failed" "$file"; then
    local target_url
    target_url=$(grep -oE "Aegis request to [^ ]+ failed" "$file" | head -n1 | cut -d' ' -f4)
    printf "%bError%b: Connection issue detected for %s container. Unable to connect/authenticate with License Server via %s\n" \
      "$RED" "$NC" "$container_name" "$target_url"

    if grep -qE "Aegis request to .* failed:.*dns error" "$file"; then
      printf "Suggested fix: Check DNS resolution for the target service.\n"
    elif grep -qE "Aegis request to .* failed.*401" "$file"; then
      printf "Suggested fix: Your API key is unauthorized. Check console.deepgram.com to ensure that your API key is active and has self-hosted access.\n"
      # A trailing newline was previously missing here, gluing the next
      # finding onto the end of this URL.
      printf "See https://developers.deepgram.com/docs/self-hosted-self-service-tutorial for details\n"
    # NOTE: this must be a group, not a bracket expression --
    # "[TimedOut|Connection refused]" matched any *single* listed character,
    # so this branch fired on nearly any failure line.
    elif grep -qE "Aegis request to .* failed:.*(TimedOut|Connection refused)" "$file"; then
      printf "Suggested fix: "
      if [[ "$target_url" == *license.deepgram.com* ]]; then
        printf "Verify egress traffic to license.deepgram.com is allow-listed by your firewall, and check network connectivity for your container.\n"
      else
        printf "Verify the License Proxy container is running and healthy\n"
      fi
    fi

    error_found=true
  fi

  # Engine fell back to CPU inference -- almost never intended.
  if grep -q "impeller::config: Using devices: CPU" "$file"; then
    printf "%bWarning%b: Engine container was unable to detect a GPU, and is running in CPU mode.\n" "$YELLOW" "$NC"
    printf "CPU mode is critically less efficient than using a GPU, and likely not intended. Ensure all GPU setup steps have been completed from the Deepgram developer documentation.\n"
    error_found=true
  elif grep -q "half_precision=false" "$file"; then
    printf "%bWarning%b: GPU not running in half precision mode. Inference efficiency will be significantly impacted with this setting disabled.\n" "$YELLOW" "$NC"
    printf "Most modern GPUs support half precision, but auto-detection of this capability may not be working.\n"
    error_found=true
  fi

  if grep -q "impeller::model_suppliers::autoload: Unable to read model search path" "$file"; then
    printf "%bError%b: Invalid models directory for %s container.\n" "$RED" "$NC" "$container_name"
    printf "Suggested fix: Ensure that your models are mounted properly to the container.\n"
    error_found=true
  fi

  if grep -q "Failed to load model" "$file"; then
    # Extract the unique "path=..." values. sed replaces the GNU-only
    # `grep -P ... \K` so the script also runs on BSD/macOS grep.
    bad_models=$(sed -n 's/.*Failed to load model.*path=\([^}]*\).*/\1/p' "$file" | sort -u)
    printf "%bWARNING%b: Some models could not be loaded by the %s container.\n" "$YELLOW" "$NC" "$container_name"
    printf "Suggested fix: Check each of the following files for corrupt downloads, and verify the model was delivered for the same project that issued your self-hosted API key.\n"
    # Read line-by-line so model paths containing spaces are not split apart.
    while IFS= read -r model; do
      printf " - %s\n" "$model"
    done <<<"$bad_models"
    error_found=true
  fi

  # Executes `true` (exit 0) when a problem was found, `false` otherwise.
  $error_found
}
|
||
# Analyze one or more log files: first scan each file individually, then run
# cross-file checks that depend on the combined, time-ordered event stream.
#
# Globals:   GREEN, RED, NC (read) - ANSI color codes
# Arguments: $@ - paths of the log files to analyze
# Outputs:   findings from check_file_errors plus cross-container diagnostics;
#            a success message is printed when nothing suspicious was found.
analyze_logs() {
  local log_files=("$@")
  local error_found=false
  local file line

  # Check each file individually for errors.
  for file in "${log_files[@]}"; do
    if check_file_errors "$file"; then
      error_found=true
    fi
  done

  # Merge all logs into one stream, stably sorted by timestamp (field 1), and
  # look for errors that span containers. Process substitution keeps the loop
  # in the current shell, so flags set inside it survive the loop -- the
  # previous pipeline form ran in a subshell and needed a temp file to smuggle
  # results out, and its `break` risked a SIGPIPE tripping `pipefail`.
  local engine_listening=false
  while IFS= read -r line; do
    if [[ $line =~ INFO\ impeller:\ Listening\ on\ http ]]; then
      engine_listening=true
    fi

    # The API container still cannot reach Engine even though Engine already
    # reported that it is listening -- almost certainly a networking issue.
    if [[ "$engine_listening" = true ]] && [[ $line =~ WARN\ impeller_info:\ stem::utils::impeller_info_actor:\ Unable\ to\ get\ model\ info\ from\ Engine\ with\ any\ drivers ]]; then
      printf "%bError%b: The API container was unable to connect to the Engine container, even after the Engine container successfully started.\n" "$RED" "$NC"
      printf "Suggested fix: Check your composition files, api.toml, and engine.toml files to ensure networking between the containers is configured correctly.\n"
      error_found=true
      break
    fi
  done < <(sort -s -k1 "${log_files[@]}")

  if [ "$error_found" = false ]; then
    # Separate printf calls: the old backslash-continued format string
    # embedded the continuation lines' leading indentation in the output.
    printf "%bNo problems detected from provided log files.%b " "$GREEN" "$NC"
    printf "If something is wrong with your deployment, there may be a different error that is not detected by this initial script. "
    printf "Contact your Deepgram Account Representative for further assistance.\n"
  fi
}

# The preamble already rejects zero arguments; this guard additionally keeps
# `sort` from falling back to reading stdin if the script is sourced bare.
if [ "$#" -gt 0 ]; then
  analyze_logs "$@"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/bin/bash | ||
# | ||
# This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman. | ||
# It performs a series of checks to ensure that your system is properly configured to run GPU-accelerated container workloads. | ||
# | ||
# This script supports Ubuntu (using dpkg) and RHEL-based distributions (using dnf). | ||
# | ||
# ## Usage | ||
# Run this script with root privileges: | ||
# ``` | ||
# sudo ./dg_validate_nvidia_setup.sh | ||
# ``` | ||
|
||
# Emit a message to stdout wrapped in ANSI red (used for failed checks).
error() {
  local red='\033[0;31m'
  local reset='\033[0m'
  printf "${red}%s${reset}\n" "$1"
}

# Emit a message to stdout wrapped in ANSI green (used for passing checks).
success() {
  local green='\033[0;32m'
  local reset='\033[0m'
  printf "${green}%s${reset}\n" "$1"
}

# Point the user at the relevant section of the self-hosted documentation.
# $1 - URL of the documentation section to reference
direct_to_documentation() {
  error "$(printf 'For details, see the Deepgram Self-Hosted documentation at:\n\t%s' "$1")"
}
|
||
# Detect the package query command (dpkg for Ubuntu/Debian, dnf for RHEL-based
# distros). Stored as an array so the multi-word dnf invocation is executed
# verbatim instead of relying on unquoted word-splitting of a string.
if command -v dpkg &>/dev/null; then
  package_query=(dpkg -s)
elif command -v dnf &>/dev/null; then
  package_query=(dnf list installed)
else
  error "Unsupported package manager. This script supports Ubuntu (dpkg) and RHEL-based distros (dnf)."
  exit 1
fi

# Check if NVIDIA drivers are installed correctly. A loaded nouveau module
# means the open-source driver is in use; a failing nvidia-smi means the
# proprietary driver is missing or broken.
if lsmod | grep -q nouveau; then
  error "Issue: Nouveau drivers are installed instead of NVIDIA drivers."
  error "Please install the correct NVIDIA drivers and blacklist the Nouveau drivers."
  direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#remove-nouveau-drivers"
  exit 1
elif ! nvidia-smi &>/dev/null; then
  error "Issue: NVIDIA drivers are not installed correctly or are corrupt."
  error "Please reinstall the NVIDIA drivers and ensure they are functioning properly."
  direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-nvidia-drivers"
  exit 1
else
  success "NVIDIA drivers are installed correctly."
fi

# Check if NVIDIA driver version is compatible with the most recent Deepgram
# self-hosted release. On multi-GPU hosts nvidia-smi prints one version line
# per GPU; the driver is system-wide, so the first line is sufficient (the
# old code fed all lines into the comparison).
MINIMUM_DRIVER_VERSION="530.30.02"
nvidia_driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1)
# sort -V orders version strings; if the minimum does not sort first, the
# installed driver is older than the minimum.
if [[ "$(printf '%s\n' "$nvidia_driver_version" "$MINIMUM_DRIVER_VERSION" | sort -V | head -n1)" != "$MINIMUM_DRIVER_VERSION" ]]; then
  error "Issue: The installed NVIDIA driver version is not compatible with the most recent Deepgram self-hosted release."
  error "Please install a driver on version $MINIMUM_DRIVER_VERSION+."
  direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#download-and-install-the-official-drivers"
  exit 1
else
  success "NVIDIA driver version is compatible with the most recent Deepgram self-hosted release."
fi

# Check if the NVIDIA container toolkit package is installed.
if ! "${package_query[@]}" nvidia-container-toolkit &>/dev/null; then
  error "Issue: NVIDIA container toolkit is not installed."
  error "Please install the NVIDIA container toolkit to enable GPU support in containers."
  direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-the-nvidia-container-runtime"
  exit 1
else
  success "NVIDIA container runtime is installed."
fi

# `command -v` replaces the non-portable `which` for engine detection.
if command -v docker &>/dev/null; then
  # Check if the NVIDIA container runtime is registered in Docker's config.
  if ! grep -q "nvidia" /etc/docker/daemon.json 2>/dev/null; then
    error "Issue: NVIDIA container runtime is not configured with Docker."
    error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#docker-1"
    exit 1
  fi
elif command -v podman &>/dev/null; then
  # Check if the NVIDIA container runtime is configured via CDI for Podman:
  # the spec file must exist, be readable, and be non-empty.
  CDI_SPEC_FILE="/etc/cdi/nvidia.yaml"

  if [ ! -f "$CDI_SPEC_FILE" ] || [ ! -r "$CDI_SPEC_FILE" ] || [ ! -s "$CDI_SPEC_FILE" ]; then
    error "Issue: NVIDIA container runtime is not configured with Podman."
    error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
    direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#podman-1"
    exit 1
  fi
else
  error "Did not detect 'docker' or 'podman' container engines."
  error "This script currently only supports these two approaches."
  direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-container-engine"
  exit 1
fi
success "NVIDIA container runtime is configured properly."

success $'\nYour instance appears to be ready to run GPU container workloads, such as Deepgram self-hosted products.'