New diagnostics dir (#45)
* add diagnostic tools for deployments

* add ci check for bash scripts

* update top-level README to include diagnostics dir

* remove redundant shell openers

* clarify log warnings
bd-g authored Oct 10, 2024
1 parent 721a0db commit 8c3a51d
Showing 5 changed files with 339 additions and 0 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/ci.yml
@@ -75,3 +75,47 @@ jobs:
exit 1
fi
fi
  diagnostic-scripts-check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Check for changes in diagnostic directory
        id: check_changes
        run: |
          git diff --name-only origin/${{ github.base_ref }}..HEAD -- diagnostics/*.sh > changed_files.txt
          if [ -s changed_files.txt ]; then
            echo "changes_detected=true" >> $GITHUB_OUTPUT
          else
            echo "changes_detected=false" >> $GITHUB_OUTPUT
          fi
      - name: Install ShellCheck
        if: steps.check_changes.outputs.changes_detected == 'true'
        run: sudo apt-get install -y shellcheck
      - name: Install shfmt
        if: steps.check_changes.outputs.changes_detected == 'true'
        run: |
          go install mvdan.cc/sh/v3/cmd/shfmt@latest
          echo "$HOME/go/bin" >> $GITHUB_PATH
      - name: Run ShellCheck
        if: steps.check_changes.outputs.changes_detected == 'true'
        run: |
          while IFS= read -r file; do
            if [[ "$file" == *.sh ]]; then
              shellcheck "$file"
            fi
          done < changed_files.txt
      - name: Run shfmt
        if: steps.check_changes.outputs.changes_detected == 'true'
        run: |
          while IFS= read -r file; do
            if [[ "$file" == *.sh ]]; then
              if ! shfmt -d "$file"; then
                echo "Error: $file is not correctly formatted. Run 'shfmt -w $file' to fix."
                exit 1
              fi
            fi
          done < changed_files.txt
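
To reproduce these checks locally before opening a pull request, a minimal equivalent of the two lint steps above (assuming `shellcheck` and `shfmt` are already installed) might look like:

```bash
# Lint and format-check all diagnostic scripts, mirroring the CI job above
shellcheck diagnostics/*.sh
shfmt -d diagnostics/*.sh # CI fails on any diff; run 'shfmt -w <file>' to apply the formatting
```
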
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@ Official resources for running [Deepgram](https://deepgram.com) in a [self-hoste
* [Helm Chart](charts/deepgram-self-hosted/README.md) for Kubernetes deployments
* [Docker Compose Files](./docker/README.md) for deploying with Docker
* [Podman Compose Files](./podman/README.md) for deploying with Podman
* [Diagnostic Tools and Scripts](./diagnostics/README.md) for troubleshooting deployments

## Documentation

37 changes: 37 additions & 0 deletions diagnostics/README.md
@@ -0,0 +1,37 @@
# Diagnostics

This directory contains a collection of tools and scripts designed to help validate, monitor, and troubleshoot the deployment of Deepgram's self-hosted product.

## Usage

For detailed usage instructions and features of each script, please refer to the header comments within the respective script files.

## Contents
### 1. [dg_validate_nvidia_setup.sh](./dg_validate_nvidia_setup.sh)

This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman.
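
Per the script's header comments, it should be run with root privileges:

```bash
sudo ./dg_validate_nvidia_setup.sh
```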

### 2. [dg_log_parser.sh](./dg_log_parser.sh)
This script analyzes log files from Deepgram self-hosted containers to identify common issues and provide troubleshooting suggestions.

Collecting log files for analysis will vary depending on your container orchestrator:

#### Docker
```bash
docker ps # Note the container ID of the relevant Deepgram container
docker logs <container_id> > dg_container.log 2>&1
```
#### Podman
```bash
podman ps # Note the container ID of the relevant Deepgram container
podman logs <container_id> > dg_container.log 2>&1
```
#### Kubernetes
```bash
kubectl get pods -n <namespace> # Note the name of the Pod containing the relevant Deepgram container
kubectl logs <pod_name> > dg_container.log 2>&1
```
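
Once a log file is collected, pass it to the parser. The file names here follow the examples above and are otherwise arbitrary; per the script's header comments, multiple log files from the same deployment (for example, a paired API and Engine container) can be analyzed in a single invocation:

```bash
./dg_log_parser.sh dg_container.log

# Analyzing paired API and Engine logs together (example file names)
./dg_log_parser.sh dg_api.log dg_engine.log
```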

## Getting Help

See the [Getting Help section](../README.md#getting-help) of the repo README.
155 changes: 155 additions & 0 deletions diagnostics/dg_log_parser.sh
@@ -0,0 +1,155 @@
#!/bin/bash
#
# This script analyzes log files from Deepgram self-hosted containers to identify common
# issues and provide troubleshooting suggestions.
#
# ## Usage
# This script can analyze individual container logs by passing a single file as an argument.
# Additionally, it can analyze logs from containers deployed in the same environment
# by passing each log file as a separate argument. This can be useful for analyzing a
# paired API and Engine container.
#
# ```
# ./dg_log_parser.sh <logfile1> [logfile2] [logfile3] ...
# ```
#
# ## Supported Containers
# - API
# - Engine
# - License Proxy

set -euo pipefail

YELLOW='\033[0;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color

usage() {
printf "Usage: %s <logfile1> [logfile2] [logfile3] ...\n" "$0"
exit 1
}

if [ $# -eq 0 ]; then
usage
fi

check_file_errors() {
local file="$1"
local error_found=false
local container_name="Deepgram"

if grep -q "stem::config:" "$file"; then
container_name="API"
elif grep -q "impeller::config:" "$file"; then
container_name="Engine"
elif grep -q "hermes::config:" "$file"; then
container_name="Hermes"
fi

if grep -q "Configuration file not found at .* Falling back to default/bundled configuration" "$file"; then
printf "%bWarning%b: Using default configuration for %s container.\n" "$YELLOW" "$NC" "$container_name"
printf "If you intended to specify your own configuration file, ensure it is being properly mounted to the container.\n"
fi

if grep -q "Missing license configuration" "$file"; then
printf "%bError%b: Missing API key for %s container.\n" "$RED" "$NC" "$container_name"
printf "Suggested fix: Ensure that the environment variable \`DEEPGRAM_API_KEY\` is set within the container (usually via your Compose file or Helm chart).\n"
error_found=true
fi

if grep -qE "^.*Aegis request to .* failed.*$" "$file"; then
local target_url
target_url=$(grep -oE "Aegis request to [^ ]+ failed" "$file" | head -n1 | cut -d' ' -f4)
printf "%bError%b: Connection issue detected for %s container. Unable to connect/authenticate with License Server via %s\n" \
"$RED" "$NC" "$container_name" "$target_url"

if grep -qE "^.*Aegis request to .* failed:.*dns error.*$" "$file"; then
printf "Suggested fix: Check DNS resolution for the target service.\n"
elif grep -qE "^.*Aegis request to .* failed.*401.*$" "$file"; then
printf "Suggested fix: Your API key is unauthorized. Check console.deepgram.com to ensure that your API key is active and has self-hosted access.\n"
printf "See https://developers.deepgram.com/docs/self-hosted-self-service-tutorial for details"
elif grep -qE "^.*Aegis request to .* failed:.*[TimedOut|Connection refused].*$" "$file"; then
printf "Suggested fix: "
if [[ "$target_url" =~ ^.*license.deepgram.com.*$ ]]; then
printf "Verify egress traffic to license.deepgram.com is allow-listed by your firewall, and check network connectivity for your container.\n"
else
printf "Verify the License Proxy container is running and healthy\n"
fi
fi

error_found=true
fi

if grep -q "impeller::config: Using devices: CPU" "$file"; then
printf "%bWarning%b: Engine container was unable to detect a GPU, and is running in CPU mode.\n" "$YELLOW" "$NC"
printf "CPU mode is critically less efficient than using a GPU, and likely not intended. Ensure all GPU setup steps have been completed from the Deepgram developer documentation.\n"
error_found=true
elif grep -q "half_precision=false" "$file"; then
printf "%bWarning%b: GPU not running in half precision mode. Inference efficiency will be significantly impacted with this setting disabled.\n" "$YELLOW" "$NC"
printf "Most modern GPUs support half precision, but auto-detection of this capability may not be working.\n"
error_found=true
fi

if grep -q "impeller::model_suppliers::autoload: Unable to read model search path" "$file"; then
printf "%bError%b: Invalid models directory for $container_name container.\n" "$RED" "$NC"
printf "Suggested fix: Ensure that your models are mounted properly to the container.\n"
error_found=true
fi

if grep -q "Failed to load model" "$file"; then
bad_models=$(grep -P ".*Failed to load model.*" "$file" | grep -oP 'path=\K[^}]*' | sort -u)
printf "%bWARNING%b: Some models could not be loaded by the $container_name container.\n" "$YELLOW" "$NC"
printf "Suggested fix: Check each of the following files for corrupt downloads, and verify the model was delivered for the same project that issued your self-hosted API key.\n"
for model in $bad_models; do
printf " - %s\n" "$model"
done
error_found=true
fi

$error_found
}

analyze_logs() {
local log_files=("$@")
local error_found=false

# Check each file individually for errors
for file in "${log_files[@]}"; do
if check_file_errors "$file"; then
error_found=true
fi
done

local temp_error_file
temp_error_file=$(mktemp)
local engine_listening=false
echo "false" >"$temp_error_file"
sort -k1 --stable "${log_files[@]}" | while IFS= read -r line; do
if [[ $line =~ ^.*INFO\ impeller:\ Listening\ on\ http.*$ ]]; then
engine_listening=true
fi

if [[ "$engine_listening" = true ]] && [[ $line =~ ^.*WARN\ impeller_info:\ stem::utils::impeller_info_actor:\ Unable\ to\ get\ model\ info\ from\ Engine\ with\ any\ drivers.*$ ]]; then
printf "%bError%b: The API container was unable to connect to the Engine container, even after the Engine container successfully started.\n" "$RED" "$NC"
printf "Suggested fix: Check your composition files, api.toml, and engine.toml files to ensure networking between the containers is configured correctly.\n"
echo "true" >"$temp_error_file"
break
fi
done

if [[ $(cat "$temp_error_file") == "true" ]]; then
error_found=true
fi
rm "$temp_error_file"

if [ "$error_found" = false ]; then
printf "%bNo problems detected from provided log files.%b \
If something is wrong with your deployment, there may be a different error that is not detected by this initial script. \
Contact your Deepgram Account Representative for further assistance.\n" \
"$GREEN" "$NC"
fi

}

analyze_logs "$@"
102 changes: 102 additions & 0 deletions diagnostics/dg_validate_nvidia_setup.sh
@@ -0,0 +1,102 @@
#!/bin/bash
#
# This script verifies the GPU environment and container runtime setup for Deepgram self-hosted products running with Docker or Podman.
# It performs a series of checks to ensure that your system is properly configured to run GPU-accelerated container workloads.
#
# This script supports Ubuntu (using dpkg) and RHEL-based distributions (using dnf).
#
# ## Usage
# Run this script with root privileges:
# ```
# sudo ./dg_validate_nvidia_setup.sh
# ```

# Function to display error messages in red
error() {
printf "\033[0;31m%s\033[0m\n" "$1"
}

# Function to display success messages in green
success() {
printf "\033[0;32m%s\033[0m\n" "$1"
}

direct_to_documentation() {
doc_string=$'For details, see the Deepgram Self-Hosted documentation at:\n\t'"$1"
error "$doc_string"
}

# Detect the package manager (dpkg for Ubuntu, dnf for RHEL-based distros)
if command -v dpkg &>/dev/null; then
package_manager="dpkg -s"
elif command -v dnf &>/dev/null; then
package_manager="dnf list installed"
else
error "Unsupported package manager. This script supports Ubuntu (dpkg) and RHEL-based distros (dnf)."
exit 1
fi

# Check if NVIDIA drivers are installed correctly
if lsmod | grep -q nouveau; then
error "Issue: Nouveau drivers are installed instead of NVIDIA drivers."
error "Please install the correct NVIDIA drivers and blacklist the Nouveau drivers."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#remove-nouveau-drivers"
exit 1
elif ! nvidia-smi &>/dev/null; then
error "Issue: NVIDIA drivers are not installed correctly or are corrupt."
error "Please reinstall the NVIDIA drivers and ensure they are functioning properly."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-nvidia-drivers"
exit 1
else
success "NVIDIA drivers are installed correctly."
fi

# Check if NVIDIA driver version is compatible with most recent Deepgram self-hosted release
MINIMUM_DRIVER_VERSION="530.30.02"
nvidia_driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
if [[ "$(printf '%s\n' "$nvidia_driver_version" "$MINIMUM_DRIVER_VERSION" | sort -V | head -n1)" != "$MINIMUM_DRIVER_VERSION" ]]; then
error "Issue: The installed NVIDIA driver version is not compatible with the most recent Deepgram self-hosted release."
error "Please install a driver on version $MINIMUM_DRIVER_VERSION+."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#download-and-install-the-official-drivers"
exit 1
else
success "NVIDIA driver version is compatible with the most recent Deepgram self-hosted release."
fi

# Check if NVIDIA container runtime is installed
if ! $package_manager nvidia-container-toolkit &>/dev/null; then
error "Issue: NVIDIA container toolkit is not installed."
error "Please install the NVIDIA container toolkit to enable GPU support in containers."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-the-nvidia-container-runtime"
exit 1
else
success "NVIDIA container runtime is installed."
fi

if which docker &>/dev/null; then
# Check if NVIDIA container runtime is configured with Docker
if ! grep -q "nvidia" /etc/docker/daemon.json 2>/dev/null; then
error "Issue: NVIDIA container runtime is not configured with Docker."
error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#docker-1"
exit 1
fi
elif which podman &>/dev/null; then
# Check if NVIDIA container runtime is configured with CDI for Podman
CDI_SPEC_FILE="/etc/cdi/nvidia.yaml"

if [ ! -f "$CDI_SPEC_FILE" ] || [ ! -r "$CDI_SPEC_FILE" ] || [ ! -s "$CDI_SPEC_FILE" ]; then
error "Issue: NVIDIA container runtime is not configured with Podman."
error "Please run the **Configuration** step for the 'nvidia-container-runtime'."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#podman-1"
exit 1
fi
else
error "Did not detect 'docker' or 'podman' container engines."
error "This script currently only supports these two approaches."
direct_to_documentation "https://developers.deepgram.com/docs/drivers-and-containerization-platforms#install-container-engine"
exit 1
fi
success "NVIDIA container runtime is configured properly."

success $'\nYour instance appears to be ready to run GPU container workloads, such as Deepgram self-hosted products.'
