1 change: 1 addition & 0 deletions README.md
@@ -356,6 +356,7 @@ used to overwrite the defaults.
- `RF_PID_FILE` - File to store process ids of started services (default: ${RF_HOME}/rapidfire_pids.txt)
- `RF_PYTHON_EXECUTABLE` - Python executable (default: python3 falls back to python if not found)
- `RF_PIP_EXECUTABLE` - pip executable (default: pip3 falls back to pip if not found)
- `RF_CONVERGE_MODE` - Whether to start the Rapidfire AI Converge frontend and backend when available: `all`, `none`, `backend`, or `frontend` (default: all)

## Community & Governance

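The new variable is also settable from the CLI (see the `--converge` flag below). A minimal usage sketch, assuming the package installs a `rapidfireai` console entry point:

```bash
# Export the variable before launching the start script, or rely on the
# --converge CLI flag, which sets RF_CONVERGE_MODE for setup/start.sh.
export RF_CONVERGE_MODE=backend   # one of: all | none | backend | frontend
rapidfireai start                 # assumed entry-point name for cli.py:main()
```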
10 changes: 10 additions & 0 deletions rapidfireai/cli.py
@@ -456,6 +456,13 @@ def main():

parser.add_argument("--log-lines", type=int, default=10, help="Number of lines to log to the console")

parser.add_argument(
"--converge",
choices=["all", "none", "backend", "frontend"],
default="all",
help="Converge mode: all (default, start converge backend+frontend), none (use original frontend, do not start converge), backend (only converge backend), frontend (only converge frontend)",
)

args = parser.parse_args()

# Set environment variables from CLI args
@@ -481,6 +488,9 @@
if args.force:
os.environ["RF_FORCE"] = "true"

# Converge mode (all|none|backend|frontend) for start script
os.environ["RF_CONVERGE_MODE"] = args.converge

# Handle doctor command separately
if args.command == "doctor":
return run_doctor(args.log_lines)
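Hypothetical invocations of the new flag, assuming the same `rapidfireai` entry point (only `--converge` and `--log-lines` appear in this diff; the rest of the command line is unchanged):

```bash
rapidfireai start --converge all        # default: Converge backend + frontend (if rapidfireai-pro is installed)
rapidfireai start --converge none       # keep the original frontend, never start Converge
rapidfireai start --converge backend    # Converge backend only; requires rapidfireai-pro
rapidfireai start --converge frontend   # Converge frontend only; requires rapidfireai-pro
rapidfireai doctor --log-lines 20       # diagnostic report, showing the last 20 log lines
```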
3 changes: 3 additions & 0 deletions rapidfireai/utils/doctor.py
@@ -62,6 +62,7 @@ def get_doctor_info(log_lines: int = 10):
"mlflow",
"torch",
"transformers",
"protobuf",
"flask",
"gunicorn",
"peft",
@@ -78,6 +79,8 @@
"langchain-openai",
"langchain-huggingface",
"langchain-classic",
"langchain-pinecone",
"langchain-postgres",
"unstructured",
"waitress",
"vllm",
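The doctor report now covers these packages as well; a manual spot-check of the same additions, assuming `pip3` as in the `RF_PIP_EXECUTABLE` default:

```bash
pip3 show protobuf langchain-pinecone langchain-postgres | grep -E '^(Name|Version):'
```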
4 changes: 2 additions & 2 deletions setup/evals/requirements-colab.txt
@@ -45,5 +45,5 @@ flask-cors>=5.0.1
# Logging
loguru


numpy==2.0.1
numpy==2.0.1
protobuf<6.0.0
3 changes: 3 additions & 0 deletions setup/evals/requirements-local.txt
@@ -41,3 +41,6 @@ mlflow>=3.2.0
gunicorn>=23.0.0
flask-cors>=5.0.1
loguru

numpy==2.0.1
protobuf<6.0.0
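Both requirements files now end up with `numpy==2.0.1` pinned and `protobuf` capped below 6.0. A quick way to confirm the resolved versions, with `python3` standing in for the `RF_PYTHON_EXECUTABLE` default:

```bash
python3 - <<'EOF'
import numpy
import google.protobuf as pb
print("numpy   ", numpy.__version__)   # expected: 2.0.1
print("protobuf", pb.__version__)      # expected: a 5.x release (< 6.0.0)
EOF
```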
100 changes: 94 additions & 6 deletions setup/start.sh
@@ -4,6 +4,7 @@
# This script starts MLflow server, API server, and frontend tracking server
# Used for pip-installed package mode


set -e # Exit on any error

# Configuration
@@ -29,6 +30,16 @@ RF_LOG_PATH="${RF_LOG_PATH:=$RF_HOME/logs}"

RF_TIMEOUT_TIME=${RF_TIMEOUT_TIME:=30}

# Converge mode: all (backend+frontend), none (original frontend only), backend, frontend
RF_CONVERGE_MODE=${RF_CONVERGE_MODE:=all}
case "$RF_CONVERGE_MODE" in
all|none|backend|frontend) ;;
*)
echo "Invalid RF_CONVERGE_MODE=$RF_CONVERGE_MODE (expected: all, none, backend, frontend)"
exit 1
;;
esac

# Colab mode configuration
if [ -z "${COLAB_GPU+x}" ]; then
RF_MLFLOW_ENABLED=${RF_MLFLOW_ENABLED:=true}
@@ -96,6 +107,11 @@ print_warning() {
echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
}

# Return 0 if the rapidfireai-pro pip package is installed
has_rapidfireai_pro() {
${RF_PIP_EXECUTABLE} show rapidfireai-pro >/dev/null 2>&1
}

# Function to setup Python environment
setup_python_env() {
print_status "Setting up Python environment..."
@@ -182,6 +198,9 @@ cleanup() {
pkill -f "gunicorn.*rapidfireai.$RAPIDFIRE_MODE.dispatcher" 2>/dev/null || true
# Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
# Stop Converge if it was running
pkill -f "converge start" 2>/dev/null || true
pkill -f "uvicorn.*main:app" 2>/dev/null || true
fi

print_success "All services stopped"
@@ -575,6 +594,52 @@ start_frontend() {
return 0
}

# Function to start Converge via converge CLI (mode: all | backend | frontend)
start_converge() {
local mode="${1:-$RF_CONVERGE_MODE}"
print_status "Starting Converge ($mode)..."

# converge start runs in the foreground with its own monitor loop,
# so we launch it in the background and track it like other services.
print_status "Converge logs will be written to: $RF_LOG_PATH/converge.log"

local converge_args="--force"
case "$mode" in
all) ;;
backend) converge_args="$converge_args backend" ;;
frontend) converge_args="$converge_args frontend" ;;
*) converge_args="$converge_args" ;;
esac

if command -v setsid &> /dev/null; then
setsid converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 &
else
nohup converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 &
fi

local converge_pid=$!
echo "$converge_pid Converge" >> "$RF_PID_FILE"

# When starting full stack or frontend, wait for frontend port; backend-only may not serve it
if [[ "$mode" == "backend" ]]; then
print_success "Converge backend started (PID: $converge_pid)"
return 0
fi

if wait_for_service $RF_FRONTEND_HOST $RF_FRONTEND_PORT "Converge frontend" $RF_TIMEOUT_TIME; then
print_success "Converge started (PID: $converge_pid)"
return 0
else
print_error "Converge failed to start. Checking logs..."
if [[ -f "$RF_LOG_PATH/converge.log" ]]; then
echo "=== Last 30 lines of converge.log ==="
tail -30 "$RF_LOG_PATH/converge.log"
echo "=== End of logs ==="
fi
return 1
fi
}

# Function to conditionally start frontend based on mode
start_frontend_if_needed() {
# In Colab mode, always skip frontend
@@ -727,11 +792,34 @@ start_services() {

# Start frontend server (conditionally)
if [[ "$RF_MLFLOW_ENABLED" == "true" ]]; then
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
case "$RF_CONVERGE_MODE" in
none)
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
;;
backend|frontend|all)
if has_rapidfireai_pro; then
if start_converge; then
((services_started++))
else
print_error "Failed to start Converge"
fi
else
if [[ "$RF_CONVERGE_MODE" == "all" ]]; then
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
else
print_error "rapidfireai-pro is not installed (required for --converge=$RF_CONVERGE_MODE)"
fi
fi
;;
esac
else
print_status "⊗ Skipping frontend (use TensorBoard if in Colab mode)"
fi
@@ -794,7 +882,7 @@ main() {

# Show summary of all log files for debugging
print_status "=== Startup Failure Summary ==="
for log_file in "mlflow.log" "api.log" "frontend.log"; do
for log_file in "mlflow.log" "api.log" "frontend.log" "converge.log"; do
if [[ -f "$RF_LOG_PATH/$log_file" ]]; then
echo ""
print_status "=== $log_file ==="
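The interaction between `RF_CONVERGE_MODE` and the `rapidfireai-pro` check in `start_services` is easiest to see as a small decision sketch. The snippet below is illustrative only: it echoes the action each branch would take rather than starting anything, and uses `pip3` where the real script uses `${RF_PIP_EXECUTABLE}`:

```bash
#!/usr/bin/env bash
# Illustrative sketch of the frontend-selection logic added to start_services().
mode="${RF_CONVERGE_MODE:-all}"

if pip3 show rapidfireai-pro >/dev/null 2>&1; then pro=true; else pro=false; fi

case "$mode" in
  none)
    echo "start the original frontend (rapidfireai/frontend/server.py)" ;;
  all|backend|frontend)
    if $pro; then
      # start_converge appends the mode as a subcommand, except for "all"
      sub=""; [ "$mode" != "all" ] && sub="$mode"
      echo "run: converge start --force $sub   (logs -> \$RF_LOG_PATH/converge.log)"
    elif [ "$mode" = "all" ]; then
      echo "rapidfireai-pro not installed: fall back to the original frontend"
    else
      echo "error: rapidfireai-pro is required for --converge=$mode" >&2
    fi ;;
esac
```

On shutdown, `cleanup` now also kills the `converge start` process and its `uvicorn` worker, and `converge.log` is included in the startup-failure summary alongside `mlflow.log`, `api.log`, and `frontend.log`.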