diff --git a/README.md b/README.md index d6c3d2fa..473102a1 100644 --- a/README.md +++ b/README.md @@ -356,6 +356,7 @@ used to overwrite the defaults. - `RF_PID_FILE` - File to store process ids of started services (default: ${RF_HOME}/rapidfire_pids.txt) - `RF_PYTHON_EXECUTABLE` - Python executable (default: python3 falls back to python if not found) - `RF_PIP_EXECUTABLE` - pip executable (default: pip3 falls back to pip if not found) +- `RF_CONVERGE_MODE` - Whether to use Rapidfire AI Converge frontend and backend if available (default: all) ## Community & Governance diff --git a/rapidfireai/cli.py b/rapidfireai/cli.py index 677ece20..2d9cc27b 100644 --- a/rapidfireai/cli.py +++ b/rapidfireai/cli.py @@ -23,6 +23,7 @@ from .version import __version__ +RF_CONVERGE_MODE = os.getenv("RF_CONVERGE_MODE", "all") def get_script_path(): """Get the path to the start.sh script. @@ -456,6 +457,13 @@ def main(): parser.add_argument("--log-lines", type=int, default=10, help="Number of lines to log to the console") + parser.add_argument( + "--converge", + choices=["all", "none", "backend", "frontend"], + default=RF_CONVERGE_MODE, + help="Converge mode: all (default, start converge backend+frontend), none (use original frontend, do not start converge), backend (only converge backend), frontend (only converge frontend)", + ) + args = parser.parse_args() # Set environment variables from CLI args @@ -481,6 +489,9 @@ def main(): if args.force: os.environ["RF_FORCE"] = "true" + # Converge mode (all|none|backend|frontend) for start script + os.environ["RF_CONVERGE_MODE"] = args.converge + # Handle doctor command separately if args.command == "doctor": return run_doctor(args.log_lines) diff --git a/rapidfireai/utils/doctor.py b/rapidfireai/utils/doctor.py index c90b6612..a676cebc 100644 --- a/rapidfireai/utils/doctor.py +++ b/rapidfireai/utils/doctor.py @@ -62,6 +62,7 @@ def get_doctor_info(log_lines: int = 10): "mlflow", "torch", "transformers", + "protobuf", "flask", "gunicorn", 
"peft", @@ -78,6 +79,8 @@ def get_doctor_info(log_lines: int = 10): "langchain-openai", "langchain-huggingface", "langchain-classic", + "langchain-pinecone", + "langchain-postgres", "unstructured", "waitress", "vllm", diff --git a/setup/evals/requirements-colab.txt b/setup/evals/requirements-colab.txt index fc92e9ff..3aac9c1b 100644 --- a/setup/evals/requirements-colab.txt +++ b/setup/evals/requirements-colab.txt @@ -45,5 +45,5 @@ flask-cors>=5.0.1 # Logging loguru - -numpy==2.0.1 \ No newline at end of file +numpy==2.0.1 +protobuf<6.0.0 \ No newline at end of file diff --git a/setup/evals/requirements-local.txt b/setup/evals/requirements-local.txt index 771b6993..b64fa275 100644 --- a/setup/evals/requirements-local.txt +++ b/setup/evals/requirements-local.txt @@ -27,7 +27,6 @@ langchain-postgres>=0.0.17 # Data Manipulation & Display unstructured>=0.18.15 -numpy>=1.26.4,<2.3 # Other requests==2.32.5 @@ -41,3 +40,6 @@ mlflow>=3.2.0 gunicorn>=23.0.0 flask-cors>=5.0.1 loguru + +numpy==2.0.1 +protobuf<6.0.0 \ No newline at end of file diff --git a/setup/start.sh b/setup/start.sh index d838f5ab..7fdeecea 100755 --- a/setup/start.sh +++ b/setup/start.sh @@ -4,6 +4,7 @@ # This script starts MLflow server, API server, and frontend tracking server # Used for pip-installed package mode + set -e # Exit on any error # Configuration @@ -29,6 +30,20 @@ RF_LOG_PATH="${RF_LOG_PATH:=$RF_HOME/logs}" RF_TIMEOUT_TIME=${RF_TIMEOUT_TIME:=30} +# Converge mode: all (backend+frontend), none (original frontend only), backend, frontend +RF_CONVERGE_MODE=${RF_CONVERGE_MODE:=all} +case "$RF_CONVERGE_MODE" in + all|none|backend|frontend) ;; + *) + echo "Invalid RF_CONVERGE_MODE=$RF_CONVERGE_MODE (expected: all, none, backend, frontend)" + exit 1 + ;; +esac +RF_CONVERGE_BACKEND_HOST=${RF_CONVERGE_BACKEND_HOST:=0.0.0.0} +RF_CONVERGE_BACKEND_PORT=${RF_CONVERGE_BACKEND_PORT:=8860} +RF_CONVERGE_FRONTEND_HOST=${RF_CONVERGE_FRONTEND_HOST:=$RF_FRONTEND_HOST} 
+RF_CONVERGE_FRONTEND_PORT=${RF_CONVERGE_FRONTEND_PORT:=$RF_FRONTEND_PORT} + # Colab mode configuration if [ -z "${COLAB_GPU+x}" ]; then RF_MLFLOW_ENABLED=${RF_MLFLOW_ENABLED:=true} @@ -96,6 +111,11 @@ print_warning() { echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" } +# Return 0 if rapidfireai-pro pip package is installed +has_rapidfireai_pro() { + ${RF_PIP_EXECUTABLE} show rapidfireai-pro >/dev/null 2>&1 +} + # Function to setup Python environment setup_python_env() { print_status "Setting up Python environment..." @@ -182,6 +202,9 @@ cleanup() { pkill -f "gunicorn.*rapidfireai.$RAPIDFIRE_MODE.dispatcher" 2>/dev/null || true # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab) pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true + # Stop Converge if it was running + pkill -f "converge start" 2>/dev/null || true + pkill -f "uvicorn.*main:app" 2>/dev/null || true fi print_success "All services stopped" @@ -575,6 +598,81 @@ start_frontend() { return 0 } +# Function to start Converge via converge CLI (mode: all | backend | frontend) +start_converge() { + local mode="${1:-$RF_CONVERGE_MODE}" + print_status "Starting Converge ($mode)..." + + # converge start runs in the foreground with its own monitor loop, + # so we launch it in the background and track it like other services. + print_status "Converge logs will be written to: $RF_LOG_PATH/converge.log" + + local converge_args="--force" + case "$mode" in + all) ;; + backend) converge_args="$converge_args backend" ;; + frontend) converge_args="$converge_args frontend" ;; + *) converge_args="$converge_args" ;; + esac + + if command -v setsid &> /dev/null; then + setsid converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 & + else + nohup converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 & + fi + + local converge_pid=$! 
+ echo "$converge_pid Converge" >> "$RF_PID_FILE" + + # When starting full stack or backend, wait for the backend port; frontend-only may not serve it + if [[ "$mode" == "backend" ]] || [[ "$mode" == "all" ]]; then + if wait_for_service $RF_CONVERGE_BACKEND_HOST $RF_CONVERGE_BACKEND_PORT "Converge backend" $RF_TIMEOUT_TIME; then + print_success "Converge backend started (PID: $converge_pid)" + else + print_error "Converge backend failed to start. Checking logs..." + if [[ -f "$RF_LOG_PATH/converge.log" ]]; then + echo "=== Last 30 lines of converge.log ===" + tail -30 "$RF_LOG_PATH/converge.log" + echo "=== End of log ===" + if [[ -f "$RF_LOG_PATH/converge_backend.log" ]]; then + echo "=== Last 30 lines of converge_backend.log ===" + tail -30 "$RF_LOG_PATH/converge_backend.log" + echo "=== End of log ===" + else + echo "No converge_backend.log file found" + fi + else + echo "No converge.log file found" + fi + return 1 + fi + fi + + if [[ "$mode" == "frontend" ]] || [[ "$mode" == "all" ]]; then + if wait_for_service $RF_CONVERGE_FRONTEND_HOST $RF_CONVERGE_FRONTEND_PORT "Converge frontend" $RF_TIMEOUT_TIME; then + print_success "Converge frontend started (PID: $converge_pid)" + else + print_error "Converge frontend failed to start. Checking logs..." 
+ if [[ -f "$RF_LOG_PATH/converge.log" ]]; then + echo "=== Last 30 lines of converge.log ===" + tail -30 "$RF_LOG_PATH/converge.log" + echo "=== End of log ===" + if [[ -f "$RF_LOG_PATH/converge_frontend.log" ]]; then + echo "=== Last 30 lines of converge_frontend.log ===" + tail -30 "$RF_LOG_PATH/converge_frontend.log" + echo "=== End of log ===" + else + echo "No converge_frontend.log file found" + fi + else + echo "No converge.log file found" + fi + return 1 + fi + fi + return 0 +} + # Function to conditionally start frontend based on mode start_frontend_if_needed() { # In Colab mode, always skip frontend @@ -727,11 +825,34 @@ start_services() { # Start frontend server (conditionally) if [[ "$RF_MLFLOW_ENABLED" == "true" ]]; then - if start_frontend; then - ((services_started++)) - else - print_error "Failed to start frontend server" - fi + case "$RF_CONVERGE_MODE" in + none) + if start_frontend; then + ((services_started++)) + else + print_error "Failed to start frontend server" + fi + ;; + backend|frontend|all) + if has_rapidfireai_pro; then + if start_converge; then + ((services_started++)) + else + print_error "Failed to start Converge" + fi + else + if [[ "$RF_CONVERGE_MODE" == "all" ]]; then + if start_frontend; then + ((services_started++)) + else + print_error "Failed to start frontend server" + fi + else + print_error "rapidfireai-pro is not installed (required for --converge=$RF_CONVERGE_MODE)" + fi + fi + ;; + esac else print_status "⊗ Skipping frontend (use TensorBoard if in Colab mode)" fi @@ -794,7 +915,7 @@ main() { # Show summary of all log files for debugging print_status "=== Startup Failure Summary ===" - for log_file in "mlflow.log" "api.log" "frontend.log"; do + for log_file in "mlflow.log" "api.log" "frontend.log" "converge.log"; do if [[ -f "$RF_LOG_PATH/$log_file" ]]; then echo "" print_status "=== $log_file ==="