diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..8f3c207 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 2 + +[*.py] +indent_size = 4 + +[Makefile] +indent_style = tab + diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 0000000..03033b3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,66 @@ +name: Bug Report +description: Capture a production bug, defect, or regression with enough detail to reproduce and triage. +title: "[Bug]: " +labels: + - bug +body: + - type: textarea + id: summary + attributes: + label: Summary + description: Describe the bug in one paragraph. + placeholder: What broke, where, and how bad is it? + validations: + required: true + - type: dropdown + id: severity + attributes: + label: Severity + options: + - sev1 - production outage or data risk + - sev2 - major user-facing regression + - sev3 - limited defect or workaround exists + - sev4 - minor issue + validations: + required: true + - type: textarea + id: impact + attributes: + label: User Impact + description: State who is affected, how often, and what is blocked. + validations: + required: true + - type: textarea + id: reproduction + attributes: + label: Reproduction Steps + description: Provide exact steps, data, and environment. + placeholder: | + 1. Go to ... + 2. Submit ... + 3. Observe ... + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected Behavior + validations: + required: true + - type: textarea + id: actual + attributes: + label: Actual Behavior + validations: + required: true + - type: textarea + id: evidence + attributes: + label: Evidence + description: Logs, screenshots, traces, or failing commit range. + - type: input + id: rollback + attributes: + label: Rollback or Mitigation + description: Link the rollback, mitigation, or state why none exists. + diff --git a/.github/ISSUE_TEMPLATE/change-request.yml b/.github/ISSUE_TEMPLATE/change-request.yml new file mode 100644 index 0000000..2caa2bf --- /dev/null +++ b/.github/ISSUE_TEMPLATE/change-request.yml @@ -0,0 +1,42 @@ +name: Change Request +description: Formal request for an architectural, operational, or production-impacting change. +title: "[Change]: " +labels: + - change-control +body: + - type: textarea + id: scope + attributes: + label: Scope + description: Describe the exact system change. + validations: + required: true + - type: textarea + id: risk + attributes: + label: Risk Assessment + description: State the blast radius, failure modes, and user impact if the change goes wrong. + validations: + required: true + - type: textarea + id: verification + attributes: + label: Verification Plan + description: Tests, dashboards, smoke checks, and success criteria. + validations: + required: true + - type: textarea + id: rollback + attributes: + label: Rollback Plan + description: Exact rollback command, workflow, or environment action. + validations: + required: true + - type: textarea + id: approvals + attributes: + label: Required Approvals + description: Engineering, security, product, or operations signoff needed before release. + validations: + required: true + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..420ee24 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ +blank_issues_enabled: false +contact_links: + - name: Incident Runbooks + url: https://github.com/liteshperumalla/smart-tutor-ai-ai-driven-personalized-teaching-support/tree/main/docs/runbooks + about: Use the runbooks for live operational response before opening non-actionable incident issues. + diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 0000000..dab9a18 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,53 @@ +name: Feature Request +description: Capture a new capability with business context, acceptance criteria, and release expectations. +title: "[Feature]: " +labels: + - enhancement +body: + - type: textarea + id: problem + attributes: + label: Problem Statement + description: What user or business problem are we solving? + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed Change + description: Describe the intended behavior, not just implementation details. + validations: + required: true + - type: textarea + id: acceptance + attributes: + label: Acceptance Criteria + placeholder: | + - [ ] User can ... + - [ ] System records ... + - [ ] Rollback path is documented ... + validations: + required: true + - type: textarea + id: dependencies + attributes: + label: Dependencies and Constraints + description: APIs, secrets, migrations, data changes, compliance, or rollout blockers. + - type: dropdown + id: change_type + attributes: + label: Change Type + options: + - feature + - platform + - security + - reliability + - data + validations: + required: true + - type: input + id: target_release + attributes: + label: Target Release or Milestone + description: Sprint, release tag, or milestone. + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..ec7468c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,38 @@ +## Summary + +Describe the change in 2-5 lines. + +## Scope + +- Change type: +- User or system impact: +- Environments affected: + +## Requirements and Change Control + +- Linked issue or change request: +- ADR required: +- Data or schema change: +- Security impact: + +## Verification + +- [ ] Backend tests +- [ ] Frontend tests +- [ ] Contract or integration tests +- [ ] Migration validation +- [ ] Security checks +- [ ] Deployment or smoke verification + +Verification details: + +## Rollout and Rollback + +- Rollout plan: +- Rollback plan: +- Observability signals to watch: + +## Reviewer Notes + +Anything risky, incomplete, or intentionally deferred. + diff --git a/.github/actions/send-slack-notification/action.yml b/.github/actions/send-slack-notification/action.yml index a917684..1dcfde6 100644 --- a/.github/actions/send-slack-notification/action.yml +++ b/.github/actions/send-slack-notification/action.yml @@ -49,8 +49,17 @@ runs: - name: Set status metadata id: meta shell: bash + env: + INPUT_STATUS: ${{ inputs.status }} + INPUT_RUN_URL: ${{ inputs.run_url }} + INPUT_ACTOR: ${{ inputs.actor }} + INPUT_VERBIAGE: ${{ inputs.verbiage }} + INPUT_WORKFLOW_NAME: ${{ inputs.workflow_name }} + GH_REPOSITORY: ${{ github.repository }} + GH_RUN_ID: ${{ github.run_id }} + GH_ACTOR: ${{ github.actor }} run: | - case "${{ inputs.status }}" in + case "${INPUT_STATUS}" in success) echo "emoji=:white_check_mark:" >> "$GITHUB_OUTPUT" echo "color=#2EB67D" >> "$GITHUB_OUTPUT" echo "status_label=SUCCESS" >> "$GITHUB_OUTPUT" ;; @@ -62,21 +71,21 @@ runs: echo "status_label=WARNING" >> "$GITHUB_OUTPUT" ;; esac - run_url="${{ inputs.run_url }}" + run_url="${INPUT_RUN_URL}" if [ -z "${run_url}" ]; then - run_url="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + run_url="https://github.com/${GH_REPOSITORY}/actions/runs/${GH_RUN_ID}" fi echo "run_url=${run_url}" >> "$GITHUB_OUTPUT" - actor="${{ inputs.actor }}" + actor="${INPUT_ACTOR}" if [ -z "${actor}" ]; then - actor="${{ github.actor }}" + actor="${GH_ACTOR}" fi echo "actor=${actor}" >> "$GITHUB_OUTPUT" - verbiage="${{ inputs.verbiage }}" + verbiage="${INPUT_VERBIAGE}" if [ -z "${verbiage}" ]; then - verbiage="${{ inputs.workflow_name }}" + verbiage="${INPUT_WORKFLOW_NAME}" fi echo "verbiage=${verbiage}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_slack-notify.yml b/.github/workflows/_slack-notify.yml index 55a8fff..17ca1ba 100644 --- a/.github/workflows/_slack-notify.yml +++ b/.github/workflows/_slack-notify.yml @@ -55,8 +55,17 @@ jobs: steps: - name: Set status metadata id: meta + env: + INPUT_STATUS: ${{ inputs.status }} + INPUT_RUN_URL: ${{ inputs.run_url }} + INPUT_ACTOR: ${{ inputs.actor }} + INPUT_VERBIAGE: ${{ inputs.verbiage }} + INPUT_WORKFLOW_NAME: ${{ inputs.workflow_name }} + GH_REPOSITORY: ${{ github.repository }} + GH_RUN_ID: ${{ github.run_id }} + GH_ACTOR: ${{ github.actor }} run: | - case "${{ inputs.status }}" in + case "${INPUT_STATUS}" in success) echo "emoji=:white_check_mark:" >> "$GITHUB_OUTPUT" echo "color=#2EB67D" >> "$GITHUB_OUTPUT" echo "status_label=SUCCESS" >> "$GITHUB_OUTPUT" ;; @@ -68,21 +77,21 @@ jobs: echo "status_label=WARNING" >> "$GITHUB_OUTPUT" ;; esac - run_url="${{ inputs.run_url }}" + run_url="${INPUT_RUN_URL}" if [ -z "${run_url}" ]; then - run_url="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + run_url="https://github.com/${GH_REPOSITORY}/actions/runs/${GH_RUN_ID}" fi echo "run_url=${run_url}" >> "$GITHUB_OUTPUT" - actor="${{ inputs.actor }}" + actor="${INPUT_ACTOR}" if [ -z "${actor}" ]; then - actor="${{ github.actor }}" + actor="${GH_ACTOR}" fi echo "actor=${actor}" >> "$GITHUB_OUTPUT" - verbiage="${{ inputs.verbiage }}" + verbiage="${INPUT_VERBIAGE}" if [ -z "${verbiage}" ]; then - verbiage="${{ inputs.workflow_name }}" + verbiage="${INPUT_WORKFLOW_NAME}" fi echo "verbiage=${verbiage}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml index 696ac76..b6c449f 100644 --- a/.github/workflows/ci-cd.yml +++ b/.github/workflows/ci-cd.yml @@ -11,7 +11,7 @@ on: env: PYTHON_VERSION: "3.11" - NODE_VERSION: "20.9.0" + NODE_VERSION: "20.20.2" REGISTRY: ghcr.io IMAGE_NAME: liteshperumalla/smart-tutor-ai-ai-driven-personalized-teaching-support VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }} @@ -92,6 +92,192 @@ jobs: if-no-files-found: warn retention-days: 30 + environment-discipline: + name: Environment Discipline + runs-on: ubuntu-latest + permissions: + contents: read + needs: [secret-scanning] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Validate environment contracts + run: | + set -euo pipefail + python scripts/check-env-drift.py + + sast: + name: SAST + runs-on: ubuntu-latest + timeout-minutes: 20 + permissions: + contents: read + security-events: write + needs: [secret-scanning] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + + - name: Install Semgrep + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install semgrep + + - name: Run Semgrep SAST scan + run: | + set -euo pipefail + semgrep scan \ + --config p/owasp-top-ten \ + --config p/python \ + --config p/typescript \ + --exclude node_modules \ + --exclude .next \ + --exclude coverage \ + --sarif \ + --output semgrep.sarif \ + . + semgrep scan \ + --config p/owasp-top-ten \ + --config p/python \ + --config p/typescript \ + --exclude node_modules \ + --exclude .next \ + --exclude coverage \ + --severity ERROR \ + --error \ + . + + - name: Upload Semgrep SARIF + if: always() + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: semgrep.sarif + category: semgrep-sast + + - name: Upload SAST artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: sast-report + path: semgrep.sarif + if-no-files-found: warn + retention-days: 30 + + dast: + name: DAST + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + needs: [backend-tests] + env: + ENVIRONMENT: development + SECRETS_PROVIDER: env + STORAGE_BACKEND: filesystem + USE_REDIS_CACHE: "false" + REDIS_SSL: "false" + AGENT_SYSTEM_ENABLED: "false" + WARMUP_ENABLED: "false" + REPRODUCIBILITY_ENABLED: "false" + LANGFUSE_ENABLED: "false" + OTEL_ENABLED: "false" + CORS_ALLOW_LOCALHOST: "true" + NEXT_PUBLIC_API_BASE_URL: http://127.0.0.1:8010 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install backend dependencies + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install -r backend/requirements.txt + + - name: Start backend locally for DAST + run: | + set -euo pipefail + mkdir -p logs + nohup python -m uvicorn backend.api.main:app --host 127.0.0.1 --port 8010 > logs/dast-backend.log 2>&1 & + echo $! > /tmp/dast-backend.pid + + - name: Wait for backend readiness + run: | + set -euo pipefail + for _ in $(seq 1 60); do + if curl -fsS http://127.0.0.1:8010/ready > /dev/null; then + exit 0 + fi + sleep 2 + done + echo "::error::Backend did not become ready for DAST." + tail -n 200 logs/dast-backend.log || true + exit 1 + + - name: Run OWASP ZAP baseline scan + run: | + set -euo pipefail + mkdir -p "${RUNNER_TEMP}/zap" + chmod 777 "${RUNNER_TEMP}/zap" + printf '10049\tIGNORE\tAccepted informational cacheability finding for public discovery routes\n' > "${RUNNER_TEMP}/zap/baseline.conf" + set +e + docker run --rm --network host \ + -v "${RUNNER_TEMP}/zap:/zap/wrk:rw" \ + ghcr.io/zaproxy/zaproxy:stable \ + zap-baseline.py \ + -t http://127.0.0.1:8010 \ + -c baseline.conf \ + -r zap-report.html \ + -J zap-report.json \ + -w zap-warnings.md \ + -m 3 + zap_exit_code=$? + set -e + cp "${RUNNER_TEMP}/zap"/zap-report.html "${GITHUB_WORKSPACE}/zap-report.html" + cp "${RUNNER_TEMP}/zap"/zap-report.json "${GITHUB_WORKSPACE}/zap-report.json" + cp "${RUNNER_TEMP}/zap"/zap-warnings.md "${GITHUB_WORKSPACE}/zap-warnings.md" + exit "${zap_exit_code}" + + - name: Upload DAST artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: dast-report + path: | + zap-report.html + zap-report.json + zap-warnings.md + logs/dast-backend.log + if-no-files-found: warn + retention-days: 30 + + - name: Stop backend + if: always() + run: | + set -euo pipefail + if [ -f /tmp/dast-backend.pid ]; then + kill "$(cat /tmp/dast-backend.pid)" || true + fi + backend-tests: name: Backend Tests runs-on: ubuntu-latest @@ -223,6 +409,56 @@ jobs: print("- `pytest backend/tests/ -v --cov=backend --cov-report=xml --cov-report=term-missing --junitxml=backend/junit.xml --maxfail=5`") PY + migration-validation: + name: Migration Validation + runs-on: ubuntu-latest + permissions: + contents: read + needs: [secret-scanning] + services: + postgres: + image: postgres:15-alpine + env: + POSTGRES_DB: migration_validation + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: ["5432:5432"] + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + POSTGRES_HOST: 127.0.0.1 + POSTGRES_PORT: 5432 + POSTGRES_DB: migration_validation + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_SSL_MODE: disable + SECRETS_PROVIDER: env + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install backend dependencies + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install -r backend/requirements.txt + + - name: Validate migrations + run: | + set -euo pipefail + chmod +x scripts/validate-migrations.sh + ./scripts/validate-migrations.sh + frontend-tests: name: Frontend Tests runs-on: ubuntu-latest @@ -329,3 +565,133 @@ jobs: console.log('- `npm test -- --coverage --watchAll=false --passWithNoTests --maxWorkers=2`'); console.log('- `npm run build`'); NODE + + contract-tests: + name: Contract Tests + runs-on: ubuntu-latest + permissions: + contents: read + needs: [secret-scanning] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: npm + cache-dependency-path: frontend/package-lock.json + + - name: Install frontend dependencies + run: | + set -euo pipefail + cd frontend + npm ci --legacy-peer-deps + + - name: Run proxy contract tests + run: | + set -euo pipefail + cd frontend + npm test -- --runInBand --testPathPattern route-contract --watchAll=false + + security-management: + name: Security Management + runs-on: ubuntu-latest + if: always() + permissions: + contents: read + needs: + - secret-scanning + - dependency-scanning + - sast + - dast + - backend-tests + - frontend-tests + steps: + - name: Publish security posture summary + run: | + set -euo pipefail + + security_failed=0 + for result in \ + "${{ needs.secret-scanning.result }}" \ + "${{ needs.dependency-scanning.result }}" \ + "${{ needs.sast.result }}" \ + "${{ needs.dast.result }}"; do + if [ "${result}" = "failure" ] || [ "${result}" = "cancelled" ]; then + security_failed=1 + fi + done + + { + echo "## Security Management Summary" + echo "" + echo "| Control | Status |" + echo "|---------|--------|" + echo "| Secret scanning | \`${{ needs.secret-scanning.result }}\` |" + echo "| Dependency scanning | \`${{ needs.dependency-scanning.result }}\` |" + echo "| SAST | \`${{ needs.sast.result }}\` |" + echo "| DAST | \`${{ needs.dast.result }}\` |" + echo "| Backend tests | \`${{ needs.backend-tests.result }}\` |" + echo "| Frontend tests | \`${{ needs.frontend-tests.result }}\` |" + echo "" + echo "Artifacts generated by CI security controls:" + echo "- Secret scan and dependency scan reports" + echo "- Semgrep SARIF uploaded to GitHub Security" + echo "- OWASP ZAP baseline HTML/JSON reports" + } >> "$GITHUB_STEP_SUMMARY" + + if [ "${security_failed}" -ne 0 ]; then + echo "::error::One or more security controls failed in CI." + exit 1 + fi + + delivery-readiness: + name: Delivery Readiness + runs-on: ubuntu-latest + if: always() + permissions: + contents: read + needs: + - backend-tests + - frontend-tests + - contract-tests + - migration-validation + - environment-discipline + - security-management + steps: + - name: Publish delivery summary + run: | + set -euo pipefail + + delivery_failed=0 + for result in \ + "${{ needs.backend-tests.result }}" \ + "${{ needs.frontend-tests.result }}" \ + "${{ needs.contract-tests.result }}" \ + "${{ needs.migration-validation.result }}" \ + "${{ needs.environment-discipline.result }}" \ + "${{ needs.security-management.result }}"; do + if [ "${result}" = "failure" ] || [ "${result}" = "cancelled" ]; then + delivery_failed=1 + fi + done + + { + echo "## Delivery Readiness Summary" + echo "" + echo "| Gate | Status |" + echo "|------|--------|" + echo "| Backend tests | \`${{ needs.backend-tests.result }}\` |" + echo "| Frontend tests | \`${{ needs.frontend-tests.result }}\` |" + echo "| Contract tests | \`${{ needs.contract-tests.result }}\` |" + echo "| Migration validation | \`${{ needs.migration-validation.result }}\` |" + echo "| Environment discipline | \`${{ needs.environment-discipline.result }}\` |" + echo "| Security management | \`${{ needs.security-management.result }}\` |" + } >> "$GITHUB_STEP_SUMMARY" + + if [ "${delivery_failed}" -ne 0 ]; then + echo "::error::Delivery readiness gates did not pass." + exit 1 + fi diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..f27df2d7 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,48 @@ +name: CodeQL + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + schedule: + - cron: "30 3 * * 1" + workflow_dispatch: + +permissions: + actions: read + contents: read + security-events: write + +concurrency: + group: codeql-${{ github.ref }} + cancel-in-progress: true + +jobs: + analyze: + name: CodeQL Analyze (${{ matrix.language }}) + runs-on: ubuntu-latest + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + - language: javascript-typescript + build-mode: none + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + + - name: Perform CodeQL analysis + uses: github/codeql-action/analyze@v3 + with: + category: /language:${{ matrix.language }} diff --git a/.github/workflows/coderabbit.yml b/.github/workflows/coderabbit.yml index a07739b..0faee67 100644 --- a/.github/workflows/coderabbit.yml +++ b/.github/workflows/coderabbit.yml @@ -46,10 +46,14 @@ jobs: - name: Resolve PR metadata id: pr_meta + env: + PR_NUMBER_INPUT: ${{ github.event.pull_request.number || inputs.pr_number || '0' }} + BASE_REF_INPUT: ${{ github.event.pull_request.base.ref || '' }} + HEAD_REF_INPUT: ${{ github.event.pull_request.head.ref || '' }} run: | - echo "pr_number=${{ github.event.pull_request.number || inputs.pr_number || '0' }}" >> "$GITHUB_OUTPUT" - echo "base_ref=${{ github.event.pull_request.base.ref || '' }}" >> "$GITHUB_OUTPUT" - echo "head_ref=${{ github.event.pull_request.head.ref || '' }}" >> "$GITHUB_OUTPUT" + echo "pr_number=${PR_NUMBER_INPUT}" >> "$GITHUB_OUTPUT" + echo "base_ref=${BASE_REF_INPUT}" >> "$GITHUB_OUTPUT" + echo "head_ref=${HEAD_REF_INPUT}" >> "$GITHUB_OUTPUT" - name: Get PR diff id: diff diff --git a/.github/workflows/database-migrations.yml b/.github/workflows/database-migrations.yml index b9ca054..c95145d 100644 --- a/.github/workflows/database-migrations.yml +++ b/.github/workflows/database-migrations.yml @@ -83,10 +83,11 @@ jobs: id: strategy env: SECRETS_PROVIDER: env + REQUESTED_STRATEGY: ${{ inputs.strategy }} run: | set -euo pipefail - requested="${{ inputs.strategy }}" + requested="${REQUESTED_STRATEGY}" if [ "$requested" = "auto" ]; then if [ -f alembic.ini ] || [ -f backend/alembic.ini ]; then resolved="alembic" @@ -146,20 +147,24 @@ jobs: - name: Migration summary if: always() + env: + SUMMARY_ENVIRONMENT: ${{ inputs.environment_name }} + SUMMARY_GIT_REF: ${{ inputs.git_ref }} + SUMMARY_REQUESTED_STRATEGY: ${{ inputs.strategy }} + SUMMARY_RESOLVED_STRATEGY: ${{ steps.strategy.outputs.resolved }} run: | { echo "## Database Migration Summary" echo "" echo "| Item | Value |" echo "|------|-------|" - echo "| Environment | \`${{ inputs.environment_name }}\` |" - echo "| Git ref | \`${{ inputs.git_ref }}\` |" - echo "| Requested strategy | \`${{ inputs.strategy }}\` |" - echo "| Resolved strategy | \`${{ steps.strategy.outputs.resolved }}\` |" - echo "| Database host | \`${{ env.POSTGRES_HOST }}:${{ env.POSTGRES_PORT }}\` |" - echo "| Database name | \`${{ env.POSTGRES_DB }}\` |" + echo "| Environment | \`${SUMMARY_ENVIRONMENT}\` |" + echo "| Git ref | \`${SUMMARY_GIT_REF}\` |" + echo "| Requested strategy | \`${SUMMARY_REQUESTED_STRATEGY}\` |" + echo "| Resolved strategy | \`${SUMMARY_RESOLVED_STRATEGY}\` |" echo "" echo "Notes:" + echo "- Database connection details are intentionally omitted from this summary because they come from masked environment secrets." echo "- \`alembic\` is the default and runs \`alembic -c alembic.ini upgrade head\`." echo "- \`schema-init\` is the legacy fallback and now delegates to Alembic migrations." } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/deploy-production.yml b/.github/workflows/deploy-production.yml index 0968269..bdab407 100644 --- a/.github/workflows/deploy-production.yml +++ b/.github/workflows/deploy-production.yml @@ -53,15 +53,20 @@ jobs: steps: - name: Resolve deployment target id: target + env: + EVENT_NAME: ${{ github.event_name }} + WORKFLOW_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} + WORKFLOW_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + INPUT_GIT_REF: ${{ inputs.git_ref }} run: | set -euo pipefail - if [ "${{ github.event_name }}" = "workflow_run" ]; then - checkout_ref="${{ github.event.workflow_run.head_sha }}" - deploy_ref="${{ github.event.workflow_run.head_branch }}" + if [ "${EVENT_NAME}" = "workflow_run" ]; then + checkout_ref="${WORKFLOW_HEAD_SHA}" + deploy_ref="${WORKFLOW_HEAD_BRANCH}" else - checkout_ref="${{ inputs.git_ref }}" - deploy_ref="${{ inputs.git_ref }}" + checkout_ref="${INPUT_GIT_REF}" + deploy_ref="${INPUT_GIT_REF}" fi case "${deploy_ref}" in diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 7ebe982..afcdba0 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -49,15 +49,20 @@ jobs: steps: - name: Resolve deployment target id: target + env: + EVENT_NAME: ${{ github.event_name }} + WORKFLOW_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} + WORKFLOW_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + INPUT_GIT_REF: ${{ inputs.git_ref }} run: | set -euo pipefail - if [ "${{ github.event_name }}" = "workflow_run" ]; then - checkout_ref="${{ github.event.workflow_run.head_sha }}" - deploy_ref="${{ github.event.workflow_run.head_branch }}" + if [ "${EVENT_NAME}" = "workflow_run" ]; then + checkout_ref="${WORKFLOW_HEAD_SHA}" + deploy_ref="${WORKFLOW_HEAD_BRANCH}" else - checkout_ref="${{ inputs.git_ref }}" - deploy_ref="${{ inputs.git_ref }}" + checkout_ref="${INPUT_GIT_REF}" + deploy_ref="${INPUT_GIT_REF}" fi echo "checkout_ref=${checkout_ref}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/rollback-production.yml b/.github/workflows/rollback-production.yml index 7d1a1e7..619d250 100644 --- a/.github/workflows/rollback-production.yml +++ b/.github/workflows/rollback-production.yml @@ -43,13 +43,16 @@ jobs: - name: Resolve rollback configuration id: config + env: + INPUT_BACKEND_IMAGE_TAG: ${{ inputs.backend_image_tag }} + INPUT_UPDATE_GITOPS_MANIFEST: ${{ inputs.update_gitops_manifest }} run: | set -euo pipefail - backend_image="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-backend:${{ inputs.backend_image_tag }}" - archive_name="backend-rollback-${{ inputs.backend_image_tag }}.tar.gz" + backend_image="${REGISTRY}/${IMAGE_NAME}-backend:${INPUT_BACKEND_IMAGE_TAG}" + archive_name="backend-rollback-${INPUT_BACKEND_IMAGE_TAG}.tar.gz" - case "${{ inputs.update_gitops_manifest }}" in + case "${INPUT_UPDATE_GITOPS_MANIFEST}" in yes|YES|true|TRUE|1) update_gitops=true ;; @@ -269,6 +272,7 @@ jobs: if: ${{ steps.config.outputs.update_gitops == 'true' }} env: VERSION: ${{ inputs.backend_image_tag }} + COMMIT_MESSAGE: "chore(gitops): roll back production backend to ${{ inputs.backend_image_tag }}" run: | set -euo pipefail python - <<'PY' @@ -300,20 +304,26 @@ jobs: git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" git add helm/smart-ai-tutor/values-production.yaml - git commit -m "chore(gitops): roll back production backend to ${{ inputs.backend_image_tag }}" + git commit -m "${COMMIT_MESSAGE}" git push - name: Rollback summary if: always() + env: + SUMMARY_BACKEND_IMAGE_TAG: ${{ inputs.backend_image_tag }} + SUMMARY_REASON: ${{ inputs.reason }} + SUMMARY_TRANSFER_OUTCOME: ${{ steps.transfer_backend_image.outcome }} + SUMMARY_ROLLBACK_OUTCOME: ${{ steps.rollback_backend.outcome }} + SUMMARY_GITOPS_UPDATE: ${{ steps.config.outputs.update_gitops }} run: | { echo "## Production Rollback Summary" echo "" echo "| Item | Value |" echo "|------|-------|" - echo "| Backend image tag | \`${{ inputs.backend_image_tag }}\` |" - echo "| Reason | \`${{ inputs.reason }}\` |" - echo "| Backend bundle transfer | \`${{ steps.transfer_backend_image.outcome }}\` |" - echo "| Backend rollback | \`${{ steps.rollback_backend.outcome }}\` |" - echo "| GitOps update | \`${{ steps.config.outputs.update_gitops }}\` |" + echo "| Backend image tag | \`${SUMMARY_BACKEND_IMAGE_TAG}\` |" + echo "| Reason | \`${SUMMARY_REASON}\` |" + echo "| Backend bundle transfer | \`${SUMMARY_TRANSFER_OUTCOME}\` |" + echo "| Backend rollback | \`${SUMMARY_ROLLBACK_OUTCOME}\` |" + echo "| GitOps update | \`${SUMMARY_GITOPS_UPDATE}\` |" } >> "$GITHUB_STEP_SUMMARY" diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 0000000..ccc4c6c --- /dev/null +++ b/.nvmrc @@ -0,0 +1 @@ +20.20.2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6907be0..e93de6f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,3 +61,10 @@ repos: language: system pass_filenames: false stages: [push] + + - id: env-contract-drift + name: Check environment contract drift + entry: python scripts/check-env-drift.py + language: system + files: '(^|/)\.env(\.(example|local(\.example)?|prod|production))?$|(^|/)frontend/\.env(\.local(\.example)?)?$' + pass_filenames: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d9c3b55 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing + +## Required Flow + +1. Start with a GitHub issue or change request. +2. Document architectural or operational decisions in `docs/adr/` when the change affects contracts, data, rollout, or infrastructure. +3. Keep changes reviewable. Large work should land behind flags or in staged slices. +4. Every PR must include verification and rollback notes. + +## Definition Of Done + +- Requirements and acceptance criteria are linked in the PR. +- Tests were added or updated at the right layer. +- Security, migration, and environment impact were evaluated. +- Rollout and rollback are clear. +- Docs and runbooks were updated if production behavior changed. + +## Local Workflow + +- Install hooks: `make bootstrap` +- Backend checks: `make backend` +- Frontend checks: `make frontend` +- Contract checks: `make contract` +- Migration validation: `make migrate-validate` +- Environment drift check: `make env-check` + +## Review Expectations + +- CODEOWNERS review is expected for touched areas. +- Schema, auth, secrets, and deployment changes require explicit reviewer attention. +- PRs without linked issues, verification, or rollback notes should not merge. + diff --git a/Dockerfile b/Dockerfile index 3c12901..f65cfe0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ tesseract-ocr \ && rm -rf /var/lib/apt/lists/* +RUN groupadd --system appuser \ + && useradd --system --gid appuser --create-home --home-dir /home/appuser appuser + COPY requirements.txt /tmp/requirements.txt RUN python -m pip install --upgrade pip \ && pip install --no-cache-dir -r /tmp/requirements.txt @@ -30,7 +33,10 @@ RUN python -m pip install --upgrade pip \ COPY . /app COPY docker/entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh +RUN chmod +x /entrypoint.sh \ + && chown -R appuser:appuser /app /entrypoint.sh + +USER appuser EXPOSE 8000 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c55120f --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +SHELL := /bin/bash + +.PHONY: bootstrap test backend frontend lint typecheck security contract migrate-validate env-check + +bootstrap: + pre-commit install + +test: + ./scripts/test.sh + +backend: + ./scripts/test.sh --backend + +frontend: + ./scripts/test.sh --frontend + +lint: + ./scripts/test.sh --lint + +typecheck: + ./scripts/test.sh --typecheck + +security: + python scripts/verify_security.py + +contract: + cd frontend && npm test -- --runInBand --testPathPattern route-contract + +migrate-validate: + ./scripts/validate-migrations.sh + +env-check: + python scripts/check-env-drift.py diff --git a/backend/alembic/env.py b/backend/alembic/env.py index 35a9c2a..e2f5ad4 100644 --- a/backend/alembic/env.py +++ b/backend/alembic/env.py @@ -1,5 +1,4 @@ from logging.config import fileConfig -import tempfile import os from alembic import context @@ -14,19 +13,11 @@ if config.config_file_name is not None: fileConfig(config.config_file_name) -# db_url override via alembic -x db_url=... if supported (alembic >= 1.13) -# Otherwise build URL directly bypassing ConfigParser which treats % as -# interpolation operators — corrupting passwords like SmartTutor2025!SecurePass _db_url = os.environ.get("ALEMBIC_DB_URL") or config.get_main_option("db_url") if _db_url: - # Write URL to a temp ini-style file to avoid ConfigParser - # interpreting % chars in passwords - _tmp_fd, _tmp_path = tempfile.mkstemp(suffix=".ini") - with os.fdopen(_tmp_fd, "w") as _f: - _f.write(f"[alembic]\nsqlalchemy.url = {_db_url}\n") - # Override config file so alembic reads our clean URL file - config.config_file_name = _tmp_path - config.set_main_option = lambda _k, _v: None # NOOP after init + # Alembic stores config in a ConfigParser, so `%` must be escaped before + # overriding sqlalchemy.url from environment-driven CI/database targets. + config.set_main_option("sqlalchemy.url", _db_url.replace("%", "%%")) else: config.set_main_option("sqlalchemy.url", build_postgres_url_string()) diff --git a/backend/api/main.py b/backend/api/main.py index 2a67f4c..d478ad4 100644 --- a/backend/api/main.py +++ b/backend/api/main.py @@ -16,6 +16,7 @@ from backend.rate_limiter import limiter # Import from rate_limiter to avoid circular imports logger = logging.getLogger(__name__) +PUBLIC_CACHEABLE_PATHS = {"/", "/robots.txt", "/sitemap.xml"} app = FastAPI( title="Smart AI Tutor API", @@ -110,6 +111,17 @@ async def add_security_headers(request: Request, call_next): response.headers["X-Frame-Options"] = "DENY" response.headers["X-XSS-Protection"] = "1; mode=block" response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin" + response.headers["Cross-Origin-Resource-Policy"] = "same-origin" + response.headers["Cross-Origin-Opener-Policy"] = "same-origin" + response.headers["Cross-Origin-Embedder-Policy"] = "require-corp" + + if request.method == "GET" and request.url.path in PUBLIC_CACHEABLE_PATHS: + response.headers["Cache-Control"] = "public, max-age=300" + if "Pragma" in response.headers: + del response.headers["Pragma"] + else: + response.headers["Cache-Control"] = "no-store, max-age=0" + response.headers["Pragma"] = "no-cache" # Content Security Policy - relaxed for API-only backend if config.ENVIRONMENT == "production": @@ -160,6 +172,27 @@ async def root(): return {"message": "Smart AI Tutor API", "version": "1.0.0"} +@app.get("/robots.txt") +async def robots_txt(): + return Response( + content="User-agent: *\nAllow: /\n", + media_type="text/plain", + ) + + +@app.get("/sitemap.xml") +async def sitemap(): + return Response( + content=( + '' + '' + "https://smart-ai-tutor.local/" + "" + ), + media_type="application/xml", + ) + + @app.get("/health") @limiter.limit("10/minute") # Rate limit health checks async def health_check(request: Request): diff --git a/backend/jwt_blacklist.py b/backend/jwt_blacklist.py index 65941cf..f3232c5 100644 --- a/backend/jwt_blacklist.py +++ b/backend/jwt_blacklist.py @@ -7,6 +7,9 @@ import logging from typing import Optional from datetime import datetime, timedelta +import base64 +import binascii +import json logger = logging.getLogger(__name__) @@ -122,7 +125,7 @@ def is_blacklisted(self, token: str) -> bool: def _get_jti(self, token: str) -> Optional[str]: """ - Extract JTI (JWT ID) from token without verification + Extract JTI (JWT ID) from the token payload without using JWT decode. Args: token: The JWT token @@ -131,10 +134,9 @@ def _get_jti(self, token: str) -> Optional[str]: Optional[str]: The JTI claim if present, None otherwise """ try: - # Decode without verification to get the JTI - payload = jwt.decode(token, options={"verify_signature": False}) + payload = _decode_unverified_payload(token) return payload.get("jti") - except Exception as e: + except (ValueError, json.JSONDecodeError, binascii.Error, UnicodeDecodeError) as e: logger.error(f"Failed to extract JTI from token: {e}") return None @@ -189,6 +191,22 @@ def get_stats(self) -> dict: _jwt_blacklist: Optional[JWTBlacklist] = None +def _decode_unverified_payload(token: str) -> dict: + """Parse the JWT payload segment without trusting it for authentication.""" + + parts = token.split(".") + if len(parts) != 3: + raise ValueError("Invalid JWT format") + + payload_segment = parts[1] + padding = "=" * (-len(payload_segment) % 4) + decoded = base64.urlsafe_b64decode(payload_segment + padding) + payload = json.loads(decoded.decode("utf-8")) + if not isinstance(payload, dict): + raise ValueError("Invalid JWT payload") + return payload + + def get_jwt_blacklist() -> Optional[JWTBlacklist]: """Get the global JWT blacklist instance""" return _jwt_blacklist diff --git a/backend/jwt_service.py b/backend/jwt_service.py index d226283..29ef2b6 100644 --- a/backend/jwt_service.py +++ b/backend/jwt_service.py @@ -10,6 +10,8 @@ from jwt.exceptions import PyJWTError as JWTError from pathlib import Path import uuid +import base64 +import json from .config import config from .logger import get_logger from .exceptions import SessionExpiredError @@ -233,17 +235,11 @@ def get_token_expiry(self, token: str) -> Optional[datetime]: Expiration datetime or None if invalid """ try: - verification_key = self._get_verification_key() - payload = jwt.decode( - token, - options={"verify_signature": False, "verify_exp": False}, - algorithms=[self.algorithm], - key="" - ) + payload = _decode_unverified_payload(token) exp = payload.get("exp") if exp: return datetime.fromtimestamp(exp, tz=timezone.utc) - except JWTError as exc: + except (JWTError, ValueError, TypeError, json.JSONDecodeError) as exc: logger.debug(f"Failed to decode token expiry: {exc}") return None @@ -267,6 +263,22 @@ def is_token_expired(self, token: str) -> bool: _jwt_service = None +def _decode_unverified_payload(token: str) -> Dict[str, Any]: + """Parse the JWT payload segment without trusting it for authentication.""" + + parts = token.split(".") + if len(parts) != 3: + raise ValueError("Invalid JWT format") + + payload_segment = parts[1] + padding = "=" * (-len(payload_segment) % 4) + decoded = base64.urlsafe_b64decode(payload_segment + padding) + payload = json.loads(decoded.decode("utf-8")) + if not isinstance(payload, dict): + raise ValueError("Invalid JWT payload") + return payload + + def get_jwt_service() -> JWTService: """Get singleton JWT service instance""" global _jwt_service diff --git a/backend/tests/test_security.py b/backend/tests/test_security.py index 2b16553..c094058 100644 --- a/backend/tests/test_security.py +++ b/backend/tests/test_security.py @@ -80,3 +80,44 @@ def test_protected_route_malformed_auth_header(self, test_client): """Malformed Authorization header must return 401, not 500""" response = test_client.get("/chat/sessions", headers={"Authorization": "NotBearer token"}) assert response.status_code == 401 + + +class TestSecurityHeaders: + """Security headers must be applied consistently to backend responses.""" + + def test_root_includes_public_cache_and_corp_headers(self, test_client): + response = test_client.get("/") + + assert response.status_code == 200 + assert response.headers["cross-origin-resource-policy"] == "same-origin" + assert response.headers["cross-origin-opener-policy"] == "same-origin" + assert response.headers["cross-origin-embedder-policy"] == "require-corp" + assert response.headers["cache-control"] == "public, max-age=300" + + def test_public_discovery_routes_are_cacheable(self, test_client): + robots = test_client.get("/robots.txt") + sitemap = test_client.get("/sitemap.xml") + + assert robots.status_code == 200 + assert robots.headers["cross-origin-resource-policy"] == "same-origin" + assert robots.headers["cross-origin-opener-policy"] == "same-origin" + assert robots.headers["cross-origin-embedder-policy"] == "require-corp" + assert robots.headers["cache-control"] == "public, max-age=300" + + assert sitemap.status_code == 200 + assert sitemap.headers["cross-origin-resource-policy"] == "same-origin" + assert sitemap.headers["cross-origin-opener-policy"] == "same-origin" + assert sitemap.headers["cross-origin-embedder-policy"] == "require-corp" + assert sitemap.headers["cache-control"] == "public, max-age=300" + + def test_dynamic_not_found_includes_no_store_headers(self, test_client): + response = test_client.get("/robots.txt") + + response = test_client.get("/missing") + + assert response.status_code == 404 + assert response.headers["cross-origin-resource-policy"] == "same-origin" + assert response.headers["cross-origin-opener-policy"] == "same-origin" + assert response.headers["cross-origin-embedder-policy"] == "require-corp" + assert response.headers["cache-control"] == "no-store, max-age=0" + assert response.headers["pragma"] == "no-cache" diff --git a/docs/adr/0000-template.md b/docs/adr/0000-template.md new file mode 100644 index 0000000..a93750d --- /dev/null +++ b/docs/adr/0000-template.md @@ -0,0 +1,26 @@ +# ADR 0000: Title + +- Status: Proposed +- Date: YYYY-MM-DD +- Owners: team or individual + +## Context + +What problem or constraint forced the decision? + +## Decision + +What are we doing? + +## Consequences + +What improves, what gets worse, and what operational burden does this add? + +## Rollout + +How will this be deployed safely? + +## Rollback + +How will this be reversed if it causes regressions? + diff --git a/docs/adr/0001-sdlc-governance-baseline.md b/docs/adr/0001-sdlc-governance-baseline.md new file mode 100644 index 0000000..e9e63b6 --- /dev/null +++ b/docs/adr/0001-sdlc-governance-baseline.md @@ -0,0 +1,32 @@ +# ADR 0001: SDLC Governance Baseline + +- Status: Accepted +- Date: 2026-04-24 +- Owners: Platform Engineering + +## Context + +The repository already had meaningful CI/CD, security scanning, and deployment automation, but the process controls around requirements, testing depth, rollout safety, operations, and environment discipline were incomplete or spread across implementation docs. + +## Decision + +This repository adopts the following baseline controls: + +- changes start from a GitHub issue or change request +- production-impacting decisions use ADRs +- PRs must include verification and rollback notes +- CI validates application tests, security checks, migration safety, and environment discipline +- operational expectations live in runbooks and SDLC docs in-repo + +## Consequences + +This adds process overhead to every change, but it reduces silent risk and makes reviews more deterministic. The repository becomes easier to audit and safer to deploy. + +## Rollout + +The baseline is enforced through repository templates, documentation, local developer tooling, and CI workflow gates. + +## Rollback + +The governance assets can be relaxed by removing the templates and CI jobs, but operationally that would be a regression and should itself require a change request. + diff --git a/docs/adr/README.md b/docs/adr/README.md new file mode 100644 index 0000000..0c34a45 --- /dev/null +++ b/docs/adr/README.md @@ -0,0 +1,12 @@ +# Architecture Decision Records + +Use ADRs for decisions that change: + +- public or internal contracts +- data models or migrations +- deployment or rollback strategy +- security posture +- observability or incident handling + +Create a new ADR from `docs/adr/0000-template.md` and assign the next number. + diff --git a/docs/runbooks/backup-restore-drill.md b/docs/runbooks/backup-restore-drill.md new file mode 100644 index 0000000..5f70cbb --- /dev/null +++ b/docs/runbooks/backup-restore-drill.md @@ -0,0 +1,21 @@ +# Runbook: Backup Restore Drill + +## Goal + +Prove that backups are usable, not just present. + +## Minimum Drill + +1. Select the latest known good backup or snapshot. +2. Restore to an isolated target. +3. Run schema validation and a small application smoke test. +4. Record restore duration, data gap, and follow-up issues. + +## Evidence To Save + +- snapshot or backup identifier +- restore target +- validation commands +- duration and outcome +- issues found during restore + diff --git a/docs/runbooks/db-migration-failure.md b/docs/runbooks/db-migration-failure.md new file mode 100644 index 0000000..ba1e4a9 --- /dev/null +++ b/docs/runbooks/db-migration-failure.md @@ -0,0 +1,27 @@ +# Runbook: Database Migration Failure + +## Trigger + +- migration workflow fails +- deploy fails because schema is behind +- app starts but queries fail after release + +## Immediate Actions + +1. Freeze further deploys. +2. Capture failing migration revision and database target. +3. Check whether the database is partially migrated. +4. Decide between rollback and forward-fix based on blast radius. + +## Validation + +- Review Alembic history and current revision. +- Review application errors tied to changed tables or indexes. +- Confirm backup freshness before any destructive action. + +## Recovery + +- If safe, run the rollback workflow or downgrade step. +- If downgrade is not safe, prepare a forward-fix migration and restore service. +- Record the incident and update the migration notes. + diff --git a/docs/runbooks/deploy-failure.md b/docs/runbooks/deploy-failure.md new file mode 100644 index 0000000..e894d40 --- /dev/null +++ b/docs/runbooks/deploy-failure.md @@ -0,0 +1,28 @@ +# Runbook: Deploy Failure + +## Trigger + +- staging or production deploy workflow fails +- post-deploy smoke test fails +- healthy candidate never becomes ready + +## Immediate Actions + +1. Stop further deploy attempts. +2. Identify the failing version, workflow run, and environment. +3. Check deploy logs, candidate container logs, `/ready`, and `/health`. +4. If user impact exists, start rollback immediately. + +## Validation + +- Confirm image tag and git SHA. +- Confirm secrets and environment file path. +- Confirm migration status. +- Confirm readiness and metrics endpoints. + +## Recovery + +- Use the repository rollback workflow for the environment. +- Re-run smoke checks after rollback. +- Open or update a change request with findings. + diff --git a/docs/sdlc/data-db-lifecycle.md b/docs/sdlc/data-db-lifecycle.md new file mode 100644 index 0000000..7c39ce9 --- /dev/null +++ b/docs/sdlc/data-db-lifecycle.md @@ -0,0 +1,23 @@ +# Data And Database Lifecycle + +## Rules + +- Schema changes must be migration-backed. +- Migrations must be safe on an empty database and on the current production schema line. +- Destructive data changes require explicit rollback notes or forward-fix strategy. +- Backups and restore drills are part of delivery, not just infrastructure setup. + +## Minimum Controls + +- Validate migrations in CI. +- Keep backup and restore scripts versioned. +- Run periodic restore drills and record results. +- Document retention and data ownership for each persisted store. + +## Current Repository Paths + +- Migration workflow: `.github/workflows/database-migrations.yml` +- Backup scripts: `scripts/dr/backup/` +- Restore scripts: `scripts/dr/restore/` +- Local database utility: `scripts/db.sh` + diff --git a/docs/sdlc/deployment-safety.md b/docs/sdlc/deployment-safety.md new file mode 100644 index 0000000..e1393da --- /dev/null +++ b/docs/sdlc/deployment-safety.md @@ -0,0 +1,24 @@ +# Deployment Safety + +## Required Controls + +- All deployments use immutable image tags. +- Staging deploys happen from `develop` after CI succeeds. +- Production deploys happen from `main` after CI succeeds. +- Candidate health must pass before traffic or environment promotion is considered successful. +- Rollback remains a first-class workflow, not an emergency-only script. + +## Release Checklist + +Verify the following before release: +- Linked issue and PR verification. +- Migration impact. +- Smoke and health endpoints. +- Dashboards and alerts to watch. +- Rollback workflow and last known good version. + +## Mandatory Follow-Up + +- Review post-deploy smoke results. +- Review error rate, latency, and auth failures. +- Create a follow-up issue for any non-blocking warning left in the deploy. diff --git a/docs/sdlc/developer-workflow.md b/docs/sdlc/developer-workflow.md new file mode 100644 index 0000000..c24d6c7 --- /dev/null +++ b/docs/sdlc/developer-workflow.md @@ -0,0 +1,24 @@ +# Developer Workflow + +## Local Standards + +- Use `pre-commit`. +- Run repo checks before pushing. +- Prefer reproducible scripts and Make targets over ad hoc shell history. +- Treat rollback and verification as part of implementation. + +## Expected Commands + +- `make bootstrap` +- `make backend` +- `make frontend` +- `make contract` +- `make migrate-validate` +- `make env-check` + +## Review Hygiene + +- Keep PRs narrow when possible. +- Update docs when behavior or operations change. +- Do not merge changes that rely on tribal knowledge for rollout or recovery. + diff --git a/docs/sdlc/environment-discipline.md b/docs/sdlc/environment-discipline.md new file mode 100644 index 0000000..a33a763 --- /dev/null +++ b/docs/sdlc/environment-discipline.md @@ -0,0 +1,16 @@ +# Environment Discipline + +## Principles + +- Build once, promote by immutable version where possible. +- Environment-specific values live in secrets or environment configuration, not source code. +- Example env files define the contract, not the live secret values. +- Drift between env contracts must be visible and reviewed. + +## Required Practices + +- Keep `.env.example` current with required keys and placeholders. +- Validate environment contract drift in CI. +- Never commit live credentials. +- Document production-only configuration and safety switches. + diff --git a/docs/sdlc/ops-maturity.md b/docs/sdlc/ops-maturity.md new file mode 100644 index 0000000..8a93807 --- /dev/null +++ b/docs/sdlc/ops-maturity.md @@ -0,0 +1,22 @@ +# Ops Maturity + +## Service Objectives + +- Availability SLO: 99.9% monthly for customer-facing application paths. +- API readiness SLO: 99.95% monthly for `/ready`. +- Change failure rate target: less than 15% of production deploys. +- Time to mitigate Sev1 incidents: less than 30 minutes. + +## Required Operational Assets + +- Dashboards for latency, error rate, saturation, auth failures, and deploy health. +- Alert routing for production incidents. +- Runbooks for deploy failure, migration failure, high error rate, and restore drills. +- Incident review notes for Sev1 and Sev2 events. + +## Review Cadence + +- Weekly: alerts and noisy monitors +- Monthly: SLO review +- Quarterly: dependency, backup, and runbook drills + diff --git a/docs/sdlc/requirements-change-control.md b/docs/sdlc/requirements-change-control.md new file mode 100644 index 0000000..b577ad3 --- /dev/null +++ b/docs/sdlc/requirements-change-control.md @@ -0,0 +1,23 @@ +# Requirements And Change Control + +## Policy + +- Every material change starts from an issue. +- High-risk changes use the `Change Request` template. +- Architectural, data, deployment, and security decisions require an ADR. +- PRs must link the issue or change request and include verification plus rollback. + +## Required Before Merge + +- Problem statement and acceptance criteria exist. +- Risk and blast radius are stated. +- Rollback path is documented. +- Data and secret impact are reviewed. + +## Required Before Production Release + +- CI is green. +- Migration safety is validated if schemas changed. +- Smoke verification and observability checks are defined. +- Any manual approval happens through the protected environment gate. + diff --git a/docs/sdlc/testing-pyramid.md b/docs/sdlc/testing-pyramid.md new file mode 100644 index 0000000..a7a5872 --- /dev/null +++ b/docs/sdlc/testing-pyramid.md @@ -0,0 +1,24 @@ +# Stronger Testing Pyramid + +## Target Distribution + +- Unit tests cover pure logic and edge cases. +- Integration tests cover FastAPI routes, auth, storage, and middleware behavior. +- Contract tests protect the Next.js proxy and backend interface assumptions. +- End-to-end tests cover primary user journeys. + +## Repository Rules + +- New logic should land with unit or route-level tests first. +- Proxy or API compatibility changes require contract coverage. +- Schema or storage changes require migration validation. +- Production-critical paths should have smoke checks and at least one rollback-safe verification path. + +## CI Gates + +- Backend tests +- Frontend tests +- Contract tests +- Migration validation +- Security checks + diff --git a/frontend/jest.setup.js b/frontend/jest.setup.js index ccc1b6d..8a95be4 100644 --- a/frontend/jest.setup.js +++ b/frontend/jest.setup.js @@ -25,29 +25,31 @@ jest.mock('next/navigation', () => ({ }, })) -// Mock window.matchMedia -Object.defineProperty(window, 'matchMedia', { - writable: true, - value: jest.fn().mockImplementation(query => ({ - matches: false, - media: query, - onchange: null, - addListener: jest.fn(), - removeListener: jest.fn(), - addEventListener: jest.fn(), - removeEventListener: jest.fn(), - dispatchEvent: jest.fn(), - })), -}) +if (typeof window !== 'undefined') { + // Mock window.matchMedia + Object.defineProperty(window, 'matchMedia', { + writable: true, + value: jest.fn().mockImplementation(query => ({ + matches: false, + media: query, + onchange: null, + addListener: jest.fn(), + removeListener: jest.fn(), + addEventListener: jest.fn(), + removeEventListener: jest.fn(), + dispatchEvent: jest.fn(), + })), + }) -// Mock localStorage -const localStorageMock = { - getItem: jest.fn(), - setItem: jest.fn(), - removeItem: jest.fn(), - clear: jest.fn(), + // Mock localStorage + const localStorageMock = { + getItem: jest.fn(), + setItem: jest.fn(), + removeItem: jest.fn(), + clear: jest.fn(), + } + global.localStorage = localStorageMock } -global.localStorage = localStorageMock // Mock fetch global.fetch = jest.fn() diff --git a/frontend/src/app/api/backend/__tests__/route-contract.test.ts b/frontend/src/app/api/backend/__tests__/route-contract.test.ts new file mode 100644 index 0000000..aeab227 --- /dev/null +++ b/frontend/src/app/api/backend/__tests__/route-contract.test.ts @@ -0,0 +1,74 @@ +/** @jest-environment node */ + +import { NextRequest } from "next/server"; + +import { GET, POST } from "../[...path]/route"; + +describe("backend proxy contract", () => { + let fetchSpy: jest.SpiedFunction; + + beforeEach(() => { + jest.clearAllMocks(); + fetchSpy = jest.spyOn(global, "fetch").mockImplementation(jest.fn()); + }); + + afterEach(() => { + fetchSpy.mockRestore(); + }); + + it("blocks path traversal attempts", async () => { + const request = new NextRequest("http://localhost:4000/api/backend/%2e%2e/secrets"); + + const response = await GET(request, { params: { path: ["%2e%2e", "secrets"] } }); + + expect(response.status).toBe(400); + expect(global.fetch).not.toHaveBeenCalled(); + }); + + it("rejects unauthenticated non-public requests before proxying", async () => { + const request = new NextRequest("http://localhost:4000/api/backend/chat"); + + const response = await GET(request, { params: { path: ["chat"] } }); + + expect(response.status).toBe(401); + expect(global.fetch).not.toHaveBeenCalled(); + }); + + it("forwards auth cookies and csrf header for mutation requests", async () => { + fetchSpy.mockResolvedValueOnce( + new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { + "content-type": "application/json", + "set-cookie": "session=value; Path=/; HttpOnly", + }, + }) + ); + + const request = new NextRequest("http://localhost:4000/api/backend/chat", { + method: "POST", + headers: { + cookie: "access_token=abc; csrf_token=csrf123", + }, + body: JSON.stringify({ message: "hello" }), + }); + + const response = await POST(request, { params: { path: ["chat"] } }); + + expect(response.status).toBe(200); + expect(global.fetch).toHaveBeenCalledTimes(1); + + const [target, init] = fetchSpy.mock.calls[0] as [ + string, + RequestInit & { headers: Headers }, + ]; + + const expectedBackendBase = + process.env.BACKEND_API_BASE_URL ?? "http://localhost:8010"; + + expect(target).toBe(`${expectedBackendBase}/chat`); + expect(init.method).toBe("POST"); + expect(init.headers.get("cookie")).toContain("access_token=abc"); + expect(init.headers.get("x-csrf-token")).toBe("csrf123"); + }); +}); diff --git a/scripts/check-env-drift.py b/scripts/check-env-drift.py new file mode 100644 index 0000000..ecb3e7f --- /dev/null +++ b/scripts/check-env-drift.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +from pathlib import Path +import sys + + +def load_keys(path: Path) -> set[str]: + keys: set[str] = set() + for raw_line in path.read_text().splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key = line.split("=", 1)[0].strip() + if key: + keys.add(key) + return keys + + +def main() -> int: + repo_root = Path(__file__).resolve().parent.parent + targets = [ + { + "source": repo_root / ".env.example", + "target": repo_root / ".env.production", + "ignore_prefixes": tuple(), + "ignore_keys": {"JWT_SECRET_KEY"}, + }, + { + "source": repo_root / ".env.example", + "target": repo_root / ".env.prod", + "ignore_prefixes": ("VERCEL_", "TURBO_", "NX_", "NEXT_PUBLIC_", "BACKEND_"), + "ignore_keys": {"VERCEL"}, + }, + { + "source": repo_root / "frontend" / ".env.local.example", + "target": repo_root / "frontend" / ".env.local", + "ignore_prefixes": ("VERCEL_", "TURBO_", "NX_"), + "ignore_keys": {"VERCEL", "BACKEND_API_BASE_URL", "NEXT_PUBLIC_BACKEND_URL"}, + "optional": True, + }, + ] + + failures: list[str] = [] + for item in targets: + source = item["source"] + target = item["target"] + optional = item.get("optional", False) + if not source.exists(): + if optional: + print( + f"Skipping optional contract pair: {source.relative_to(repo_root)} " + f"-> {target.relative_to(repo_root)}" + ) + continue + failures.append(f"missing source contract file: {source}") + continue + + if not target.exists(): + print( + f"Skipping optional target file: {target.relative_to(repo_root)} " + f"(source contract: {source.relative_to(repo_root)})" + ) + continue + + source_keys = load_keys(source) + target_keys = load_keys(target) + + undocumented = sorted( + key + for key in (target_keys - source_keys) + if key not in item["ignore_keys"] + and not any(key.startswith(prefix) for prefix in item["ignore_prefixes"]) + ) + if undocumented: + failures.append( + f"{target.relative_to(repo_root)} contains undocumented keys not declared in {source.relative_to(repo_root)}: {', '.join(undocumented)}" + ) + + if failures: + for failure in failures: + print(f"ERROR: {failure}") + return 1 + + print("Environment contract check passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/validate-migrations.sh b/scripts/validate-migrations.sh new file mode 100755 index 0000000..9dc06bc --- /dev/null +++ b/scripts/validate-migrations.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -euo pipefail + +POSTGRES_HOST="${POSTGRES_HOST:-127.0.0.1}" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_DB="${POSTGRES_DB:-migration_validation}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_SSL_MODE="${POSTGRES_SSL_MODE:-disable}" + +export SECRETS_PROVIDER="${SECRETS_PROVIDER:-env}" + +db_url="$(python3 - <<'PY' +import os +from urllib.parse import quote + +print( + "postgresql+psycopg2://" + f"{quote(os.environ['POSTGRES_USER'], safe='')}:" + f"{quote(os.environ['POSTGRES_PASSWORD'], safe='')}" + f"@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}" + f"/{quote(os.environ['POSTGRES_DB'], safe='')}" + f"?sslmode={quote(os.environ['POSTGRES_SSL_MODE'], safe='')}" +) +PY +)" + +# Never log db_url here; it contains credentials. +echo "Validating Alembic migrations against ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" +ALEMBIC_DB_URL="${db_url}" python -m alembic -c alembic.ini upgrade head +ALEMBIC_DB_URL="${db_url}" python -m alembic -c alembic.ini current diff --git a/scripts/verify_security.py b/scripts/verify_security.py index c96f647..a310917 100755 --- a/scripts/verify_security.py +++ b/scripts/verify_security.py @@ -6,6 +6,8 @@ import sys import os +import base64 +import json from pathlib import Path # Add parent directory to path @@ -17,6 +19,22 @@ logger = get_logger(__name__) +def decode_unverified_payload(token: str) -> dict: + """Parse the JWT payload segment without invoking JWT decode helpers.""" + + parts = token.split(".") + if len(parts) != 3: + raise ValueError("Invalid JWT format") + + payload_segment = parts[1] + padding = "=" * (-len(payload_segment) % 4) + decoded = base64.urlsafe_b64decode(payload_segment + padding) + payload = json.loads(decoded.decode("utf-8")) + if not isinstance(payload, dict): + raise ValueError("Invalid JWT payload") + return payload + + class SecurityVerifier: """Verify security configuration""" @@ -79,9 +97,7 @@ def check_jwt_secret(self): # Create a test token token = jwt_service.create_access_token("test_user", "test@example.com") - # Decode without verification to check JTI - import jwt as pyjwt - payload = pyjwt.decode(token, options={"verify_signature": False}) + payload = decode_unverified_payload(token) if "jti" in payload: self.passed.append("JWT tokens include JTI (revocation enabled)")