diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..398abd6 --- /dev/null +++ b/.env.example @@ -0,0 +1,99 @@ +# ============================================================================== +# FinanceGPT All-in-One Configuration +# ============================================================================== +# Copy this file to .env and customize as needed. +# Most settings have sensible defaults - you only need to set what you want to change. +# +# Quick start: Just run ./run.sh without any .env file! + +# ============================================================================== +# AUTHENTICATION (Required for production) +# ============================================================================== + +# JWT Secret Key - Auto-generated if not set, but set this for production! +# Generate with: openssl rand -hex 32 +SECRET_KEY= + +# Auth Type: LOCAL (email/password) or GOOGLE (OAuth) +AUTH_TYPE=LOCAL + +# Google OAuth (only if AUTH_TYPE=GOOGLE) +# GOOGLE_OAUTH_CLIENT_ID= +# GOOGLE_OAUTH_CLIENT_SECRET= + +# Allow new user registration +REGISTRATION_ENABLED=TRUE + +# ============================================================================== +# FINANCIAL DATA - PLAID (Optional) +# ============================================================================== +# Connect bank/brokerage accounts. 
Get keys from: https://dashboard.plaid.com/team/keys + +# PLAID_CLIENT_ID= +# PLAID_SECRET= +# PLAID_ENV=sandbox + +# ============================================================================== +# AI/ML CONFIGURATION +# ============================================================================== + +# Embedding model for semantic search +EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 + +# Rerankers for improved search (requires additional setup) +RERANKERS_ENABLED=FALSE +# RERANKERS_MODEL_NAME=ms-marco-MiniLM-L-12-v2 +# RERANKERS_MODEL_TYPE=flashrank + +# ============================================================================== +# DOCUMENT PROCESSING +# ============================================================================== + +# Parser: DOCLING (local, default) | UNSTRUCTURED (API) | LLAMACLOUD (API) +ETL_SERVICE=DOCLING + +# API keys (only if using cloud services) +# UNSTRUCTURED_API_KEY= +# LLAMA_CLOUD_API_KEY= + +# ============================================================================== +# VOICE SERVICES (Optional - for podcasts) +# ============================================================================== + +# Text-to-Speech: local/kokoro (default) or cloud provider +TTS_SERVICE=local/kokoro +# TTS_SERVICE_API_KEY= + +# Speech-to-Text: local/base, local/small, local/medium, local/large +STT_SERVICE=local/base +# STT_SERVICE_API_KEY= + +# ============================================================================== +# WEB CRAWLING (Optional) +# ============================================================================== + +# Firecrawl for advanced web scraping +# FIRECRAWL_API_KEY= + +# ============================================================================== +# OBSERVABILITY (Optional) +# ============================================================================== + +# LangSmith for LLM tracing and debugging +# LANGSMITH_TRACING=false +# LANGSMITH_API_KEY= +# LANGSMITH_PROJECT=financegpt + +# 
============================================================================== +# ADVANCED (Usually don't need to change) +# ============================================================================== + +# Host-side ports (container ports are fixed at 3000/8000); ELECTRIC_PORT (default 5133) can be set the same way +FRONTEND_PORT=3000 +BACKEND_PORT=8000 + +# Task scheduler interval +SCHEDULE_CHECKER_INTERVAL=5m + +# Max pages per user (0 = unlimited) +# PAGES_LIMIT=500 diff --git a/.github/workflows/docker_build.yaml b/.github/workflows/docker_build.yaml index 44aec4d..b648fcd 100644 --- a/.github/workflows/docker_build.yaml +++ b/.github/workflows/docker_build.yaml @@ -82,6 +82,7 @@ jobs: build_amd64: runs-on: ubuntu-latest needs: tag_release + timeout-minutes: 90 permissions: packages: write contents: read @@ -106,6 +107,10 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host - name: Free up disk space run: | @@ -114,6 +119,7 @@ jobs: sudo rm -rf /usr/local/share/boost sudo rm -rf "$AGENT_TOOLSDIRECTORY" docker system prune -af + df -h - name: Build and push AMD64 image id: build @@ -127,11 +133,14 @@ jobs: cache-from: type=gha,scope=amd64 cache-to: type=gha,mode=max,scope=amd64 provenance: false + build-args: | + BUILDKIT_INLINE_CACHE=1 # Build for ARM64 on native arm64 runner (no QEMU emulation!) 
build_arm64: runs-on: ubuntu-24.04-arm needs: tag_release + timeout-minutes: 120 permissions: packages: write contents: read @@ -156,6 +165,10 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host - name: Free up disk space run: | @@ -164,6 +177,7 @@ jobs: sudo rm -rf /usr/local/share/boost sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true docker system prune -af + df -h - name: Build and push ARM64 image id: build @@ -177,6 +191,8 @@ jobs: cache-from: type=gha,scope=arm64 cache-to: type=gha,mode=max,scope=arm64 provenance: false + build-args: | + BUILDKIT_INLINE_CACHE=1 # Create multi-arch manifest combining both platform images create_manifest: diff --git a/Dockerfile.allinone b/Dockerfile.allinone index 1f1e451..bdf2b10 100644 --- a/Dockerfile.allinone +++ b/Dockerfile.allinone @@ -29,20 +29,24 @@ WORKDIR /app # Install pnpm RUN corepack enable pnpm -# Copy package files +# Copy package files first for better caching COPY financegpt_web/package.json financegpt_web/pnpm-lock.yaml* ./ + +# Install dependencies in a separate layer (most cacheable) +# Raise pnpm's fetch-timeout (ms) so slow networks do not abort the install; network-timeout is a Yarn key pnpm ignores +RUN pnpm config set fetch-timeout 300000 \ + && pnpm install --frozen-lockfile --ignore-scripts + +# Copy config files needed for postinstall COPY financegpt_web/source.config.ts ./ COPY financegpt_web/content ./content -# Install dependencies (skip postinstall which requires all source files) -RUN pnpm install --frozen-lockfile --ignore-scripts +# Run fumadocs-mdx postinstall (NOTE(review): now runs before the full source COPY - confirm it only needs source.config.ts and content/) +RUN pnpm fumadocs-mdx -# Copy source +# Copy source (after dependencies are cached) COPY financegpt_web/ ./ -# Run fumadocs-mdx postinstall now that source files are available -RUN pnpm fumadocs-mdx - # Build with localhost URLs (all services run in same container) ENV NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 ENV NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL @@ -184,15 +188,17 @@ COPY 
--from=electric-builder /app /app/electric-release # ==================== WORKDIR /app/backend -# Copy backend dependency files +# Copy backend dependency files first (for better caching) COPY financegpt_backend/pyproject.toml financegpt_backend/uv.lock ./ -# Install PyTorch CPU-only (Docling needs it but OCR is disabled, no GPU needed) +# Install PyTorch CPU-only first (large layer, good to cache separately) RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu -# Install python dependencies -RUN pip install --no-cache-dir certifi pip-system-certs uv \ - && uv pip install --system --no-cache-dir -e . +# Install uv and base dependencies +RUN pip install --no-cache-dir certifi pip-system-certs uv + +# Install python dependencies (separate layer for caching) +RUN uv pip install --system --no-cache-dir -e . # Set SSL environment variables RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") \ @@ -202,12 +208,12 @@ RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") \ # Note: EasyOCR models NOT downloaded - OCR is disabled in docling_service.py # GPU support will be added in a future :cuda tagged image -# Install Playwright browsers +# Install Playwright browsers (separate layer) RUN pip install --no-cache-dir playwright \ && playwright install chromium \ && rm -rf /root/.cache/ms-playwright/ffmpeg* -# Copy backend source +# Copy backend source last (changes most frequently) COPY financegpt_backend/ ./ # ==================== @@ -226,6 +232,10 @@ RUN dos2unix /app/entrypoint.sh && chmod +x /app/entrypoint.sh COPY scripts/docker/init-postgres.sh /app/init-postgres.sh RUN dos2unix /app/init-postgres.sh && chmod +x /app/init-postgres.sh +# Electric SQL initialization script (same as used in local docker-compose) +COPY scripts/docker/init-electric-user.sh /app/init-electric-user.sh +RUN dos2unix /app/init-electric-user.sh && chmod +x /app/init-electric-user.sh + # Clean up build dependencies to 
reduce image size RUN apt-get purge -y build-essential postgresql-server-dev-14 git \ && apt-get autoremove -y \ diff --git a/docker-compose.quickstart.yml b/docker-compose.quickstart.yml index bd838ee..e39a80b 100644 --- a/docker-compose.quickstart.yml +++ b/docker-compose.quickstart.yml @@ -1,74 +1,72 @@ # FinanceGPT Quick Start Docker Compose # -# This is a simplified docker-compose for quick local deployment using pre-built images. -# For production or customized deployments, use the main docker-compose.yml -# # Usage: -# 1. (Optional) Create a .env file with your configuration -# 2. Run: docker compose -f docker-compose.quickstart.yml up -d -# 3. Access FinanceGPT at http://localhost:3000 +# ./run.sh # Easiest way - uses this file automatically +# ./run.sh start # Start FinanceGPT +# ./run.sh logs # View logs +# ./run.sh stop # Stop FinanceGPT # -# All Environment Variables are Optional: -# - SECRET_KEY: JWT secret key (auto-generated and persisted if not set) -# - EMBEDDING_MODEL: Embedding model to use (default: sentence-transformers/all-MiniLM-L6-v2) -# - ETL_SERVICE: Document parsing service - DOCLING, UNSTRUCTURED, or LLAMACLOUD (default: DOCLING) -# - TTS_SERVICE: Text-to-speech service for podcasts (default: local/kokoro) -# - STT_SERVICE: Speech-to-text service with model size (default: local/base) -# - FIRECRAWL_API_KEY: For web crawling features - -version: "3.8" +# Or manually: +# docker compose -f docker-compose.quickstart.yml up -d +# +# Configuration: +# Copy .env.example to .env and customize as needed. +# All settings have sensible defaults - no .env required for basic usage. 
services: - # All-in-one FinanceGPT container financegpt: image: ghcr.io/manojag115/financegpt:latest container_name: financegpt ports: - "${FRONTEND_PORT:-3000}:3000" - "${BACKEND_PORT:-8000}:8000" + - "${ELECTRIC_PORT:-5133}:5133" volumes: - financegpt-data:/data environment: - # Authentication (auto-generated if not set) + # === Authentication === - SECRET_KEY=${SECRET_KEY:-} - - # Auth Configuration - AUTH_TYPE=${AUTH_TYPE:-LOCAL} + - REGISTRATION_ENABLED=${REGISTRATION_ENABLED:-TRUE} - GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-} - GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-} - # AI/ML Configuration + # === Financial Data (Plaid) === + - PLAID_CLIENT_ID=${PLAID_CLIENT_ID:-} + - PLAID_SECRET=${PLAID_SECRET:-} + - PLAID_ENV=${PLAID_ENV:-sandbox} + + # === AI/ML === - EMBEDDING_MODEL=${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2} - RERANKERS_ENABLED=${RERANKERS_ENABLED:-FALSE} - RERANKERS_MODEL_NAME=${RERANKERS_MODEL_NAME:-} - RERANKERS_MODEL_TYPE=${RERANKERS_MODEL_TYPE:-} - # Document Processing + # === Document Processing === - ETL_SERVICE=${ETL_SERVICE:-DOCLING} - UNSTRUCTURED_API_KEY=${UNSTRUCTURED_API_KEY:-} - LLAMA_CLOUD_API_KEY=${LLAMA_CLOUD_API_KEY:-} + - PAGES_LIMIT=${PAGES_LIMIT:-999999999} - # Audio Services + # === Voice Services === - TTS_SERVICE=${TTS_SERVICE:-local/kokoro} - TTS_SERVICE_API_KEY=${TTS_SERVICE_API_KEY:-} - STT_SERVICE=${STT_SERVICE:-local/base} - STT_SERVICE_API_KEY=${STT_SERVICE_API_KEY:-} - # Web Crawling + # === Web Crawling === - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY:-} - # Optional Features - - REGISTRATION_ENABLED=${REGISTRATION_ENABLED:-TRUE} - - SCHEDULE_CHECKER_INTERVAL=${SCHEDULE_CHECKER_INTERVAL:-1m} + # === Scheduler === + - SCHEDULE_CHECKER_INTERVAL=${SCHEDULE_CHECKER_INTERVAL:-5m} - # LangSmith Observability (optional) + # === Observability (Optional) === - LANGSMITH_TRACING=${LANGSMITH_TRACING:-false} - - LANGSMITH_ENDPOINT=${LANGSMITH_ENDPOINT:-} - 
LANGSMITH_API_KEY=${LANGSMITH_API_KEY:-} - LANGSMITH_PROJECT=${LANGSMITH_PROJECT:-} restart: unless-stopped healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:3000", "&&", "curl", "-f", "http://localhost:8000/docs"] + test: ["CMD", "curl", "-f", "http://localhost:3000"] interval: 30s timeout: 10s retries: 3 diff --git a/financegpt_backend/alembic/versions/2_add_tax_forms_tables.py b/financegpt_backend/alembic/versions/2_add_tax_forms_tables.py new file mode 100644 index 0000000..791bb84 --- /dev/null +++ b/financegpt_backend/alembic/versions/2_add_tax_forms_tables.py @@ -0,0 +1,251 @@ +"""add_tax_forms_tables + +Revision ID: 2 +Revises: 1 +Create Date: 2026-01-30 00:00:00.000000 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import UUID, JSONB +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '2' +down_revision: Union[str, None] = '1' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Create the shared tax_forms table and one detail table per form type (W2, 1099-MISC, 1099-INT, 1099-DIV, 1099-B).""" + + # Base tax forms table: one row per uploaded form; every detail table below references it via tax_form_id + op.create_table( + 'tax_forms', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('user_id', UUID(as_uuid=True), sa.ForeignKey('user.id', ondelete='CASCADE'), nullable=False), + sa.Column('search_space_id', sa.Integer, sa.ForeignKey('searchspaces.id', ondelete='CASCADE'), nullable=False), + sa.Column('form_type', sa.String(20), nullable=False), # W2, 1099-MISC, 1099-INT, etc. 
+ sa.Column('tax_year', sa.Integer, nullable=False), + sa.Column('document_id', sa.Integer, sa.ForeignKey('documents.id', ondelete='SET NULL'), nullable=True), + sa.Column('uploaded_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('processed_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('processing_status', sa.String(20), server_default='pending', nullable=False), # pending, processing, completed, failed, needs_review + sa.Column('extraction_method', sa.String(50), nullable=True), # structured_pdf, unstructured, ocr, llm_assisted + sa.Column('confidence_score', sa.Numeric(3, 2), nullable=True), # 0.00 to 1.00 + sa.Column('needs_review', sa.Boolean, server_default='false', nullable=False), + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), # NOTE(review): onupdate is a client-side SQLAlchemy default, not DDL - confirm the ORM model sets it as well + ) + op.create_index('ix_tax_forms_user_id', 'tax_forms', ['user_id']) + op.create_index('ix_tax_forms_tax_year', 'tax_forms', ['tax_year']) + op.create_index('ix_tax_forms_form_type', 'tax_forms', ['form_type']) + op.create_index('ix_tax_forms_search_space_id', 'tax_forms', ['search_space_id']) + + # W2 forms table (at most one row per tax_forms row, enforced by unique tax_form_id) + op.create_table( + 'w2_forms', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('tax_form_id', UUID(as_uuid=True), sa.ForeignKey('tax_forms.id', ondelete='CASCADE'), nullable=False, unique=True), + + # Employer Information (EIN stored as a hash; NOTE(review): name/address are plain columns - confirm masking happens upstream) + sa.Column('employer_name', sa.String(255), nullable=True), + sa.Column('employer_ein_hash', sa.String(64), nullable=True), # SHA256 hashed + sa.Column('employer_address', sa.Text, nullable=True), + + # Employee Information (masked) + sa.Column('employee_ssn_hash', sa.String(64), nullable=True), # SHA256 hashed, never plain text + sa.Column('employee_name_masked', sa.String(255), 
nullable=True), # [EMPLOYEE_NAME] for UI + + # Wage Information - Boxes 1-8 + sa.Column('wages_tips_compensation', sa.Numeric(12, 2), nullable=True), # Box 1 + sa.Column('federal_income_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 2 + sa.Column('social_security_wages', sa.Numeric(12, 2), nullable=True), # Box 3 + sa.Column('social_security_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 4 + sa.Column('medicare_wages', sa.Numeric(12, 2), nullable=True), # Box 5 + sa.Column('medicare_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 6 + sa.Column('social_security_tips', sa.Numeric(12, 2), nullable=True), # Box 7 + sa.Column('allocated_tips', sa.Numeric(12, 2), nullable=True), # Box 8 + + # Other Compensation - Box 10-11 + sa.Column('dependent_care_benefits', sa.Numeric(12, 2), nullable=True), # Box 10 + sa.Column('nonqualified_plans', sa.Numeric(12, 2), nullable=True), # Box 11 + + # Box 12 codes (multiple entries) + sa.Column('box_12_codes', JSONB, nullable=True), # [{code: 'D', amount: 5000.00}, ...] 
+ + # Box 13 checkboxes + sa.Column('statutory_employee', sa.Boolean, server_default='false', nullable=False), + sa.Column('retirement_plan', sa.Boolean, server_default='false', nullable=False), + sa.Column('third_party_sick_pay', sa.Boolean, server_default='false', nullable=False), + + # State/Local Tax - Box 15-20 + sa.Column('state_code', sa.String(2), nullable=True), # Box 15 + sa.Column('state_wages', sa.Numeric(12, 2), nullable=True), # Box 16 + sa.Column('state_income_tax', sa.Numeric(12, 2), nullable=True), # Box 17 + sa.Column('local_wages', sa.Numeric(12, 2), nullable=True), # Box 18 + sa.Column('local_income_tax', sa.Numeric(12, 2), nullable=True), # Box 19 + sa.Column('locality_name', sa.String(100), nullable=True), # Box 20 + + # Field-level confidence scores + sa.Column('field_confidence_scores', JSONB, nullable=True), # {wages: 0.95, federal_tax: 0.88, ...} + + # Raw OCR/extraction data (for debugging/re-processing) + sa.Column('raw_extraction_data', JSONB, nullable=True), + + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), + ) + op.create_index('ix_w2_forms_tax_form_id', 'w2_forms', ['tax_form_id']) # NOTE(review): unique=True on tax_form_id already yields an index in PostgreSQL; this one (and its 1099 siblings) may be redundant + + # 1099-MISC forms table + op.create_table( + 'form_1099_misc', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('tax_form_id', UUID(as_uuid=True), sa.ForeignKey('tax_forms.id', ondelete='CASCADE'), nullable=False, unique=True), + + # Payer Information + sa.Column('payer_name', sa.String(255), nullable=True), + sa.Column('payer_tin_hash', sa.String(64), nullable=True), + sa.Column('payer_address', sa.Text, nullable=True), + + # Recipient (masked) + sa.Column('recipient_tin_hash', sa.String(64), nullable=True), + + # Income Boxes + sa.Column('rents', sa.Numeric(12, 2), nullable=True), # Box 1 + 
sa.Column('royalties', sa.Numeric(12, 2), nullable=True), # Box 2 + sa.Column('other_income', sa.Numeric(12, 2), nullable=True), # Box 3 + sa.Column('federal_income_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 4 + sa.Column('fishing_boat_proceeds', sa.Numeric(12, 2), nullable=True), # Box 5 + sa.Column('medical_health_payments', sa.Numeric(12, 2), nullable=True), # Box 6 + sa.Column('substitute_payments', sa.Numeric(12, 2), nullable=True), # Box 8 + sa.Column('crop_insurance_proceeds', sa.Numeric(12, 2), nullable=True), # Box 10 + sa.Column('gross_proceeds_attorney', sa.Numeric(12, 2), nullable=True), # Box 14 + sa.Column('section_409a_deferrals', sa.Numeric(12, 2), nullable=True), # Box 15 + sa.Column('state_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 16 + sa.Column('state_payer_number', sa.String(50), nullable=True), + sa.Column('state_income', sa.Numeric(12, 2), nullable=True), # Box 18 + + # Field confidence scores + sa.Column('field_confidence_scores', JSONB, nullable=True), + sa.Column('raw_extraction_data', JSONB, nullable=True), + + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), + ) + op.create_index('ix_1099_misc_tax_form_id', 'form_1099_misc', ['tax_form_id']) + + # 1099-INT (Interest Income) forms table + op.create_table( + 'form_1099_int', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('tax_form_id', UUID(as_uuid=True), sa.ForeignKey('tax_forms.id', ondelete='CASCADE'), nullable=False, unique=True), + + # Payer Information + sa.Column('payer_name', sa.String(255), nullable=True), + sa.Column('payer_tin_hash', sa.String(64), nullable=True), + + # Interest Income + sa.Column('interest_income', sa.Numeric(12, 2), nullable=True), # Box 1 + sa.Column('early_withdrawal_penalty', sa.Numeric(12, 
2), nullable=True), # Box 2 + sa.Column('interest_us_savings_bonds', sa.Numeric(12, 2), nullable=True), # Box 3 + sa.Column('federal_income_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 4 + sa.Column('investment_expenses', sa.Numeric(12, 2), nullable=True), # Box 5 + sa.Column('foreign_tax_paid', sa.Numeric(12, 2), nullable=True), # Box 6 + sa.Column('foreign_country', sa.String(100), nullable=True), # Box 7 + sa.Column('tax_exempt_interest', sa.Numeric(12, 2), nullable=True), # Box 8 + sa.Column('specified_private_activity_bond_interest', sa.Numeric(12, 2), nullable=True), # Box 9 + sa.Column('market_discount', sa.Numeric(12, 2), nullable=True), # Box 10 + sa.Column('bond_premium', sa.Numeric(12, 2), nullable=True), # Box 11 + sa.Column('bond_premium_treasury', sa.Numeric(12, 2), nullable=True), # Box 12 + sa.Column('tax_exempt_bond_premium', sa.Numeric(12, 2), nullable=True), # Box 13 + + sa.Column('field_confidence_scores', JSONB, nullable=True), + sa.Column('raw_extraction_data', JSONB, nullable=True), + + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), + ) + op.create_index('ix_1099_int_tax_form_id', 'form_1099_int', ['tax_form_id']) + + # 1099-DIV (Dividends) forms table + op.create_table( + 'form_1099_div', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('tax_form_id', UUID(as_uuid=True), sa.ForeignKey('tax_forms.id', ondelete='CASCADE'), nullable=False, unique=True), + + # Payer Information + sa.Column('payer_name', sa.String(255), nullable=True), + sa.Column('payer_tin_hash', sa.String(64), nullable=True), + + # Dividend Income + sa.Column('total_ordinary_dividends', sa.Numeric(12, 2), nullable=True), # Box 1a + sa.Column('qualified_dividends', sa.Numeric(12, 2), nullable=True), # Box 1b + 
sa.Column('total_capital_gain_distributions', sa.Numeric(12, 2), nullable=True), # Box 2a + sa.Column('unrecaptured_section_1250_gain', sa.Numeric(12, 2), nullable=True), # Box 2b + sa.Column('section_1202_gain', sa.Numeric(12, 2), nullable=True), # Box 2c + sa.Column('collectibles_gain', sa.Numeric(12, 2), nullable=True), # Box 2d + sa.Column('nondividend_distributions', sa.Numeric(12, 2), nullable=True), # Box 3 + sa.Column('federal_income_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 4 + sa.Column('section_199a_dividends', sa.Numeric(12, 2), nullable=True), # Box 5 + sa.Column('investment_expenses', sa.Numeric(12, 2), nullable=True), # Box 6 + sa.Column('foreign_tax_paid', sa.Numeric(12, 2), nullable=True), # Box 7 + sa.Column('foreign_country', sa.String(100), nullable=True), # Box 8 + sa.Column('cash_liquidation_distributions', sa.Numeric(12, 2), nullable=True), # Box 9 + sa.Column('noncash_liquidation_distributions', sa.Numeric(12, 2), nullable=True), # Box 10 + + sa.Column('field_confidence_scores', JSONB, nullable=True), + sa.Column('raw_extraction_data', JSONB, nullable=True), + + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), + ) + op.create_index('ix_1099_div_tax_form_id', 'form_1099_div', ['tax_form_id']) + + # 1099-B (Brokerage Transactions) forms table - NOTE(review): unique tax_form_id allows one transaction row per tax_forms row; confirm multi-sale 1099-Bs are split into separate tax_forms rows + op.create_table( + 'form_1099_b', + sa.Column('id', UUID(as_uuid=True), primary_key=True, server_default=sa.text('gen_random_uuid()')), + sa.Column('tax_form_id', UUID(as_uuid=True), sa.ForeignKey('tax_forms.id', ondelete='CASCADE'), nullable=False, unique=True), + + # Payer Information + sa.Column('payer_name', sa.String(255), nullable=True), + sa.Column('payer_tin_hash', sa.String(64), nullable=True), + + # Transaction Details + sa.Column('description_of_property', sa.Text, nullable=True), # Box 1a (stock name, 
quantity) + sa.Column('date_acquired', sa.Date, nullable=True), # Box 1b + sa.Column('date_sold', sa.Date, nullable=True), # Box 1c + sa.Column('proceeds', sa.Numeric(12, 2), nullable=True), # Box 1d + sa.Column('cost_basis', sa.Numeric(12, 2), nullable=True), # Box 1e + sa.Column('adjustments_to_basis', sa.Numeric(12, 2), nullable=True), # Box 1f + sa.Column('realized_gain_loss', sa.Numeric(12, 2), nullable=True), # Box 1g (calculated) + + sa.Column('federal_income_tax_withheld', sa.Numeric(12, 2), nullable=True), # Box 4 + + # Form Characteristics + sa.Column('short_term', sa.Boolean, nullable=True), # Box 2 + sa.Column('long_term', sa.Boolean, nullable=True), + sa.Column('basis_reported_to_irs', sa.Boolean, nullable=True), # Box 3 + sa.Column('noncovered_security', sa.Boolean, nullable=True), # Box 5 + + sa.Column('field_confidence_scores', JSONB, nullable=True), + sa.Column('raw_extraction_data', JSONB, nullable=True), + + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.now(), onupdate=sa.func.now(), nullable=False), + ) + op.create_index('ix_1099_b_tax_form_id', 'form_1099_b', ['tax_form_id']) + + +def downgrade() -> None: + """Drop the detail tables first, then tax_forms, so nothing still references tax_forms.id when it is dropped.""" + op.drop_table('form_1099_b') + op.drop_table('form_1099_div') + op.drop_table('form_1099_int') + op.drop_table('form_1099_misc') + op.drop_table('w2_forms') + op.drop_table('tax_forms') diff --git a/financegpt_backend/app/agents/new_chat/system_prompt.py b/financegpt_backend/app/agents/new_chat/system_prompt.py index 4126e46..d96b625 100644 --- a/financegpt_backend/app/agents/new_chat/system_prompt.py +++ b/financegpt_backend/app/agents/new_chat/system_prompt.py @@ -23,6 +23,7 @@ - Plan for major financial goals (retirement, home purchase, education) - Understand tax implications and opportunities - Make smarter financial decisions with confidence +- Organize and analyze tax 
documents (W2s, 1099s) for tax preparation Today's date (UTC): {resolved_today} @@ -515,6 +516,31 @@ - IMPORTANT: This tool fetches real-time credit card rewards data from the internet, so it works with ANY credit card the user has (no manual configuration needed). +11. analyze_tax_data: Query uploaded and processed tax forms to answer tax questions. + - **USE THIS TOOL** when users ask about: + * Tax form data: "How much did I earn?", "What were my wages?" + * Tax withholdings: "How much federal tax was withheld?" + * Interest income: "Did I have interest income?", "1099-INT summary?" + * Dividend income: "What dividends did I receive?" + * Capital gains: "Stock sale gains/losses?", "1099-B summary?" + * W2 employment: "Where did I work?", "Wages by employer?" + * Tax year summaries: "2024 tax summary", "Total income 2024" + - IMPORTANT: This tool queries ONLY uploaded tax forms (W2, 1099-MISC, 1099-INT, 1099-DIV, 1099-B) + - Does NOT calculate estimates or current year projections - only historical data from uploaded forms + - Args: + - query_type: Type of analysis (required). Options: + * "income_summary": Total income across all sources + * "tax_summary": Total taxes withheld (federal, state, SS, Medicare) + * "interest_income": Interest from 1099-INT forms + * "dividends_income": Dividends from 1099-DIV forms + * "capital_gains": Capital gains from 1099-B forms + * "w2_summary": W2 employment wages and withholdings + * "all_forms": List all uploaded tax forms + - tax_year: Specific tax year (e.g., 2024) or omit for all years + - form_types: Optional filter by form types (e.g., ["W2", "1099-INT"]) + - Returns: Structured tax data with totals, breakdowns, and per-form details + - Privacy: All PII (SSN, EIN) is hashed - never exposed in responses + FINANCIAL DATA QUERIES: @@ -590,6 +616,43 @@ - List top optimization opportunities - Provide specific card recommendations per category +TAX FORM ANALYSIS: + +- User: "How much did I earn in 2024?" 
+ - Call: `analyze_tax_data(query_type="income_summary", tax_year=2024)` + - Returns: Total W2 wages + 1099 income + interest + dividends + - Provide breakdown by source + +- User: "What was my total federal tax withheld last year?" + - Call: `analyze_tax_data(query_type="tax_summary", tax_year=2025)` + - Returns: Federal, state, Social Security, Medicare withholdings + - Show grand total and breakdown + +- User: "Did I have any interest income?" + - Call: `analyze_tax_data(query_type="interest_income")` + - Returns: 1099-INT forms with interest amounts by payer + - Mention if none found + +- User: "Show me my dividend income from last year" + - Call: `analyze_tax_data(query_type="dividends_income", tax_year=2025)` + - Returns: Ordinary and qualified dividends by payer + - Explain tax implications (qualified vs ordinary) + +- User: "What were my stock sale gains?" + - Call: `analyze_tax_data(query_type="capital_gains")` + - Returns: Short-term and long-term gains from 1099-B + - Break down by transaction and holding period + +- User: "Which companies did I work for in 2024?" + - Call: `analyze_tax_data(query_type="w2_summary", tax_year=2024)` + - Returns: W2 forms with employers, wages, and withholdings + - Summarize total wages and tax withheld + +- User: "List all my uploaded tax forms" + - Call: `analyze_tax_data(query_type="all_forms")` + - Returns: All tax forms with types, years, and processing status + - Note which forms need review (low confidence extractions) + - User: "How much more am I spending this month compared to last month?" 
- First call: `search_knowledge_base(query="transactions spending", start_date="2025-12-01", end_date="2025-12-31")` (Dec) - Second call: `search_knowledge_base(query="transactions spending", start_date="2026-01-01", end_date="2026-01-26")` (Jan) diff --git a/financegpt_backend/app/agents/new_chat/tools/registry.py b/financegpt_backend/app/agents/new_chat/tools/registry.py index 9257729..8ad479f 100644 --- a/financegpt_backend/app/agents/new_chat/tools/registry.py +++ b/financegpt_backend/app/agents/new_chat/tools/registry.py @@ -58,6 +58,7 @@ async def my_tool(param: str) -> dict: from .portfolio_performance import create_portfolio_performance_tool from .search_financegpt_docs import create_search_financegpt_docs_tool from .search_transactions import create_search_transactions_tool +from .tax_analysis import create_tax_analysis_tool from .user_memory import create_recall_memory_tool, create_save_memory_tool # ============================================================================= @@ -254,6 +255,20 @@ class ToolDefinition: requires=["search_space_id", "db_session", "connector_service"], ), # ========================================================================= + # TAX ANALYSIS TOOLS - UPLOADED TAX FORMS + # ========================================================================= + # Tax analysis tool - queries structured tax form data (W2, 1099s) + ToolDefinition( + name="analyze_tax_data", + description="Query uploaded and processed tax forms (W2, 1099s) to answer tax-related questions", + factory=lambda deps: create_tax_analysis_tool( + user_id=deps["user_id"], + search_space_id=deps["search_space_id"], + db_session=deps["db_session"], + ), + requires=["user_id", "search_space_id", "db_session"], + ), + # ========================================================================= # ADD YOUR CUSTOM TOOLS BELOW # ========================================================================= # Example: diff --git 
a/financegpt_backend/app/agents/new_chat/tools/tax_analysis.py b/financegpt_backend/app/agents/new_chat/tools/tax_analysis.py new file mode 100644 index 0000000..0f4413b --- /dev/null +++ b/financegpt_backend/app/agents/new_chat/tools/tax_analysis.py @@ -0,0 +1,222 @@ +"""Tax analysis tool for the agent. + +This tool allows the agent to query structured tax form data to answer questions like: +- "How much did I earn in 2024?" +- "What was my total federal tax withheld?" +- "Did I have any interest income?" +- "What were my capital gains from stock sales?" +""" + +import logging +from datetime import datetime +from decimal import Decimal +from typing import Any + +from langchain_core.tools import tool +from sqlalchemy import and_, desc, func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.schemas.tax_forms import TaxFormWithDetails + +logger = logging.getLogger(__name__) + + +def create_tax_analysis_tool(user_id: str, search_space_id: int, db_session: AsyncSession): + """Create the tax analysis tool for the agent. + + Args: + user_id: User ID (UUID string) + search_space_id: Search space ID + db_session: Database session + + Returns: + Configured tax analysis tool + """ + + @tool + async def analyze_tax_data( + query_type: str, + tax_year: int | None = None, + form_types: list[str] | None = None, + ) -> dict[str, Any]: + """Query uploaded tax forms to answer tax-related questions. + + Use this tool when users ask about income, taxes withheld, interest, dividends, + capital gains, or W2 employment information from their uploaded tax documents. 
+ + Args: + query_type: Type of tax analysis - "income_summary", "tax_summary", + "interest_income", "dividends_income", "capital_gains", + "w2_summary", or "all_forms" + tax_year: Specific tax year (e.g., 2024) or None for all years + form_types: Optional list of form types to filter (e.g., ["W2", "1099-INT"]) + + Returns: + Dictionary with analysis results including totals, breakdowns, and details + """ + return await _analyze_tax_data_impl( + user_id=user_id, + search_space_id=search_space_id, + query_type=query_type, + tax_year=tax_year, + form_types=form_types, + ) + + return analyze_tax_data + + +async def _analyze_tax_data_impl( + user_id: str, + search_space_id: int, + query_type: str, + tax_year: int | None = None, + form_types: list[str] | None = None, +) -> dict[str, Any]: + """Implementation of tax data analysis.""" + # Note: Actual database queries would go here + # For now, returning placeholder structure + + if query_type == "income_summary": + return await _get_income_summary(user_id, search_space_id, tax_year) + elif query_type == "tax_summary": + return await _get_tax_summary(user_id, search_space_id, tax_year) + elif query_type == "interest_income": + return await _get_interest_income(user_id, search_space_id, tax_year) + elif query_type == "dividends_income": + return await _get_dividends_income(user_id, search_space_id, tax_year) + elif query_type == "capital_gains": + return await _get_capital_gains(user_id, search_space_id, tax_year) + elif query_type == "w2_summary": + return await _get_w2_summary(user_id, search_space_id, tax_year) + elif query_type == "all_forms": + return await _get_all_forms(user_id, search_space_id, tax_year, form_types) + else: + return {"error": f"Unknown query type: {query_type}"} + + +async def _get_income_summary( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get total income across all sources.""" + # TODO: Implement actual database queries + # This would query W2s and 
1099s to sum total income + return { + "query_type": "income_summary", + "tax_year": tax_year or "all years", + "total_w2_wages": Decimal("0.00"), + "total_1099_misc_income": Decimal("0.00"), + "total_interest_income": Decimal("0.00"), + "total_dividend_income": Decimal("0.00"), + "total_capital_gains": Decimal("0.00"), + "grand_total_income": Decimal("0.00"), + "message": "No tax forms uploaded yet. Please upload your W2 and 1099 forms to see income summary.", + } + + +async def _get_tax_summary( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get total taxes withheld across all sources.""" + # TODO: Implement actual database queries + return { + "query_type": "tax_summary", + "tax_year": tax_year or "all years", + "total_federal_withheld": Decimal("0.00"), + "total_social_security_withheld": Decimal("0.00"), + "total_medicare_withheld": Decimal("0.00"), + "total_state_withheld": Decimal("0.00"), + "grand_total_withheld": Decimal("0.00"), + "message": "No tax forms uploaded yet. Please upload your W2 and 1099 forms to see tax withholdings.", + } + + +async def _get_interest_income( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get interest income from 1099-INT forms.""" + # TODO: Implement actual database queries + return { + "query_type": "interest_income", + "tax_year": tax_year or "all years", + "total_interest": Decimal("0.00"), + "sources": [], + "message": "No 1099-INT forms found. 
Upload your interest income statements to see details.", + } + + +async def _get_dividends_income( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get dividend income from 1099-DIV forms.""" + # TODO: Implement actual database queries + return { + "query_type": "dividends_income", + "tax_year": tax_year or "all years", + "total_ordinary_dividends": Decimal("0.00"), + "total_qualified_dividends": Decimal("0.00"), + "sources": [], + "message": "No 1099-DIV forms found. Upload your dividend income statements to see details.", + } + + +async def _get_capital_gains( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get capital gains from 1099-B forms.""" + # TODO: Implement actual database queries + return { + "query_type": "capital_gains", + "tax_year": tax_year or "all years", + "total_short_term_gains": Decimal("0.00"), + "total_long_term_gains": Decimal("0.00"), + "total_realized_gains": Decimal("0.00"), + "transactions": [], + "message": "No 1099-B forms found. Upload your brokerage statements to see capital gains.", + } + + +async def _get_w2_summary( + user_id: str, + search_space_id: int, + tax_year: int | None, +) -> dict[str, Any]: + """Get W2 summary.""" + # TODO: Implement actual database queries + return { + "query_type": "w2_summary", + "tax_year": tax_year or "all years", + "employers": [], + "total_wages": Decimal("0.00"), + "total_federal_withheld": Decimal("0.00"), + "total_social_security_withheld": Decimal("0.00"), + "total_medicare_withheld": Decimal("0.00"), + "message": "No W2 forms found. 
Upload your W2s to see employment income and withholdings.", + } + + +async def _get_all_forms( + user_id: str, + search_space_id: int, + tax_year: int | None, + form_types: list[str] | None, +) -> dict[str, Any]: + """Get all tax forms with optional filters.""" + # TODO: Implement actual database queries + return { + "query_type": "all_forms", + "tax_year": tax_year or "all years", + "form_types_filter": form_types, + "forms": [], + "total_forms": 0, + "message": "No tax forms uploaded yet. Upload W2s and 1099s to get started.", + } diff --git a/financegpt_backend/app/db.py b/financegpt_backend/app/db.py index 56f6687..3932bf1 100644 --- a/financegpt_backend/app/db.py +++ b/financegpt_backend/app/db.py @@ -961,6 +961,361 @@ class PortfolioAllocationTarget(BaseModel, TimestampMixin): metadata_ = Column("metadata", JSONB, nullable=True) +# ============================================================================ +# Tax Forms Models +# ============================================================================ + + +class TaxForm(BaseModel): + """Base tax form model for all tax-related documents.""" + + __tablename__ = "tax_forms" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False, index=True + ) + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False, index=True + ) + form_type = Column(String(20), nullable=False, index=True) # W2, 1099-MISC, 1099-INT, etc. 
+ tax_year = Column(Integer, nullable=False, index=True) + document_id = Column( + Integer, ForeignKey("documents.id", ondelete="SET NULL"), nullable=True + ) + uploaded_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + processed_at = Column(TIMESTAMP(timezone=True), nullable=True) + processing_status = Column( + String(20), server_default='pending', nullable=False + ) # pending, processing, completed, failed, needs_review + extraction_method = Column( + String(50), nullable=True + ) # structured_pdf, unstructured, ocr, llm_assisted + confidence_score = Column(Numeric(3, 2), nullable=True) # 0.00 to 1.00 + needs_review = Column(Boolean, server_default='false', nullable=False) + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + w2_form = relationship("W2Form", back_populates="tax_form", uselist=False, cascade="all, delete-orphan") + form_1099_misc = relationship("Form1099Misc", back_populates="tax_form", uselist=False, cascade="all, delete-orphan") + form_1099_int = relationship("Form1099Int", back_populates="tax_form", uselist=False, cascade="all, delete-orphan") + form_1099_div = relationship("Form1099Div", back_populates="tax_form", uselist=False, cascade="all, delete-orphan") + form_1099_b = relationship("Form1099B", back_populates="tax_form", uselist=False, cascade="all, delete-orphan") + + +class W2Form(BaseModel): + """W2 wage and tax statement model.""" + + __tablename__ = "w2_forms" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + tax_form_id = Column( + UUID(as_uuid=True), + ForeignKey("tax_forms.id", ondelete="CASCADE"), + nullable=False, + unique=True, + index=True, + ) + + # Employer Information (masked for privacy) + employer_name = Column(String(255), 
nullable=True) + employer_ein_hash = Column(String(64), nullable=True) # SHA256 hashed + employer_address = Column(Text, nullable=True) + + # Employee Information (masked) + employee_ssn_hash = Column(String(64), nullable=True) # SHA256 hashed, never plain text + employee_name_masked = Column(String(255), nullable=True) # [EMPLOYEE_NAME] for UI + + # Wage Information - Box 1-9 + wages_tips_compensation = Column(Numeric(12, 2), nullable=True) # Box 1 + federal_income_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 2 + social_security_wages = Column(Numeric(12, 2), nullable=True) # Box 3 + social_security_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 4 + medicare_wages = Column(Numeric(12, 2), nullable=True) # Box 5 + medicare_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 6 + social_security_tips = Column(Numeric(12, 2), nullable=True) # Box 7 + allocated_tips = Column(Numeric(12, 2), nullable=True) # Box 8 + + # Other Compensation - Box 10-11 + dependent_care_benefits = Column(Numeric(12, 2), nullable=True) # Box 10 + nonqualified_plans = Column(Numeric(12, 2), nullable=True) # Box 11 + + # Box 12 codes (multiple entries) + box_12_codes = Column(JSONB, nullable=True) # [{code: 'D', amount: 5000.00}, ...] 
+ + # Box 13 checkboxes + statutory_employee = Column(Boolean, server_default='false', nullable=False) + retirement_plan = Column(Boolean, server_default='false', nullable=False) + third_party_sick_pay = Column(Boolean, server_default='false', nullable=False) + + # State/Local Tax - Box 15-20 + state_code = Column(String(2), nullable=True) # Box 15 + state_wages = Column(Numeric(12, 2), nullable=True) # Box 16 + state_income_tax = Column(Numeric(12, 2), nullable=True) # Box 17 + local_wages = Column(Numeric(12, 2), nullable=True) # Box 18 + local_income_tax = Column(Numeric(12, 2), nullable=True) # Box 19 + locality_name = Column(String(100), nullable=True) # Box 20 + + # Field-level confidence scores + field_confidence_scores = Column(JSONB, nullable=True) # {wages: 0.95, federal_tax: 0.88, ...} + + # Raw OCR/extraction data (for debugging/re-processing) + raw_extraction_data = Column(JSONB, nullable=True) + + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + tax_form = relationship("TaxForm", back_populates="w2_form") + + +class Form1099Misc(BaseModel): + """1099-MISC miscellaneous income form model.""" + + __tablename__ = "form_1099_misc" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + tax_form_id = Column( + UUID(as_uuid=True), + ForeignKey("tax_forms.id", ondelete="CASCADE"), + nullable=False, + unique=True, + index=True, + ) + + # Payer Information + payer_name = Column(String(255), nullable=True) + payer_tin_hash = Column(String(64), nullable=True) + payer_address = Column(Text, nullable=True) + + # Recipient (masked) + recipient_tin_hash = Column(String(64), nullable=True) + + # Income Boxes + rents = Column(Numeric(12, 2), nullable=True) # Box 1 + royalties = Column(Numeric(12, 2), nullable=True) # Box 2 
+ other_income = Column(Numeric(12, 2), nullable=True) # Box 3 + federal_income_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 4 + fishing_boat_proceeds = Column(Numeric(12, 2), nullable=True) # Box 5 + medical_health_payments = Column(Numeric(12, 2), nullable=True) # Box 6 + substitute_payments = Column(Numeric(12, 2), nullable=True) # Box 8 + crop_insurance_proceeds = Column(Numeric(12, 2), nullable=True) # Box 10 + gross_proceeds_attorney = Column(Numeric(12, 2), nullable=True) # Box 14 + section_409a_deferrals = Column(Numeric(12, 2), nullable=True) # Box 15 + state_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 16 + state_payer_number = Column(String(50), nullable=True) + state_income = Column(Numeric(12, 2), nullable=True) # Box 18 + + # Field confidence scores + field_confidence_scores = Column(JSONB, nullable=True) + raw_extraction_data = Column(JSONB, nullable=True) + + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + tax_form = relationship("TaxForm", back_populates="form_1099_misc") + + +class Form1099Int(BaseModel): + """1099-INT interest income form model.""" + + __tablename__ = "form_1099_int" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + tax_form_id = Column( + UUID(as_uuid=True), + ForeignKey("tax_forms.id", ondelete="CASCADE"), + nullable=False, + unique=True, + index=True, + ) + + # Payer Information + payer_name = Column(String(255), nullable=True) + payer_tin_hash = Column(String(64), nullable=True) + payer_address = Column(Text, nullable=True) + + # Recipient (masked) + recipient_tin_hash = Column(String(64), nullable=True) + + # Interest Income Boxes + interest_income = Column(Numeric(12, 2), nullable=True) # Box 1 + early_withdrawal_penalty = Column(Numeric(12, 
2), nullable=True) # Box 2 + interest_on_us_savings_bonds = Column(Numeric(12, 2), nullable=True) # Box 3 + federal_income_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 4 + investment_expenses = Column(Numeric(12, 2), nullable=True) # Box 5 + foreign_tax_paid = Column(Numeric(12, 2), nullable=True) # Box 6 + foreign_country = Column(String(100), nullable=True) # Box 7 + tax_exempt_interest = Column(Numeric(12, 2), nullable=True) # Box 8 + specified_private_activity_bond_interest = Column(Numeric(12, 2), nullable=True) # Box 9 + market_discount = Column(Numeric(12, 2), nullable=True) # Box 10 + bond_premium = Column(Numeric(12, 2), nullable=True) # Box 11 + bond_premium_on_treasury = Column(Numeric(12, 2), nullable=True) # Box 12 + bond_premium_on_tax_exempt = Column(Numeric(12, 2), nullable=True) # Box 13 + state_code = Column(String(2), nullable=True) # Box 15 + state_id = Column(String(50), nullable=True) # Box 16 + state_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 17 + + # Field confidence scores + field_confidence_scores = Column(JSONB, nullable=True) + raw_extraction_data = Column(JSONB, nullable=True) + + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + tax_form = relationship("TaxForm", back_populates="form_1099_int") + + +class Form1099Div(BaseModel): + """1099-DIV dividend income form model.""" + + __tablename__ = "form_1099_div" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + tax_form_id = Column( + UUID(as_uuid=True), + ForeignKey("tax_forms.id", ondelete="CASCADE"), + nullable=False, + unique=True, + index=True, + ) + + # Payer Information + payer_name = Column(String(255), nullable=True) + payer_tin_hash = Column(String(64), nullable=True) + payer_address = Column(Text, 
nullable=True) + + # Recipient (masked) + recipient_tin_hash = Column(String(64), nullable=True) + + # Dividend Income Boxes + total_ordinary_dividends = Column(Numeric(12, 2), nullable=True) # Box 1a + qualified_dividends = Column(Numeric(12, 2), nullable=True) # Box 1b + total_capital_gain_distributions = Column(Numeric(12, 2), nullable=True) # Box 2a + unrecaptured_section_1250_gain = Column(Numeric(12, 2), nullable=True) # Box 2b + section_1202_gain = Column(Numeric(12, 2), nullable=True) # Box 2c + collectibles_28_gain = Column(Numeric(12, 2), nullable=True) # Box 2d + section_897_ordinary_dividends = Column(Numeric(12, 2), nullable=True) # Box 2e + section_897_capital_gain = Column(Numeric(12, 2), nullable=True) # Box 2f + nondividend_distributions = Column(Numeric(12, 2), nullable=True) # Box 3 + federal_income_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 4 + section_199a_dividends = Column(Numeric(12, 2), nullable=True) # Box 5 + investment_expenses = Column(Numeric(12, 2), nullable=True) # Box 6 + foreign_tax_paid = Column(Numeric(12, 2), nullable=True) # Box 7 + foreign_country = Column(String(100), nullable=True) # Box 8 + cash_liquidation_distributions = Column(Numeric(12, 2), nullable=True) # Box 9 + noncash_liquidation_distributions = Column(Numeric(12, 2), nullable=True) # Box 10 + exempt_interest_dividends = Column(Numeric(12, 2), nullable=True) # Box 11 + specified_private_activity_bond_interest_dividends = Column(Numeric(12, 2), nullable=True) # Box 12 + state_code = Column(String(2), nullable=True) # Box 14 + state_id = Column(String(50), nullable=True) # Box 15 + state_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 16 + + # Field confidence scores + field_confidence_scores = Column(JSONB, nullable=True) + raw_extraction_data = Column(JSONB, nullable=True) + + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + 
server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + tax_form = relationship("TaxForm", back_populates="form_1099_div") + + +class Form1099B(BaseModel): + """1099-B broker transaction form model.""" + + __tablename__ = "form_1099_b" + + id = Column(UUID(as_uuid=True), primary_key=True, server_default=text('gen_random_uuid()'), index=True) + tax_form_id = Column( + UUID(as_uuid=True), + ForeignKey("tax_forms.id", ondelete="CASCADE"), + nullable=False, + unique=True, + index=True, + ) + + # Payer/Broker Information + payer_name = Column(String(255), nullable=True) + payer_tin_hash = Column(String(64), nullable=True) + payer_address = Column(Text, nullable=True) + + # Recipient (masked) + recipient_tin_hash = Column(String(64), nullable=True) + + # Transaction Details + description_of_property = Column(Text, nullable=True) # Box 1a + date_acquired = Column(String(50), nullable=True) # Box 1b (can be "VARIOUS") + date_sold = Column(String(50), nullable=True) # Box 1c + proceeds = Column(Numeric(12, 2), nullable=True) # Box 1d + cost_basis = Column(Numeric(12, 2), nullable=True) # Box 1e + accrued_market_discount = Column(Numeric(12, 2), nullable=True) # Box 1f + wash_sale_loss_disallowed = Column(Numeric(12, 2), nullable=True) # Box 1g + federal_income_tax_withheld = Column(Numeric(12, 2), nullable=True) # Box 4 + + # Form 8949 checkboxes + short_term_box_a = Column(Boolean, server_default='false', nullable=False) + short_term_box_b = Column(Boolean, server_default='false', nullable=False) + short_term_box_c = Column(Boolean, server_default='false', nullable=False) + long_term_box_d = Column(Boolean, server_default='false', nullable=False) + long_term_box_e = Column(Boolean, server_default='false', nullable=False) + long_term_box_f = Column(Boolean, server_default='false', nullable=False) + + # Applicable checkbox + loss_not_allowed = Column(Boolean, server_default='false', nullable=False) + noncovered_security = 
Column(Boolean, server_default='false', nullable=False) + basis_reported_to_irs = Column(Boolean, server_default='false', nullable=False) + + # State tax information + state_code = Column(String(2), nullable=True) + state_id = Column(String(50), nullable=True) + state_tax_withheld = Column(Numeric(12, 2), nullable=True) + + # Field confidence scores + field_confidence_scores = Column(JSONB, nullable=True) + raw_extraction_data = Column(JSONB, nullable=True) + + created_at = Column(TIMESTAMP(timezone=True), server_default=text('now()'), nullable=False) + updated_at = Column( + TIMESTAMP(timezone=True), + server_default=text('now()'), + onupdate=text('now()'), + nullable=False, + ) + + # Relationships + tax_form = relationship("TaxForm", back_populates="form_1099_b") + + class NewLLMConfig(BaseModel, TimestampMixin): """ New LLM configuration table that combines model settings with prompt configuration. diff --git a/financegpt_backend/app/parsers/tax_form_parser.py b/financegpt_backend/app/parsers/tax_form_parser.py new file mode 100644 index 0000000..57f9fcd --- /dev/null +++ b/financegpt_backend/app/parsers/tax_form_parser.py @@ -0,0 +1,450 @@ +"""Tiered tax form parser with hybrid extraction strategy. + +Extraction Priority: +1. Structured PDF extraction (pdfplumber) - best for text-based PDFs +2. Unstructured library - handles more complex layouts +3. OCR with pattern matching - for scanned documents +4. LLM-assisted extraction - last resort, with PII masked + +Each tier returns confidence scores. If confidence < 0.85, escalate to next tier. 
+""" + +import logging +import re +from decimal import Decimal +from pathlib import Path +from typing import Any, Literal + +import pdfplumber +from unstructured.partition.pdf import partition_pdf + +from app.utils.pii_masking import mask_tax_form_for_llm, validate_confidence_threshold + +logger = logging.getLogger(__name__) + + +class TaxFormParser: + """Hybrid tax form parser with tiered extraction.""" + + CONFIDENCE_THRESHOLD = 0.85 + + # Common patterns for tax form fields + PATTERNS = { + "ssn": r"\b\d{3}-\d{2}-\d{4}\b", + "ein": r"\b\d{2}-\d{7}\b", + "money": r"\$?\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?", + "date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b", + "percentage": r"\d+(?:\.\d+)?%", + } + + def __init__(self): + """Initialize parser.""" + self.extraction_history: list[dict[str, Any]] = [] + + async def parse_tax_form( + self, + file_path: str | Path, + form_type: Literal["W2", "1099-MISC", "1099-INT", "1099-DIV", "1099-B"], + tax_year: int, + ) -> dict[str, Any]: + """Parse tax form using tiered extraction strategy. 
+
+        Runs the extraction tiers in order and returns the first result whose
+        average per-field confidence reaches CONFIDENCE_THRESHOLD. If no tier
+        gets there, the highest-confidence partial result seen so far is handed
+        to the LLM-assisted tier instead of whatever the last tier returned.
+
+        Args:
+            file_path: Path to PDF file
+            form_type: Type of tax form
+            tax_year: Tax year for the form
+
+        Returns:
+            Dictionary containing:
+                - extracted_data: Parsed form fields
+                - confidence_scores: Per-field confidence
+                - extraction_method: Method used (structured_pdf, unstructured, ocr, llm_assisted)
+                - needs_review: True if confidence < threshold
+                - raw_extraction_data: Full extraction details
+
+        Raises:
+            FileNotFoundError: If file_path does not exist.
+        """
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"Tax form file not found: {file_path}")
+
+        # Tiers 1-3 in escalation order: (tier number, label used in logs, extractor).
+        tiers = (
+            # Structured PDF extraction (fastest, most accurate for text PDFs)
+            (1, "structured PDF", self._extract_structured_pdf),
+            # Unstructured library (better layout analysis)
+            (2, "unstructured library", self._extract_unstructured),
+            # OCR with pattern matching
+            (3, "OCR", self._extract_ocr),
+        )
+
+        result: dict[str, Any] | None = None
+        best_result: dict[str, Any] | None = None
+        best_confidence = -1.0
+
+        for tier, label, extract in tiers:
+            logger.info(f"Tier {tier}: Attempting {label} extraction for {form_type}")
+            result = await extract(file_path, form_type, tax_year)
+
+            if result and result["confidence_scores"]:
+                scores = result["confidence_scores"]
+                avg_confidence = sum(scores.values()) / len(scores)
+                logger.info(f"Tier {tier} average confidence: {avg_confidence:.2f}")
+
+                if avg_confidence >= self.CONFIDENCE_THRESHOLD:
+                    logger.info(f"Tier {tier} succeeded with {avg_confidence:.2f} confidence")
+                    return result
+
+                # Keep the strongest partial extraction. Without this, a later
+                # tier that returns nothing would overwrite usable low-confidence
+                # data before it reaches Tier 4.
+                if avg_confidence > best_confidence:
+                    best_confidence = avg_confidence
+                    best_result = result
+
+        # Tier 4: LLM-assisted extraction (last resort, with PII masked)
+        logger.warning(f"Tier 4: Escalating to LLM-assisted extraction for {form_type}")
+        # Fall back to the last tier's result only when no tier produced any scores.
+        previous_result = best_result if best_result is not None else result
+        result = await self._extract_llm_assisted(file_path, form_type, tax_year, previous_result=previous_result)
+
+        return result
+
+    async def _extract_structured_pdf(
+        self,
+        file_path: Path,
+        form_type: str,
+        tax_year: int,
+    ) -> dict[str, Any]:
+        """Extract data using pdfplumber (structured PDF).
+
+        Best for: Text-based PDFs with clear structure.
+
+        Args:
+            file_path: PDF to open with pdfplumber.
+            form_type: Selects which _parse_*_text parser is applied.
+            tax_year: Accepted for signature parity with the other tiers; not
+                read in this method.
+
+        Returns:
+            Result dictionary with the same keys as parse_tax_form. Any
+            exception (including an unsupported form_type) is logged and
+            turned into an empty result with needs_review=True.
+        """
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                # Extract text from all pages
+                full_text = ""
+                for page in pdf.pages:
+                    # A page without a text layer may yield no text; treat it
+                    # as empty so one blank page does not abort the whole tier.
+                    full_text += (page.extract_text() or "") + "\n"
+
+                # Extract based on form type
+                if form_type == "W2":
+                    extracted_data = self._parse_w2_text(full_text)
+                elif form_type == "1099-MISC":
+                    extracted_data = self._parse_1099_misc_text(full_text)
+                elif form_type == "1099-INT":
+                    extracted_data = self._parse_1099_int_text(full_text)
+                elif form_type == "1099-DIV":
+                    extracted_data = self._parse_1099_div_text(full_text)
+                elif form_type == "1099-B":
+                    extracted_data = self._parse_1099_b_text(full_text)
+                else:
+                    raise ValueError(f"Unsupported form type: {form_type}")
+
+                # Calculate confidence scores based on field population
+                confidence_scores = self._calculate_confidence_scores(extracted_data)
+
+                return {
+                    "extracted_data": extracted_data,
+                    "confidence_scores": confidence_scores,
+                    "extraction_method": "structured_pdf",
+                    "needs_review": not self._meets_confidence_threshold(confidence_scores),
+                    # NOTE(review): full_text is the unmasked page text, so it can
+                    # contain the SSN/EIN that PATTERNS matches - confirm callers
+                    # redact it before persisting raw_extraction_data.
+                    "raw_extraction_data": {"full_text": full_text},
+                }
+
+        except Exception as e:
+            logger.error(f"Structured PDF extraction failed: {e}")
+            return {
+                "extracted_data": {},
+                "confidence_scores": {},
+                "extraction_method":
"structured_pdf", + "needs_review": True, + "raw_extraction_data": {"error": str(e)}, + } + + async def _extract_unstructured( + self, + file_path: Path, + form_type: str, + tax_year: int, + ) -> dict[str, Any]: + """Extract data using unstructured library. + + Best for: PDFs with complex layouts, tables, multiple columns. + """ + try: + # Use unstructured to partition the PDF + elements = partition_pdf(str(file_path), strategy="hi_res") + + # Combine all text elements + full_text = "\n".join([str(el) for el in elements]) + + # Extract based on form type (same parsers as structured PDF) + if form_type == "W2": + extracted_data = self._parse_w2_text(full_text) + elif form_type == "1099-MISC": + extracted_data = self._parse_1099_misc_text(full_text) + elif form_type == "1099-INT": + extracted_data = self._parse_1099_int_text(full_text) + elif form_type == "1099-DIV": + extracted_data = self._parse_1099_div_text(full_text) + elif form_type == "1099-B": + extracted_data = self._parse_1099_b_text(full_text) + else: + raise ValueError(f"Unsupported form type: {form_type}") + + confidence_scores = self._calculate_confidence_scores(extracted_data) + + return { + "extracted_data": extracted_data, + "confidence_scores": confidence_scores, + "extraction_method": "unstructured", + "needs_review": not self._meets_confidence_threshold(confidence_scores), + "raw_extraction_data": { + "full_text": full_text, + "num_elements": len(elements), + }, + } + + except Exception as e: + logger.error(f"Unstructured extraction failed: {e}") + return { + "extracted_data": {}, + "confidence_scores": {}, + "extraction_method": "unstructured", + "needs_review": True, + "raw_extraction_data": {"error": str(e)}, + } + + async def _extract_ocr( + self, + file_path: Path, + form_type: str, + tax_year: int, + ) -> dict[str, Any]: + """Extract data using OCR with pattern matching. + + Best for: Scanned documents, images of tax forms. 
+ Note: This is a placeholder - would use pytesseract or similar in production. + """ + # TODO: Implement OCR extraction with pytesseract + # For now, return empty result to trigger LLM escalation + logger.warning("OCR extraction not yet implemented") + return { + "extracted_data": {}, + "confidence_scores": {}, + "extraction_method": "ocr", + "needs_review": True, + "raw_extraction_data": {"status": "not_implemented"}, + } + + async def _extract_llm_assisted( + self, + file_path: Path, + form_type: str, + tax_year: int, + previous_result: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Extract/verify data using LLM (with PII masked). + + Best for: Verification of low-confidence fields, unusual layouts. + IMPORTANT: All PII is masked before sending to LLM. + """ + # TODO: Implement LLM-assisted extraction using instructor + # This would: + # 1. Convert PDF to image or extract text + # 2. Mask any PII found in previous extraction + # 3. Send to LLM with structured output schema + # 4. 
Return verified/extracted data
+
+        logger.warning("LLM-assisted extraction not yet implemented")
+
+        # For now, return previous result marked as needs_review
+        if previous_result:
+            previous_result["extraction_method"] = "llm_assisted"
+            previous_result["needs_review"] = True
+            return previous_result
+
+        return {
+            "extracted_data": {},
+            "confidence_scores": {},
+            "extraction_method": "llm_assisted",
+            "needs_review": True,
+            "raw_extraction_data": {"status": "not_implemented"},
+        }
+
+    def _parse_w2_text(self, text: str) -> dict[str, Any]:
+        """Parse W2 form from extracted text.
+
+        Each box is located by its label (or "Box N") and the first number that
+        follows on the same line is taken as its value. The amount pattern
+        accepts both grouped ("50,000.00") and ungrouped ("50000.00") figures,
+        and "Box N" is anchored with \\b so "Box 1" cannot match "Box 12".
+
+        Args:
+            text: Plain text extracted from the form.
+
+        Returns:
+            Dict holding only the fields whose pattern matched, plus
+            "retirement_plan", which is always set. Amounts are passed through
+            self._parse_money (defined elsewhere in this class; presumably
+            returns a Decimal - confirm). "employee_ssn" and "employer_ein"
+            hold the raw matched strings.
+        """
+        data: dict[str, Any] = {}
+
+        # Box 1: Wages, tips, other compensation
+        wages_match = re.search(r"(?:Wages|Box 1\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if wages_match:
+            data["wages_tips_compensation"] = self._parse_money(wages_match.group(1))
+
+        # Box 2: Federal income tax withheld
+        fed_tax_match = re.search(r"(?:Federal.*?tax.*?withheld|Box 2\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if fed_tax_match:
+            data["federal_income_tax_withheld"] = self._parse_money(fed_tax_match.group(1))
+
+        # Box 3: Social security wages
+        ss_wages_match = re.search(r"(?:Social security wages|Box 3\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if ss_wages_match:
+            data["social_security_wages"] = self._parse_money(ss_wages_match.group(1))
+
+        # Box 4: Social security tax withheld
+        ss_tax_match = re.search(r"(?:Social security tax|Box 4\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if ss_tax_match:
+            data["social_security_tax_withheld"] = self._parse_money(ss_tax_match.group(1))
+
+        # Box 5: Medicare wages
+        medicare_wages_match = re.search(r"(?:Medicare wages|Box 5\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if medicare_wages_match:
+            data["medicare_wages"] = self._parse_money(medicare_wages_match.group(1))
+
+        # Box 6: Medicare tax withheld
+        medicare_tax_match = re.search(r"(?:Medicare tax|Box 6\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if medicare_tax_match:
+            data["medicare_tax_withheld"] = self._parse_money(medicare_tax_match.group(1))
+
+        # Extract SSN (will be hashed before storage)
+        ssn_match = re.search(self.PATTERNS["ssn"], text)
+        if ssn_match:
+            data["employee_ssn"] = ssn_match.group(0)
+
+        # Extract EIN
+        ein_match = re.search(self.PATTERNS["ein"], text)
+        if ein_match:
+            data["employer_ein"] = ein_match.group(0)
+
+        # Box 13: Retirement plan checkbox
+        # NOTE(review): with IGNORECASE any "x" later on the same line (e.g. in
+        # "tax") counts as a tick - confirm against real extractions.
+        data["retirement_plan"] = bool(re.search(r"Retirement plan.*?[Xx✓]", text, re.IGNORECASE))
+
+        return data
+
+    def _parse_1099_misc_text(self, text: str) -> dict[str, Any]:
+        """Parse 1099-MISC form from extracted text.
+
+        Args:
+            text: Plain text extracted from the form.
+
+        Returns:
+            Dict with "rents", "royalties", "other_income" and
+            "federal_income_tax_withheld" for each box whose label was found;
+            unmatched boxes are omitted.
+        """
+        data: dict[str, Any] = {}
+
+        # Box 1: Rents
+        rents_match = re.search(r"(?:Rents|Box 1\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if rents_match:
+            data["rents"] = self._parse_money(rents_match.group(1))
+
+        # Box 2: Royalties
+        royalties_match = re.search(r"(?:Royalties|Box 2\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if royalties_match:
+            data["royalties"] = self._parse_money(royalties_match.group(1))
+
+        # Box 3: Other income
+        other_match = re.search(r"(?:Other income|Box 3\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if other_match:
+            data["other_income"] = self._parse_money(other_match.group(1))
+
+        # Box 4: Federal income tax withheld
+        fed_tax_match = re.search(r"(?:Federal.*?tax.*?withheld|Box 4\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if fed_tax_match:
+            data["federal_income_tax_withheld"] = self._parse_money(fed_tax_match.group(1))
+
+        return data
+
+    def _parse_1099_int_text(self, text: str) -> dict[str, Any]:
+        """Parse 1099-INT form from extracted text.
+
+        Args:
+            text: Plain text extracted from the form.
+
+        Returns:
+            Dict with "interest_income", "early_withdrawal_penalty" and
+            "federal_income_tax_withheld" for each box whose label was found;
+            unmatched boxes are omitted.
+        """
+        data: dict[str, Any] = {}
+
+        # Box 1: Interest income
+        interest_match = re.search(r"(?:Interest income|Box 1\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if interest_match:
+            data["interest_income"] = self._parse_money(interest_match.group(1))
+
+        # Box 2: Early withdrawal penalty
+        penalty_match = re.search(r"(?:Early withdrawal|Box 2\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if penalty_match:
+            data["early_withdrawal_penalty"] = self._parse_money(penalty_match.group(1))
+
+        # Box 4: Federal income tax withheld
+        fed_tax_match = re.search(r"(?:Federal.*?tax.*?withheld|Box 4\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if fed_tax_match:
+            data["federal_income_tax_withheld"] = self._parse_money(fed_tax_match.group(1))
+
+        return data
+
+    def _parse_1099_div_text(self, text: str) -> dict[str, Any]:
+        """Parse 1099-DIV form from extracted text.
+
+        Args:
+            text: Plain text extracted from the form.
+
+        Returns:
+            Dict with "total_ordinary_dividends", "qualified_dividends" and
+            "total_capital_gain_distributions" for each box whose label was
+            found; unmatched boxes are omitted.
+        """
+        data: dict[str, Any] = {}
+
+        # Box 1a: Total ordinary dividends
+        dividends_match = re.search(r"(?:Total ordinary dividends|Box 1a\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if dividends_match:
+            data["total_ordinary_dividends"] = self._parse_money(dividends_match.group(1))
+
+        # Box 1b: Qualified dividends
+        qualified_match = re.search(r"(?:Qualified dividends|Box 1b\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if qualified_match:
+            data["qualified_dividends"] = self._parse_money(qualified_match.group(1))
+
+        # Box 2a: Total capital gain distributions
+        cap_gains_match = re.search(r"(?:Total capital gain|Box 2a\b).*?(\d+(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if cap_gains_match:
+            data["total_capital_gain_distributions"] = self._parse_money(cap_gains_match.group(1))
+
+        return data
+
+    def _parse_1099_b_text(self, text: str) -> dict[str, Any]:
+        """Parse 1099-B form from extracted text."""
+        data: dict[str, Any] = {}
+
+        # Box 1d: Proceeds
+        proceeds_match = re.search(r"(?:Proceeds|Box 1d).*?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE)
+        if proceeds_match:
+            data["proceeds"] = self._parse_money(proceeds_match.group(1))
+
+        # Box 1e: Cost or other basis
+        basis_match = re.search(r"(?:Cost.*?basis|Box
1e).*?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)", text, re.IGNORECASE) + if basis_match: + data["cost_basis"] = self._parse_money(basis_match.group(1)) + + # Short-term vs long-term + data["short_term"] = bool(re.search(r"short.?term", text, re.IGNORECASE)) + data["long_term"] = bool(re.search(r"long.?term", text, re.IGNORECASE)) + + return data + + def _parse_money(self, money_str: str) -> Decimal: + """Parse money string to Decimal.""" + # Remove $, spaces, commas + clean = re.sub(r"[$,\s]", "", money_str) + return Decimal(clean) + + def _calculate_confidence_scores(self, data: dict[str, Any]) -> dict[str, float]: + """Calculate confidence scores for extracted fields. + + For structured/unstructured extraction, confidence is based on: + - Field population (present vs missing) + - Format validation (valid Decimal, SSN format, etc.) + """ + scores: dict[str, float] = {} + + for field, value in data.items(): + if value is None: + scores[field] = 0.0 + elif isinstance(value, Decimal): + # Money fields: high confidence if non-zero + scores[field] = 0.95 if value > 0 else 0.5 + elif isinstance(value, str): + # String fields: high confidence if non-empty + scores[field] = 0.90 if value else 0.0 + elif isinstance(value, bool): + # Boolean fields: medium confidence + scores[field] = 0.75 + else: + scores[field] = 0.85 # Default confidence + + return scores + + def _meets_confidence_threshold(self, scores: dict[str, float]) -> bool: + """Check if confidence scores meet threshold.""" + if not scores: + return False + + avg_score = sum(scores.values()) / len(scores) + return avg_score >= self.CONFIDENCE_THRESHOLD diff --git a/financegpt_backend/app/schemas/tax_forms.py b/financegpt_backend/app/schemas/tax_forms.py new file mode 100644 index 0000000..1dd3138 --- /dev/null +++ b/financegpt_backend/app/schemas/tax_forms.py @@ -0,0 +1,308 @@ +"""Pydantic schemas for tax forms.""" + +from datetime import date, datetime +from decimal import Decimal +from typing import Any, Literal, 
class TaxFormResponse(TaxFormBase):
    """Response schema for tax form.

    Extends ``TaxFormBase`` (form type and tax year) with identifiers,
    timestamps and processing/extraction status fields.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    user_id: UUID
    search_space_id: int
    document_id: Optional[int] = None
    uploaded_at: datetime
    processed_at: Optional[datetime] = None
    processing_status: Literal["pending", "processing", "completed", "failed", "needs_review"]
    extraction_method: Optional[str] = None
    confidence_score: Optional[Decimal] = None
    needs_review: bool = False
    created_at: datetime
    updated_at: datetime
class W2FormResponse(W2FormBase):
    """Response schema for W2 form.

    Extends ``W2FormBase`` with record identifiers, the masked employee name,
    the employee SSN hash, per-field confidence scores and timestamps.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    tax_form_id: UUID
    employee_name_masked: Optional[str] = None
    employee_ssn_hash: Optional[str] = None
    field_confidence_scores: Optional[dict[str, float]] = None
    created_at: datetime
    updated_at: datetime
class Form1099MiscResponse(Form1099MiscBase):
    """Response schema for 1099-MISC form.

    Extends ``Form1099MiscBase`` with record identifiers, the recipient TIN
    hash, per-field confidence scores and timestamps.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    tax_form_id: UUID
    recipient_tin_hash: Optional[str] = None
    field_confidence_scores: Optional[dict[str, float]] = None
    created_at: datetime
    updated_at: datetime
class Form1099IntResponse(Form1099IntBase):
    """Response schema for 1099-INT form.

    Extends ``Form1099IntBase`` with record identifiers, per-field confidence
    scores and timestamps.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    tax_form_id: UUID
    field_confidence_scores: Optional[dict[str, float]] = None
    created_at: datetime
    updated_at: datetime
class Form1099DivResponse(Form1099DivBase):
    """Response schema for 1099-DIV form.

    Extends ``Form1099DivBase`` with record identifiers, per-field confidence
    scores and timestamps.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    tax_form_id: UUID
    field_confidence_scores: Optional[dict[str, float]] = None
    created_at: datetime
    updated_at: datetime
class Form1099BResponse(Form1099BBase):
    """Response schema for 1099-B form.

    Extends ``Form1099BBase`` with record identifiers, per-field confidence
    scores and timestamps.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``from_attributes`` allows validation from attribute-bearing
    # objects rather than only from dicts.
    model_config = {"from_attributes": True}

    id: UUID
    tax_form_id: UUID
    field_confidence_scores: Optional[dict[str, float]] = None
    created_at: datetime
    updated_at: datetime
+ + Args: + session: Database session + document: The uploaded document + filename: Original filename + user_id: User ID + search_space_id: Search space ID + """ + from uuid import UUID + from app.db import TaxForm + from app.parsers.tax_form_parser import TaxFormParser + from app.utils.pii_masking import prepare_tax_form_for_storage + from app.schemas.tax_forms import ( + W2FormCreate, + Form1099MiscCreate, + Form1099IntCreate, + Form1099DivCreate, + Form1099BCreate, + ) + import re + + try: + # Only process PDFs + if not filename.lower().endswith('.pdf'): + return + + # Detect tax form type from filename + filename_lower = filename.lower() + form_type = None + tax_year = None + + # Extract year from filename (e.g., "w2_2024.pdf", "2024_w2.pdf") + year_match = re.search(r'(20\d{2})', filename) + if year_match: + tax_year = int(year_match.group(1)) + + # Detect form type + if 'w2' in filename_lower or 'w-2' in filename_lower: + form_type = 'W2' + if not tax_year: + tax_year = 2024 # Default to current tax year + elif '1099' in filename_lower: + if 'misc' in filename_lower: + form_type = '1099-MISC' + elif 'int' in filename_lower: + form_type = '1099-INT' + elif 'div' in filename_lower: + form_type = '1099-DIV' + elif 'b' in filename_lower: + form_type = '1099-B' + else: + # Generic 1099, try to detect from content later + form_type = '1099-MISC' # Default + + if not tax_year: + tax_year = 2024 + + # If not a tax form, return early + if not form_type: + return + + logger.info(f"Detected tax form: {form_type} for year {tax_year} in file {filename}") + + # Create tax_form record + tax_form = TaxForm( + user_id=UUID(user_id), + search_space_id=search_space_id, + form_type=form_type, + tax_year=tax_year, + document_id=document.id, + processing_status='pending', + ) + session.add(tax_form) + await session.commit() + await session.refresh(tax_form) + + logger.info(f"Created tax form record with ID {tax_form.id}, starting parsing...") + + # TODO: Trigger async parsing task 
+ # For now, log that parsing would happen + # In production, this would be a Celery task + logger.info( + f"Tax form parsing would be triggered here for {form_type} (tax_form_id={tax_form.id}). " + f"Parser integration pending." + ) + + # Update status to show it's queued for processing + tax_form.processing_status = 'processing' + await session.commit() + + except Exception as e: + logger.error(f"Error processing tax form for {filename}: {e}") + # Don't fail the document upload if tax parsing fails + await session.rollback() + + async def _save_investment_holdings( session: AsyncSession, user_id: str, @@ -522,6 +630,11 @@ async def add_received_file_document_using_unstructured( await session.commit() await session.refresh(document) + # After successful document creation, check if this is a tax form + await _process_tax_form_if_applicable( + session, document, file_name, user_id, search_space_id + ) + return document except SQLAlchemyError as db_error: await session.rollback() @@ -661,6 +774,11 @@ async def add_received_file_document_using_llamacloud( await session.commit() await session.refresh(document) + # After successful document creation, check if this is a tax form + await _process_tax_form_if_applicable( + session, document, file_name, user_id, search_space_id + ) + return document except SQLAlchemyError as db_error: await session.rollback() diff --git a/financegpt_backend/app/utils/pii_masking.py b/financegpt_backend/app/utils/pii_masking.py new file mode 100644 index 0000000..97f8286 --- /dev/null +++ b/financegpt_backend/app/utils/pii_masking.py @@ -0,0 +1,287 @@ +"""PII masking utilities for tax forms. + +This module provides functions to mask personally identifiable information (PII) +before sending tax form data to LLMs or external services. +""" + +import hashlib +import re +from typing import Any + + +def mask_ssn(ssn: str | None, keep_last: int = 4) -> str: + """Mask SSN, keeping only the last N digits. 
+ + Args: + ssn: Social Security Number (any format: 123-45-6789, 123456789, etc.) + keep_last: Number of digits to keep unmasked (default 4) + + Returns: + Masked SSN like "***-**-6789" or "[SSN_REDACTED]" if invalid + + Examples: + >>> mask_ssn("123-45-6789") + "***-**-6789" + >>> mask_ssn("123456789") + "*****6789" + >>> mask_ssn("invalid") + "[SSN_REDACTED]" + """ + if not ssn: + return "[SSN_REDACTED]" + + # Remove all non-digit characters + digits_only = re.sub(r'\D', '', ssn) + + # SSN must be exactly 9 digits + if len(digits_only) != 9: + return "[SSN_REDACTED]" + + # Determine format based on original string + if '-' in ssn: + # Format: 123-45-6789 -> ***-**-6789 + last_digits = digits_only[-keep_last:] + return f"***-**-{last_digits}" + else: + # Format: 123456789 -> *****6789 + last_digits = digits_only[-keep_last:] + mask_count = 9 - keep_last + return ('*' * mask_count) + last_digits + + +def hash_tin(tin: str | None) -> str: + """Hash Tax Identification Number (SSN or EIN) using SHA-256. + + Args: + tin: SSN or EIN to hash + + Returns: + SHA-256 hash of the TIN (64 hex characters) + + Examples: + >>> hash_tin("123-45-6789") + "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" + """ + if not tin: + return "" + + # Remove all non-digit characters for consistent hashing + digits_only = re.sub(r'\D', '', tin) + + # Hash using SHA-256 + return hashlib.sha256(digits_only.encode('utf-8')).hexdigest() + + +def mask_ein(ein: str | None) -> str: + """Mask Employer Identification Number. + + Args: + ein: EIN in format XX-XXXXXXX + + Returns: + Hashed EIN (never shows plaintext for privacy) + + Examples: + >>> mask_ein("12-3456789") + (returns SHA-256 hash) + """ + return hash_tin(ein) + + +def mask_name(name: str | None, replacement: str = "[NAME_REDACTED]") -> str: + """Mask a person's name. 
+ + Args: + name: Full name to mask + replacement: Replacement text (default: "[NAME_REDACTED]") + + Returns: + Replacement text + + Examples: + >>> mask_name("John Smith") + "[NAME_REDACTED]" + >>> mask_name("John Smith", "[EMPLOYEE]") + "[EMPLOYEE]" + """ + if not name: + return replacement + + return replacement + + +def mask_address(address: str | None, replacement: str = "[ADDRESS_REDACTED]") -> str: + """Mask a full address. + + Args: + address: Full address to mask + replacement: Replacement text (default: "[ADDRESS_REDACTED]") + + Returns: + Replacement text + + Examples: + >>> mask_address("123 Main St, New York, NY 10001") + "[ADDRESS_REDACTED]" + """ + if not address: + return replacement + + return replacement + + +def mask_tax_form_for_llm(form_data: dict[str, Any], form_type: str) -> dict[str, Any]: + """Mask all PII in a tax form before sending to LLM. + + This function removes or masks: + - SSN (keep last 4 for display) + - EIN (hash completely) + - Names (replace with placeholders) + - Addresses (replace with placeholders) + + Financial data (wages, taxes, etc.) is NOT masked. + + Args: + form_data: Dictionary containing tax form data + form_type: Type of form (W2, 1099-MISC, etc.) + + Returns: + Dictionary with PII masked + + Examples: + >>> w2_data = { + ... "employee_ssn": "123-45-6789", + ... "employer_ein": "12-3456789", + ... "employee_name": "John Smith", + ... "wages": 75000.00, + ... 
} + >>> masked = mask_tax_form_for_llm(w2_data, "W2") + >>> masked["employee_ssn"] + "***-**-6789" + >>> masked["wages"] + 75000.0 + """ + masked_data = form_data.copy() + + # Mask SSNs (keep last 4 for context) + if "employee_ssn" in masked_data: + masked_data["employee_ssn"] = mask_ssn(masked_data["employee_ssn"]) + + if "recipient_ssn" in masked_data: + masked_data["recipient_ssn"] = mask_ssn(masked_data["recipient_ssn"]) + + # Hash EINs (never show plaintext) + if "employer_ein" in masked_data: + masked_data["employer_ein_hash"] = hash_tin(masked_data["employer_ein"]) + del masked_data["employer_ein"] + + if "payer_tin" in masked_data: + masked_data["payer_tin_hash"] = hash_tin(masked_data["payer_tin"]) + del masked_data["payer_tin"] + + if "recipient_tin" in masked_data: + masked_data["recipient_tin_hash"] = hash_tin(masked_data["recipient_tin"]) + del masked_data["recipient_tin"] + + # Mask names + if "employee_name" in masked_data: + masked_data["employee_name"] = mask_name(masked_data["employee_name"], "[EMPLOYEE_NAME]") + + if "employer_name" in masked_data: + # Keep employer name for context (helps LLM understand employment) + # But could mask if user prefers + pass + + if "payer_name" in masked_data: + # Keep payer name (e.g., "Vanguard", "Chase Bank") - useful for context + pass + + # Mask addresses + if "employee_address" in masked_data: + masked_data["employee_address"] = mask_address(masked_data["employee_address"]) + + if "employer_address" in masked_data: + masked_data["employer_address"] = mask_address(masked_data["employer_address"]) + + if "payer_address" in masked_data: + masked_data["payer_address"] = mask_address(masked_data["payer_address"]) + + # Financial data is NOT masked - it's needed for analysis + # This includes: wages, taxes withheld, interest income, dividends, etc. 
def prepare_tax_form_for_storage(form_data: dict[str, Any]) -> dict[str, Any]:
    """Prepare tax form data for database storage with proper hashing.

    Works on a shallow copy; ``form_data`` itself is not modified.

    This function:
    - Replaces each plaintext TIN key (``employee_ssn``, ``employer_ein``,
      ``payer_tin``, ``recipient_tin``, ``recipient_ssn``) with a ``*_hash``
      key produced by ``hash_tin``; the plaintext key is removed
    - Replaces ``employee_name`` with ``employee_name_masked`` set to the
      fixed placeholder "[EMPLOYEE_NAME]"
    - Leaves every other key (financial data, employer/payer names, ...) intact

    Args:
        form_data: Dictionary containing raw tax form data

    Returns:
        Dictionary ready for database insertion
    """
    storage_data = form_data.copy()

    # Hash all TINs for storage (never store plaintext)
    hashed_tin_keys = (
        ("employee_ssn", "employee_ssn_hash"),
        ("employer_ein", "employer_ein_hash"),
        ("payer_tin", "payer_tin_hash"),
        ("recipient_tin", "recipient_tin_hash"),
    )
    for plain_key, hash_key in hashed_tin_keys:
        if plain_key in storage_data:
            storage_data[hash_key] = hash_tin(storage_data.pop(plain_key))

    # "recipient_ssn" is treated as PII by mask_tax_form_for_llm but was
    # previously passed through to storage in plaintext. Hash it into the
    # recipient TIN hash slot, without overriding a hash that is already there
    # (e.g. one just derived from "recipient_tin").
    if "recipient_ssn" in storage_data:
        recipient_ssn = storage_data.pop("recipient_ssn")
        storage_data.setdefault("recipient_tin_hash", hash_tin(recipient_ssn))

    # Mask employee name for storage
    if "employee_name" in storage_data:
        del storage_data["employee_name"]
        storage_data["employee_name_masked"] = "[EMPLOYEE_NAME]"

    # Keep employer/payer names (useful for queries)
    # Keep financial data (wages, taxes, etc.)

    return storage_data
# Print an error message, prefixed with a red cross mark, to stderr.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text (printed literally)
# Outputs:   one line on stderr
print_error() {
    # %b expands the backslash escapes stored in the colour variables, while
    # %s prints the message verbatim so backslashes in it are not interpreted.
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    printf '%b✗%b %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2
}
# Restart the FinanceGPT containers in place.
# Globals:   COMPOSE_FILE (read)
# Outputs:   banner, progress line and a final status line on stdout
restart() {
    local -a compose_restart=(docker compose -f "$COMPOSE_FILE" restart)

    print_banner
    printf '%s\n' "Restarting FinanceGPT..."
    "${compose_restart[@]}"
    print_status "FinanceGPT restarted."
}
+ elif [ "$HEALTH" = "starting" ]; then + print_warning "FinanceGPT is still starting up..." + else + print_warning "Health status: $HEALTH" + fi + else + print_warning "FinanceGPT is not running." + fi +} + +update() { + print_banner + echo "Updating FinanceGPT..." + + echo "Pulling latest image..." + docker compose -f "$COMPOSE_FILE" pull + + echo "Restarting with new image..." + docker compose -f "$COMPOSE_FILE" up -d + + print_status "FinanceGPT updated!" +} + +clean() { + print_banner + print_warning "This will DELETE all FinanceGPT data!" + echo "" + read -p "Are you sure? Type 'yes' to confirm: " confirm + + if [ "$confirm" = "yes" ]; then + echo "Stopping and removing containers..." + docker compose -f "$COMPOSE_FILE" down -v + + echo "Removing data volume..." + docker volume rm financegpt-data 2>/dev/null || true + + print_status "All FinanceGPT data has been removed." + else + echo "Cancelled." + fi +} + +# Main command handler +case "${1:-start}" in + start) + start + ;; + stop) + stop + ;; + restart) + restart + ;; + logs) + logs + ;; + status) + status + ;; + update) + update + ;; + clean) + clean + ;; + *) + echo "Usage: $0 {start|stop|restart|logs|status|update|clean}" + echo "" + echo "Commands:" + echo " start - Start FinanceGPT (default)" + echo " stop - Stop FinanceGPT" + echo " restart - Restart FinanceGPT" + echo " logs - Show logs (follow mode)" + echo " status - Show container status" + echo " update - Pull latest image and restart" + echo " clean - Stop and remove all data (DESTRUCTIVE!)" + exit 1 + ;; +esac diff --git a/scripts/docker/init-postgres.sh b/scripts/docker/init-postgres.sh index 4d9c66d..bd8acb1 100644 --- a/scripts/docker/init-postgres.sh +++ b/scripts/docker/init-postgres.sh @@ -59,17 +59,16 @@ CREATE USER $POSTGRES_USER WITH PASSWORD '$POSTGRES_PASSWORD' SUPERUSER; CREATE DATABASE $POSTGRES_DB OWNER $POSTGRES_USER; \c $POSTGRES_DB CREATE EXTENSION IF NOT EXISTS vector; - --- Create Electric SQL replication user -CREATE USER 
$ELECTRIC_DB_USER WITH REPLICATION PASSWORD '$ELECTRIC_DB_PASSWORD'; -GRANT CONNECT ON DATABASE $POSTGRES_DB TO $ELECTRIC_DB_USER; -GRANT USAGE ON SCHEMA public TO $ELECTRIC_DB_USER; -GRANT SELECT ON ALL TABLES IN SCHEMA public TO $ELECTRIC_DB_USER; -GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO $ELECTRIC_DB_USER; -ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO $ELECTRIC_DB_USER; -ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO $ELECTRIC_DB_USER; EOF +# Run the same Electric SQL setup script used by local docker-compose +# This ensures both setups are identical +export POSTGRES_USER +export POSTGRES_DB +export ELECTRIC_DB_USER +export ELECTRIC_DB_PASSWORD +/app/init-electric-user.sh + echo "PostgreSQL initialized successfully." # Stop PostgreSQL (supervisor will start it) diff --git a/start-financegpt.sh b/start-financegpt.sh deleted file mode 100755 index 94b8b44..0000000 --- a/start-financegpt.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -# FinanceGPT - Quick Start Script - -echo "==================================" -echo "FinanceGPT - Quick Start" -echo "==================================" -echo "" - -# Check if Docker is running -if ! docker info > /dev/null 2>&1; then - echo "❌ Docker is not running. Please start Docker Desktop first." - exit 1 -fi - -echo "✓ Docker is running" -echo "" - -# Check if we're in the right directory -if [ ! -f "docker-compose.quickstart.yml" ]; then - echo "❌ Error: docker-compose.quickstart.yml not found" - echo " Please run this script from the FinanceGPT root directory" - exit 1 -fi - -echo "Starting FinanceGPT with Docker Compose..." -echo "" - -# Pull latest images -echo "📥 Pulling latest images..." -docker compose -f docker-compose.quickstart.yml pull - -echo "" -echo "🚀 Starting services..." -docker compose -f docker-compose.quickstart.yml up -d - -echo "" -echo "⏳ Waiting for services to be ready..." 
-sleep 5 - -# Check if containers are running -if docker ps | grep -q financegpt; then - echo "" - echo "==================================" - echo "✅ FinanceGPT is running!" - echo "==================================" - echo "" - echo "🌐 Frontend: http://localhost:3000" - echo "🔧 Backend API: http://localhost:8000" - echo "📊 API Docs: http://localhost:8000/docs" - echo "" - echo "📝 To upload financial statements:" - echo " 1. Go to http://localhost:3000" - echo " 2. Create an account / Log in" - echo " 3. Upload your CSV/OFX files from Chase, Fidelity, etc." - echo " 4. Ask questions about your finances!" - echo "" - echo "🛑 To stop: docker compose -f docker-compose.quickstart.yml down" - echo "📋 View logs: docker compose -f docker-compose.quickstart.yml logs -f" - echo "" -else - echo "" - echo "❌ Failed to start FinanceGPT" - echo " Check logs with: docker compose -f docker-compose.quickstart.yml logs" - exit 1 -fi