UCSolarCarTeam · VyapakBansal · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml
@@ -0,0 +1,38 @@
+name: Scrape & Ingest Pipeline
+
+# Installs scripts/requirements.txt and runs scripts/main.py against the
+# database referenced by the DATABASE_URL secret.
+on:
+  schedule:
+    - cron: "0 0 1 * *" # monthly on the 1st at 00:00 UTC (GitHub cron schedules are in UTC)
+  workflow_dispatch: # manual trigger
+
+# The job only needs to read the repository; keep the token least-privilege.
+permissions:
+  contents: read
+
+# The ingest step deletes and re-inserts every row of the documents table, so a
+# manual run overlapping the scheduled one could interleave and leave duplicate
+# rows. Queue runs instead of letting them execute in parallel.
+concurrency:
+  group: scrape-ingest-pipeline
+  cancel-in-progress: false
+
+jobs:
+  run-pipeline:
+    runs-on: ubuntu-latest
+    # Fail a hung scrape or model download instead of burning the 6 h default.
+    timeout-minutes: 60
+    defaults:
+      run:
+        working-directory: scripts
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          # hashFiles paths are relative to the workspace root, not working-directory
+          key: ${{ runner.os }}-pip-${{ hashFiles('scripts/requirements.txt') }}
+          # Reuse the previous wheel cache when requirements.txt changes
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Cache sentence-transformers model
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          # Static key: bump it by hand if the model name in ingest.py changes
+          key: ${{ runner.os }}-hf-all-MiniLM-L6-v2
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Run pipeline
+        env:
+          DATABASE_URL: ${{ secrets.DATABASE_URL }}
+        run: python main.py
diff --git a/.gitignore b/.gitignore
@@ -46,4 +46,6 @@ yarn-error.log*
 .idea
 
 # clerk configuration (can include secrets)
-/.clerk/
+/.clerk/
+*.pyc
+.vscode/settings.json
diff --git a/package.json b/package.json
@@ -21,6 +21,8 @@
   "dependencies": {
     "@clerk/nextjs": "^7.3.5",
     "@flags-sdk/vercel": "1.3.0",
+    "@ai-sdk/openai": "^3.0.63",
+    "@ai-sdk/react": "^3.0.179",
     "@prisma/client": "^6.4.1",
     "@supabase/supabase-js": "^2.45.4",
     "@t3-oss/env-nextjs": "^0.10.1",
@@ -31,6 +33,7 @@
     "@trpc/react-query": "^11.0.0-rc.446",
     "@trpc/server": "^11.0.0-rc.446",
     "@vercel/analytics": "^2.0.1",
+    "ai": "^6.0.177",
     "axios": "^1.7.9",
     "browser-image-compression": "^2.0.2",
     "classnames": "^2.5.1",

diff --git a/prisma/schema.prisma b/prisma/schema.prisma
@@ -1,11 +1,13 @@
 generator client {
-  provider = "prisma-client-js"
+  provider        = "prisma-client-js"
+  // Preview feature needed for the `extensions` list in the datasource block.
+  previewFeatures = ["postgresqlExtensions"]
 }
 
 datasource db {
-  provider  = "postgresql"
-  url       = env("DATABASE_URL")
-  directUrl = env("DIRECT_URL")
+  provider   = "postgresql"
+  url        = env("DATABASE_URL")
+  directUrl  = env("DIRECT_URL")
+  // `vector` extension backs the vector(384) column on the Document model.
+  extensions = [vector]
 }
 
 model User {
@@ -127,3 +129,13 @@ enum AllTeamRoles {
   // multi team
   MultiTeam @map("Multi Team")
 }
+
+// One embedded text chunk, stored in the `documents` table that
+// scripts/ingest.py clears and refills on every run.
+model Document {
+  id        BigInt                 @id @default(autoincrement())
+  // Chunk text that was embedded.
+  content   String?
+  // JSON object; scripts/ingest.py writes `source` and `chunk_id` keys.
+  metadata  Json?
+  // 384 dimensions to match the all-MiniLM-L6-v2 model used in scripts/ingest.py.
+  // Declared as Unsupported, so the column is written with raw SQL there.
+  embedding Unsupported("vector(384)")?
+
+  @@map("documents")
+}
+
diff --git a/public/assets/HeliosSideview.png b/public/assets/HeliosSideview.png
diff --git a/public/assets/Logo.png b/public/assets/Logo.png
diff --git a/scripts/clean.py b/scripts/clean.py
@@ -0,0 +1,144 @@
+import re
+
+# File locations for a file-based run. Neither constant is referenced in this
+# module. NOTE(review): presumably left over from an earlier version that read
+# and wrote JSON files - confirm before removing.
+INPUT_PATH = "/tmp/documents.json"
+OUTPUT_PATH = "/tmp/documents_clean.json"
+
+# --- Patterns to strip ---
+
+# Image markdown: ![alt](url)
+IMAGE_PATTERN = re.compile(r'!\[.*?\]\(.*?\)')
+
+# Inline links: [text](url) -> keep just the text
+# The URL part stops at the first ")", so a URL that itself contains
+# parentheses leaves its tail behind.
+INLINE_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\([^\)]+\)')
+
+# Reference-style links: [text][ref]
+# Bare URLs in angle brackets (<https://...>) are NOT matched by this pattern.
+REF_LINK_PATTERN = re.compile(r'\[.*?\]\[.*?\]')
+
+# The repeated footer block that appears in every Calgary Solar Car page.
+# strip_footer tries these in list order and cuts at the first one it finds.
+# NOTE(review): the copyright marker hard-codes the year 2026 and will stop
+# matching once the site footer shows a different year.
+FOOTER_MARKERS = [
+    "Follow us on our Social Media",
+    "Contact Information",
+    "communications@calgarysolarcar.ca",
+    "sponsorship@calgarysolarcar.ca",
+    "ENC 36, Schulich School of Engineering",
+    "© 2026 Calgary Solar Car",
+]
+
+# Wikipedia boilerplate sections to drop entirely (these appear as headings).
+# Matched as plain substrings, so only level-2 "## " headings are listed here.
+WIKIPEDIA_DROP_SECTIONS = [
+    "## References",
+    "## External links",
+    "## See also",
+]
+
+# Wikipedia navigation tables that are pure link noise
+# These large navbox tables start with "| [Photovoltaics]" or "| [Energy]" etc.
+# With re.DOTALL the trailing ".*" runs to the end of the text, so everything
+# after the first navbox row is removed as well.
+NAVBOX_PATTERN = re.compile(
+    r'\| \[(?:Photovoltaics|Energy|Electric vehicles|Alternative fuel vehicles|'
+    r'The Sun|Natural resources|World Solar Challenge|American Solar Challenge|'
+    r'Formula Sun Grand Prix|University of Calgary)\].*',
+    re.DOTALL
+)
+
+# Wikipedia citation noise: [1], [\1], [^1], \[1\], etc.
+# The optional leading backslash is required for the escaped form: without it
+# \[1\] was matched only from the "[" onward and a stray "\" stayed in the text.
+CITATION_PATTERN = re.compile(r'\\?\[\\?\^?\\?\d+\\?\]')
+
+# Escaped brackets from markdown conversion (\[ and \]); removed outright.
+ESCAPED_BRACKET_PATTERN = re.compile(r'\\[\[\]]')
+
+
+def strip_footer(text: str) -> str:
+    """Cut a Calgary Solar Car page off at its repeated footer.
+
+    Markers are tried in FOOTER_MARKERS order; the first marker that occurs
+    in the text decides the cut, and everything from its first occurrence
+    onward is dropped. Text containing no marker is returned unchanged.
+    """
+    for marker in FOOTER_MARKERS:
+        body, hit, _footer = text.partition(marker)
+        if hit:
+            # Drop the whitespace that separated the body from the footer
+            return body.rstrip()
+    return text
+
+
+def strip_wikipedia_boilerplate(text: str) -> str:
+    """Remove References, External links, See also sections and navboxes.
+
+    The text is truncated at whichever heading from WIKIPEDIA_DROP_SECTIONS
+    appears earliest, so all trailing boilerplate sections go regardless of
+    the order they are listed in. Any navbox table that remains is removed
+    afterwards.
+    """
+    # Cutting at the earliest heading matters: stopping at the first *listed*
+    # heading kept a "## See also" section that sits above "## References".
+    cut_points = [
+        pos for pos in map(text.find, WIKIPEDIA_DROP_SECTIONS) if pos != -1
+    ]
+    if cut_points:
+        text = text[:min(cut_points)].rstrip()
+
+    # Remove navbox tables (large repeated link blocks at the end)
+    return NAVBOX_PATTERN.sub("", text)
+
+
+def clean_text(text: str, source: str) -> str:
+    """Strip markdown noise and source-specific boilerplate from one document.
+
+    Args:
+        text: Markdown content of the document.
+        source: Origin of the document; substring checks against it select
+            the site-specific cleanup steps.
+
+    Returns:
+        The cleaned text with surrounding whitespace removed.
+    """
+    # Generic noise first: images, then citations like [1], [^2], \[3\],
+    # then whatever escaped brackets are left over.
+    for noise in (IMAGE_PATTERN, CITATION_PATTERN, ESCAPED_BRACKET_PATTERN):
+        text = noise.sub("", text)
+
+    # Team site pages share one footer block
+    if "calgarysolarcar.ca" in source:
+        text = strip_footer(text)
+
+    # Wikipedia: drop trailing boilerplate, flatten inline links to their
+    # visible text, then remove reference-style links
+    if "wikipedia.org" in source:
+        trimmed = strip_wikipedia_boilerplate(text)
+        unlinked = INLINE_LINK_PATTERN.sub(r'\1', trimmed)
+        text = REF_LINK_PATTERN.sub("", unlinked)
+
+    # Runs of three or more newlines become a single blank line
+    collapsed = re.sub(r'\n{3,}', '\n\n', text)
+    return collapsed.strip()
+
+
+def is_empty_doc(text: str, min_chars: int = 100) -> bool:
+    """Return True if the document has no meaningful content after cleaning.
+
+    Args:
+        text: Cleaned document text.
+        min_chars: Minimum number of characters that must remain, once
+            markup characters and whitespace are ignored, for the document
+            to count as non-empty. Defaults to 100.
+    """
+    # Only whitespace and the markdown structure characters # - * _ > | are
+    # ignored; letters, digits and all other punctuation still count.
+    stripped = re.sub(r'[#\s\-\*_>|]', '', text)
+    return len(stripped) < min_chars
+
+
+def clean(docs):
+    """Clean every scraped document and drop the ones left without content.
+
+    Args:
+        docs: Sized iterable of dicts with optional "id", "source" and
+            "content" keys.
+
+    Returns:
+        A list of dicts with "id", "content" (cleaned) and "source" keys, one
+        per document that still has meaningful content after cleaning.
+    """
+    print(f"Cleaning {len(docs)} documents.")
+
+    cleaned_docs = []
+    skipped = []
+
+    for doc in docs:
+        # `or ""` also covers keys that are present but null, which would
+        # otherwise crash inside clean_text; such documents are skipped as
+        # empty below.
+        source = doc.get("source") or ""
+        original_text = doc.get("content") or ""
+        cleaned_text = clean_text(original_text, source)
+
+        if is_empty_doc(cleaned_text):
+            skipped.append(source)
+            print(f"  SKIPPED (empty after cleaning): {source}")
+            continue
+
+        original_len = len(original_text)
+        cleaned_len = len(cleaned_text)
+        # Guard the division for documents whose original text was empty
+        reduction = 100 * (1 - cleaned_len / original_len) if original_len > 0 else 0
+
+        print(f"  OK: {source}")
+        print(f"      {original_len:,} chars -> {cleaned_len:,} chars ({reduction:.1f}% reduction)")
+
+        cleaned_docs.append({
+            "id": doc.get("id"),
+            "content": cleaned_text,
+            "source": source,
+        })
+
+    print(f"\nDone. {len(cleaned_docs)} documents cleaned.")
+    if skipped:
+        print(f"Skipped {len(skipped)} empty documents: {skipped}")
+
+    return cleaned_docs
+
+# Running this file directly is a no-op. NOTE(review): presumably clean() is
+# driven from main.py, which the workflow runs - confirm.
+if __name__ == "__main__":
+    pass
diff --git a/scripts/ingest.py b/scripts/ingest.py
@@ -0,0 +1,115 @@
+import json
+import os
+import sys
+from pathlib import Path
+
+# Attempt to load required libraries, guide user if not installed
+try:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+    from sentence_transformers import SentenceTransformer
+    import psycopg2
+    from dotenv import load_dotenv
+except ImportError as e:
+    print(f"Missing dependency: {e}")
+    print("Please install requirements: pip install langchain-text-splitters sentence-transformers psycopg2-binary python-dotenv")
+    sys.exit(1)
+
+# Load environment variables
+# For local runs they come from ".env.local" in the parent of this file's
+# directory. NOTE(review): load_dotenv does not override variables that are
+# already set by default, so a DATABASE_URL provided by CI should win - confirm.
+dotenv_path = Path(__file__).parent.parent / ".env.local"
+load_dotenv(dotenv_path)
+
+# Fail fast at import time: ingest() uses db_url to open its connection.
+db_url = os.environ.get("DATABASE_URL")
+if not db_url:
+    print("Error: DATABASE_URL is missing. It should be set as a GitHub Actions secret containing the Supabase connection string.")
+    sys.exit(1)
+
+
+def _chunk_documents(data):
+    """Split each document's content into overlapping chunks tagged with its source."""
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1024,
+        chunk_overlap=128,
+        separators=["\n\n", "\n", " ", ""]
+    )
+
+    chunks = []
+    for doc in data:
+        for i, split in enumerate(text_splitter.split_text(doc["content"])):
+            chunks.append({
+                "content": split,
+                "metadata": {
+                    "source": doc.get("source", "Unknown"),
+                    "chunk_id": f"{doc.get('id', 'doc')}_{i}"
+                }
+            })
+    return chunks
+
+
+def _member_chunks():
+    """Build one chunk per entry of ../members.json, or none if the file is absent."""
+    members_path = os.path.join(os.path.dirname(__file__), '..', 'members.json')
+    if not os.path.exists(members_path):
+        return []
+
+    print("Processing members data...")
+    with open(members_path, "r", encoding="utf-8") as f:
+        members_data = json.load(f)
+
+    chunks = []
+    for i, member in enumerate(members_data):
+        name = f"{member.get('firstName', '')} {member.get('lastName', '')}".strip()
+        role = member.get('teamRole', 'Unknown Role')
+        study = member.get('fieldOfStudy', 'Unknown Field')
+        year = member.get('schoolYear', '')
+        joined = member.get('yearJoined', '')
+        # Contact details are deliberately never copied into the vector store
+        email = "redacted"
+        about = member.get('about', '')
+        linkedin = "redacted"
+
+        content = f"Team Member: {name}\nRole: {role}\nField of Study: {study} (Year: {year})\nJoined Team in: {joined}\nContact: {email}\nAbout: {about}\nLinkedIn: {linkedin}"
+
+        chunks.append({
+            "content": content,
+            "metadata": {
+                "source": "members.json",
+                "chunk_id": f"member_{i}"
+            }
+        })
+    return chunks
+
+
+def _store_chunks(chunks, embeddings):
+    """Replace the contents of the documents table with the given chunks in one transaction."""
+    conn = None
+    try:
+        conn = psycopg2.connect(db_url)
+        # `with conn` commits when the block succeeds and rolls back when it
+        # raises, so the DELETE and the INSERTs are applied together or not at all.
+        with conn, conn.cursor() as cur:
+            # Clear old data so re-runs don't duplicate
+            cur.execute("DELETE FROM documents")
+
+            for chunk, embedding in zip(chunks, embeddings):
+                # The vector is sent in its text form, e.g. "[0.1, 0.2, ...]"
+                cur.execute(
+                    "INSERT INTO documents (content, metadata, embedding) VALUES (%s, %s, %s)",
+                    (chunk["content"], json.dumps(chunk["metadata"]), str(embedding.tolist()))
+                )
+        print(f"Success! {len(chunks)} documents stored in local PostgreSQL.")
+    except Exception as e:
+        print(f"Database operation failed: {e}")
+        # Bare raise keeps the original traceback intact
+        raise
+    finally:
+        # `with conn` ends the transaction but does not close the connection
+        if conn is not None:
+            conn.close()
+
+
+def ingest(data):
+    """Chunk, embed and store documents in the `documents` table.
+
+    Args:
+        data: Iterable of dicts with a "content" key and optional "source"
+            and "id" keys. Entries of ../members.json, when that file exists,
+            are added as extra chunks.
+
+    Raises:
+        Exception: Any error from the database layer is printed and re-raised
+            after the transaction has been rolled back.
+
+    If there is nothing to ingest, the function returns before touching the
+    database, so an empty or failed scrape cannot wipe the existing table.
+    """
+    # 1. Text Splitter
+    print("Splitting text...")
+    chunks = _chunk_documents(data)
+
+    # Process members.json if it exists
+    chunks.extend(_member_chunks())
+
+    print(f"Total chunks created: {len(chunks)}")
+    if not chunks:
+        print("No chunks to ingest; leaving the existing documents table untouched.")
+        return
+
+    # 2. Generate Embeddings locally (no API key needed)
+    print("Loading embedding model (all-MiniLM-L6-v2: 384 dimensions)...")
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+
+    texts = [chunk["content"] for chunk in chunks]
+    print("Computing embeddings...")
+    embeddings = model.encode(texts, show_progress_bar=True)
+
+    # 3. Store in local PostgreSQL
+    print("Storing vectors in local PostgreSQL...")
+    _store_chunks(chunks, embeddings)
+
+
+# Running this file directly is a no-op. NOTE(review): presumably ingest() is
+# driven from main.py, which the workflow runs - confirm.
+if __name__ == "__main__":
+    pass