Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Scheduled job that runs the Python scrape/clean/ingest pipeline found in
# the scripts/ directory and writes the results to the database referenced
# by the DATABASE_URL secret.
name: Scrape & Ingest Pipeline
on:
  schedule:
    # NOTE(review): GitHub evaluates cron schedules in UTC -- confirm that
    # is the intended timezone for "midnight".
    - cron: "0 0 1 * *" # monthly on the 1st at midnight
  workflow_dispatch: # manual trigger

jobs:
  run-pipeline:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run:` step below executes from scripts/, which is why
        # requirements.txt and main.py are referenced without a prefix.
        working-directory: scripts
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # Cache key changes whenever scripts/requirements.txt changes.
      # The hashFiles() path is written relative to the repository root,
      # not to the working-directory default above.
      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('scripts/requirements.txt') }}

      # The key is a constant per OS (it only names the model), so this
      # entry is reused until the key text itself is edited.
      - name: Cache sentence-transformers model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-hf-all-MiniLM-L6-v2

      - name: Install dependencies
        run: pip install -r requirements.txt

      # DATABASE_URL is exposed to this step only.
      - name: Run pipeline
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
        run: python main.py
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,6 @@ yarn-error.log*
.idea

# clerk configuration (can include secrets)
/.clerk/
/.clerk/
*.pyc
.vscode/settings.json
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"dependencies": {
"@clerk/nextjs": "^7.3.5",
"@flags-sdk/vercel": "1.3.0",
"@ai-sdk/openai": "^3.0.63",
"@ai-sdk/react": "^3.0.179",
"@prisma/client": "^6.4.1",
"@supabase/supabase-js": "^2.45.4",
"@t3-oss/env-nextjs": "^0.10.1",
Expand All @@ -31,6 +33,7 @@
"@trpc/react-query": "^11.0.0-rc.446",
"@trpc/server": "^11.0.0-rc.446",
"@vercel/analytics": "^2.0.1",
"ai": "^6.0.177",
"axios": "^1.7.9",
"browser-image-compression": "^2.0.2",
"classnames": "^2.5.1",
Expand Down
20 changes: 16 additions & 4 deletions prisma/schema.prisma
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
generator client {
provider = "prisma-client-js"
provider = "prisma-client-js"
previewFeatures = ["postgresqlExtensions"]
}

datasource db {
provider = "postgresql"
url = env("DATABASE_URL")
directUrl = env("DIRECT_URL")
provider = "postgresql"
url = env("DATABASE_URL")
directUrl = env("DIRECT_URL")
extensions = [vector]
}

model User {
Expand Down Expand Up @@ -127,3 +129,13 @@ enum AllTeamRoles {
// multi team
MultiTeam @map("Multi Team")
}

/// One stored text chunk with its metadata and embedding vector.
/// Rows live in the "documents" table, which scripts/ingest.py clears and
/// refills with raw SQL on every run.
model Document {
  id        BigInt  @id @default(autoincrement())
  content   String?
  metadata  Json?
  // 384 dimensions: the size scripts/ingest.py reports for the
  // all-MiniLM-L6-v2 embedding model.
  embedding Unsupported("vector(384)")?

  @@map("documents")
}

Binary file added public/assets/HeliosSideview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added public/assets/Logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
144 changes: 144 additions & 0 deletions scripts/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import re

# Default file locations for the raw and cleaned document dumps.
INPUT_PATH = "/tmp/documents.json"
OUTPUT_PATH = "/tmp/documents_clean.json"

# --- Patterns to strip ---

# Image markdown: ![alt](url)
IMAGE_PATTERN = re.compile(r'!\[.*?\]\(.*?\)')

# Inline links: [text](url) -> keep just the text (capture group 1)
INLINE_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\([^\)]+\)')

# Reference-style links: [text][ref]
REF_LINK_PATTERN = re.compile(r'\[.*?\]\[.*?\]')

# The repeated footer block that appears in every Calgary Solar Car page.
# strip_footer() tries these in list order and cuts the page at the first
# marker it finds, so earlier entries take priority.
# NOTE(review): the last marker hard-codes the copyright year and will stop
# matching once the site footer rolls over to a new year.
FOOTER_MARKERS = [
    "Follow us on our Social Media",
    "Contact Information",
    "communications@calgarysolarcar.ca",
    "sponsorship@calgarysolarcar.ca",
    "ENC 36, Schulich School of Engineering",
    "© 2026 Calgary Solar Car",
]

# Wikipedia boilerplate sections to drop entirely (these appear as headings).
# They are located with a plain substring search, not a regex.
WIKIPEDIA_DROP_SECTIONS = [
    "## References",
    "## External links",
    "## See also",
]

# Wikipedia navigation tables that are pure link noise
# These large navbox tables start with "| [Photovoltaics]" or "| [Energy]" etc.
# Because of re.DOTALL the trailing ".*" runs to the end of the text, so
# everything from the first matching table row onward is removed.
NAVBOX_PATTERN = re.compile(
    r'\| \[(?:Photovoltaics|Energy|Electric vehicles|Alternative fuel vehicles|'
    r'The Sun|Natural resources|World Solar Challenge|American Solar Challenge|'
    r'Formula Sun Grand Prix|University of Calgary)\].*',
    re.DOTALL
)

# Wikipedia citation noise: [1], [\1], [^1], \[1\], etc.
# The optional leading backslash consumes the escape in front of "\[";
# without it, "\[1\]" was only matched from "[" onward and a stray "\" was
# left behind in the cleaned text.
CITATION_PATTERN = re.compile(r'\\?\[\\?\^?\\?\d+\\?\]')

# Escaped brackets from markdown conversion: a backslash followed by [ or ]
ESCAPED_BRACKET_PATTERN = re.compile(r'\\[\[\]]')


def strip_footer(text: str) -> str:
    """Cut a Calgary Solar Car page off at its shared footer.

    FOOTER_MARKERS is scanned in list order and the first marker that occurs
    anywhere in *text* decides the cut point: everything from that marker
    onward is dropped and trailing whitespace is trimmed. Text that contains
    none of the markers is returned unchanged.
    """
    cut_at = next(
        (pos for pos in map(text.find, FOOTER_MARKERS) if pos != -1),
        None,
    )
    if cut_at is None:
        return text
    return text[:cut_at].rstrip()


def strip_wikipedia_boilerplate(text: str) -> str:
    """Remove References, External links, See also sections and navboxes.

    The text is truncated at whichever WIKIPEDIA_DROP_SECTIONS heading occurs
    earliest, so every listed section (and anything after it) is dropped no
    matter which order the article uses. Navbox tables matched by
    NAVBOX_PATTERN are then removed from what remains.

    Args:
        text: Markdown text of a Wikipedia article.

    Returns:
        The text without the trailing boilerplate sections and navboxes.
    """
    # Collect the position of every heading that is present. Cutting at the
    # first heading in *list* order instead would keep e.g. a "## See also"
    # section that sits above "## References".
    cut_points = [
        pos
        for pos in (text.find(header) for header in WIKIPEDIA_DROP_SECTIONS)
        if pos != -1
    ]
    if cut_points:
        text = text[:min(cut_points)].rstrip()

    # Remove navbox tables (large repeated link blocks at the end)
    return NAVBOX_PATTERN.sub("", text)


def clean_text(text: str, source: str) -> str:
    """Return *text* with markdown noise and site boilerplate removed.

    Images, citation markers and escaped brackets are stripped from every
    document. Pages whose *source* contains "calgarysolarcar.ca" also lose
    their footer; pages whose *source* contains "wikipedia.org" lose their
    boilerplate sections and have their links flattened to plain text.
    Finally, runs of three or more newlines collapse to two and the result
    is trimmed.
    """
    # Noise removed regardless of where the page came from; order matters
    # because the citation pattern must see the text before the escaped
    # brackets are deleted.
    for noise in (IMAGE_PATTERN, CITATION_PATTERN, ESCAPED_BRACKET_PATTERN):
        text = noise.sub("", text)

    # Site-specific cleanup
    if "calgarysolarcar.ca" in source:
        text = strip_footer(text)

    if "wikipedia.org" in source:
        article = strip_wikipedia_boilerplate(text)
        # [text](url) -> text, then drop [text][ref] style links entirely
        article = INLINE_LINK_PATTERN.sub(r'\1', article)
        text = REF_LINK_PATTERN.sub("", article)

    # Squeeze blank-line runs, then trim both ends
    return re.sub(r'\n{3,}', '\n\n', text).strip()


def is_empty_doc(text: str) -> bool:
"""Return True if the document has no meaningful content after cleaning."""
# Remove all markdown, whitespace, and punctuation
stripped = re.sub(r'[#\s\-\*_>|]', '', text)
return len(stripped) < 100


def clean(docs):
    """Clean every scraped document and drop the ones left without content.

    Progress is printed for each document, including the size reduction
    achieved by cleaning.

    Args:
        docs: Sequence of dicts that may carry "id", "source" and "content"
            keys. Missing or null "source"/"content" values are treated as
            empty strings, so such documents are skipped instead of raising.

    Returns:
        A list of dicts with "id", "content" (the cleaned text) and "source"
        for every document that still has meaningful content.
    """
    print(f"Cleaning {len(docs)} documents.")

    cleaned_docs = []
    skipped = []

    for doc in docs:
        # dict.get() only applies its default when the key is absent; an
        # explicit null in the scraped JSON would otherwise reach the regex
        # helpers as None and raise TypeError.
        source = doc.get("source") or ""
        original_text = doc.get("content") or ""
        cleaned_text = clean_text(original_text, source)

        if is_empty_doc(cleaned_text):
            skipped.append(source)
            print(f"  SKIPPED (empty after cleaning): {source}")
            continue

        original_len = len(original_text)
        cleaned_len = len(cleaned_text)
        # Guard the division for a zero-length original
        reduction = 100 * (1 - cleaned_len / original_len) if original_len > 0 else 0

        print(f"  OK: {source}")
        print(f"    {original_len:,} chars -> {cleaned_len:,} chars ({reduction:.1f}% reduction)")

        cleaned_docs.append({
            "id": doc.get("id"),
            "content": cleaned_text,
            "source": source,
        })

    print(f"\nDone. {len(cleaned_docs)} documents cleaned.")
    if skipped:
        print(f"Skipped {len(skipped)} empty documents: {skipped}")

    return cleaned_docs

if __name__ == "__main__":
    # No standalone entry point: running this file directly does nothing.
    pass
115 changes: 115 additions & 0 deletions scripts/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import json
import os
import sys
from pathlib import Path

# Attempt to load required libraries, guide user if not installed.
# NOTE: everything in this section runs at import time, so merely importing
# this module can terminate the process with exit code 1.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    from sentence_transformers import SentenceTransformer
    import psycopg2
    from dotenv import load_dotenv
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Please install requirements: pip install langchain-text-splitters sentence-transformers psycopg2-binary python-dotenv")
    sys.exit(1)

# Load environment variables from .env.local, looked up one directory above
# the directory that contains this script.
dotenv_path = Path(__file__).parent.parent / ".env.local"
load_dotenv(dotenv_path)

# Connection string used by ingest(); a missing or empty value aborts here.
db_url = os.environ.get("DATABASE_URL")
if not db_url:
    print("Error: DATABASE_URL is missing. It should be set as a GitHub Actions secret containing the Supabase connection string.")
    sys.exit(1)


def ingest(data):
    """Chunk, embed and store documents in the ``documents`` table.

    Each document is split into overlapping text chunks; if a members.json
    file exists one directory above this script, one extra chunk is built per
    team member. All chunks are embedded with all-MiniLM-L6-v2 and written to
    the database behind the module-level ``db_url`` inside one transaction
    that first deletes the table's previous contents.

    When no chunks are produced at all, the function returns before touching
    the database so that an empty run cannot wipe previously ingested rows.

    Args:
        data: Iterable of dicts with a required "content" string and optional
            "source" and "id" keys.

    Raises:
        KeyError: If a document has no "content" key.
        Exception: Any database error is printed, rolled back and re-raised.
    """
    # 1. Text Splitter
    print("Splitting text...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=128,
        separators=["\n\n", "\n", " ", ""]
    )

    chunks = []
    for doc in data:
        splits = text_splitter.split_text(doc["content"])
        for i, split in enumerate(splits):
            chunks.append({
                "content": split,
                "metadata": {
                    "source": doc.get("source", "Unknown"),
                    "chunk_id": f"{doc.get('id', 'doc')}_{i}"
                }
            })

    # Process members.json if it exists
    members_path = os.path.join(os.path.dirname(__file__), '..', 'members.json')
    if os.path.exists(members_path):
        print("Processing members data...")
        with open(members_path, "r", encoding="utf-8") as f:
            members_data = json.load(f)

        for i, member in enumerate(members_data):
            name = f"{member.get('firstName', '')} {member.get('lastName', '')}".strip()
            role = member.get('teamRole', 'Unknown Role')
            study = member.get('fieldOfStudy', 'Unknown Field')
            year = member.get('schoolYear', '')
            joined = member.get('yearJoined', '')
            # Contact details are hard-coded to "redacted", so whatever the
            # file holds for them is never written to the table.
            email = "redacted"
            about = member.get('about', '')
            linkedin = "redacted"

            content = f"Team Member: {name}\nRole: {role}\nField of Study: {study} (Year: {year})\nJoined Team in: {joined}\nContact: {email}\nAbout: {about}\nLinkedIn: {linkedin}"

            chunks.append({
                "content": content,
                "metadata": {
                    "source": "members.json",
                    "chunk_id": f"member_{i}"
                }
            })

    print(f"Total chunks created: {len(chunks)}")

    # Nothing to store: stop before the DELETE below, otherwise an empty
    # scrape would leave the table empty.
    if not chunks:
        print("No chunks to ingest; leaving the existing documents untouched.")
        return

    # 2. Generate Embeddings locally (no API key needed)
    print("Loading embedding model (all-MiniLM-L6-v2: 384 dimensions)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    texts = [chunk["content"] for chunk in chunks]
    print("Computing embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)

    # 3. Store in PostgreSQL (whatever database db_url points at)
    print("Storing vectors in local PostgreSQL...")
    conn = None
    try:
        conn = psycopg2.connect(db_url)
        with conn.cursor() as cur:
            try:
                # Clear old data so re-runs don't duplicate. The delete and
                # the inserts share one transaction, so a failure part-way
                # through rolls back to the previous contents.
                cur.execute("DELETE FROM documents")

                for chunk, embedding in zip(chunks, embeddings):
                    # The vector is sent as its list text form, e.g. "[0.1, 0.2]"
                    cur.execute(
                        "INSERT INTO documents (content, metadata, embedding) VALUES (%s, %s, %s)",
                        (chunk["content"], json.dumps(chunk["metadata"]), str(embedding.tolist()))
                    )
                conn.commit()
            except Exception:
                conn.rollback()
                raise
        print(f"Success! {len(chunks)} documents stored in local PostgreSQL.")
    except Exception as e:
        print(f"Database operation failed: {e}")
        raise
    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    # No standalone entry point: running this file directly only performs
    # the module-level dependency and DATABASE_URL checks, then does nothing.
    pass
Loading
Loading