diff --git a/src/goal_from_code.py b/src/goal_from_code.py
new file mode 100644
index 0000000..05afd08
--- /dev/null
+++ b/src/goal_from_code.py
@@ -0,0 +1,644 @@
+# src/goal_from_code.py
+
+"""
+Extract the "goal" or purpose of a code repository by analyzing its code/config files.
+"""
+
+import pathlib, subprocess, os, textwrap
+from typing import Iterable, List, Tuple, Optional
+import shutil
+import math
+
+BINARY_EXTS = {
+    # images & raster
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".bmp",
+    ".tiff",
+    ".tif",
+    ".ico",
+    ".svs",
+    ".webp",
+    # docs/binaries
+    ".pdf",
+    ".xlsx",
+    ".docx",
+    ".pptx",
+    # archives (single-suffix forms)
+    ".zip",
+    ".gz",
+    ".bz2",
+    ".xz",
+    ".7z",
+    ".rar",
+    # 3D / meshes (single-suffix)
+    ".fbx",
+    ".glb",
+    ".gltf",
+    ".stl",
+    ".ply",
+    ".las",
+    ".objz",
+    ".3ds",
+    # med/geo (single-suffix; multi-suffix handled above)
+    ".nii",
+    ".nrrd",
+    ".mhd",
+    ".mha",
+    ".geotiff",
+    # audio/video
+    ".mp4",
+    ".mp3",
+    ".wav",
+    ".avi",
+    ".mov",
+    ".webm",
+    ".m4a",
+    ".aac",
+    ".flac",
+    # fonts / wasm
+    ".woff",
+    ".woff2",
+    ".ttf",
+    ".otf",
+    ".wasm",
+    # design
+    ".psd",
+    ".ai",
+    ".xcf",
+    # db / sqlite
+    ".sqlite",
+    ".db",
+    ".db3",
+    # native libs / executables
+    ".so",
+    ".dylib",
+    ".dll",
+    ".exe",
+    ".bin",
+    ".obj",
+    # columnar / arrays / hdf
+    ".parquet",
+    ".feather",
+    ".h5",
+    ".hdf5",
+    ".npz",
+    ".npy",
+    # ML / notebooks / checkpoints
+    ".ipynb",
+    ".tfrecord",
+    ".pb",
+    ".onnx",
+    ".safetensors",
+    ".ckpt",
+    ".pt",
+    ".pth",
+    ".pkl",
+    ".pickle",
+    ".joblib",
+    # chunked array stores
+    ".zarr",
+}
+
+# Code/config extensions we will consider for goal synthesis
+CODE_EXTS = {
+    ".py",
+    ".pyi",
+    ".r",
+    ".rmd",
+    ".jl",
+    ".m",
+    ".c",
+    ".h",
+    ".hpp",
+    ".hxx",
+    ".hh",
+    ".cc",
+    ".cpp",
+    ".cxx",
+    ".cu",
+    ".cuh",
+    ".ino",
+    ".java",
+    ".scala",
+    ".kt",
+    ".kts",
+    ".groovy",
+    ".go",
+    ".rs",
+    ".swift",
+    ".php",
+    ".rb",
+    ".pl",
+    ".pm",
+    ".t",
+    ".lua",
+    ".fs",
+    ".fsx",
+    ".f90",
+    ".f95",
+    ".f03",
+    ".f08",
+    ".for",
+    ".ftn",
+    ".f",
+    ".cs",
+    ".vb",
+    ".vbs",
+    ".js",
+    ".mjs",
+    ".cjs",
+    ".jsx",
+    ".ts",
+    ".tsx",
+    ".json",
+    ".yml",
+    ".yaml",
+    ".toml",
+    ".ini",
+    ".cfg",
+    ".conf",
+    ".properties",
+    ".sh",
+    ".bash",
+    ".zsh",
+    ".fish",
+    ".bat",
+    ".cmd",
+    ".ps1",
+    ".psm1",
+    ".psd1",
+    ".html",
+    ".htm",
+    ".xhtml",
+    ".xml",
+    ".xsl",
+    ".xslt",
+    ".svg",
+    ".sql",
+    ".psql",
+    ".mysql",
+    ".pgsql",
+    ".hql",
+    ".cmake",
+    ".ninja",
+    ".bazel",
+    ".bzl",
+    ".gradle",
+    ".mk",
+    ".tex",
+    ".sty",
+    ".cls",
+    ".bib",
+    ".rst",
+    ".md",
+    ".markdown",
+    ".txt",
+    ".proto",
+    ".thrift",
+    ".avdl",
+    ".graphql",
+    ".gql",
+    ".sol",
+    ".asm",
+    ".s",
+    ".v",
+    ".vh",
+    ".sv",
+    ".svh",
+    ".vhdl",
+    ".vhd",
+    ".dart",
+    ".coffee",
+    ".erl",
+    ".hrl",
+    ".ex",
+    ".exs",
+    ".nim",
+    ".clj",
+    ".cljs",
+    ".edn",
+    ".lisp",
+    ".el",
+    ".scm",
+    ".ss",
+    ".cr",
+    ".mli",
+    ".ml",
+    ".re",
+    ".rei",
+    ".hx",
+    ".hxml",
+    ".wgsl",
+    ".metal",
+    ".glsl",
+    ".vert",
+    ".frag",
+    ".shader",
+}
+
+# Code/config files that often have no extension but are meaningful
+CODE_BASENAMES = {
+    "Dockerfile",
+    "Makefile",
+    "CMakeLists.txt",
+    "WORKSPACE",
+    "BUILD",
+    "BUILD.bazel",
+    "Gemfile",
+    "Rakefile",
+    "Procfile",
+    ".env",
+    ".env.example",
+    ".envrc",
+    ".gitignore",
+    ".gitattributes",
+    ".editorconfig",
+    "Pipfile",
+    "requirements.txt",
+    "pyproject.toml",
+    "setup.cfg",
+    "setup.py",
+    "package.json",
+    "package-lock.json",
+    "pnpm-lock.yaml",
+    "yarn.lock",
+    "tsconfig.json",
+    ".babelrc",
+    ".eslintrc",
+    ".prettierrc",
+    ".prettierignore",
+    ".ruff.toml",
+    ".flake8",
+}
+
+# Common dirs we gnore even if texty (not the project’s purpose):
+IGNORE_DIRS = {
+    ".git",
+    ".github",
+    ".gitlab",
+    ".svn",
+    ".hg",
+    "assets",
+    "static",
+    "public",
+    "media",
+    "images",
+    "img",
+    "figures",
+    "screenshots",
+    "thumbnails",
+    "downloads",
+    "node_modules",
+    "dist",
+    "build",
+    "out",
+    "target",
+    "__pycache__",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".pytest_cache",
+    ".ruff_cache",
+    ".venv",
+    "venv",
+    "env",
+    ".idea",
+    ".vscode",
+    ".next",
+    ".cache",
+    ".parcel-cache",
+    "third_party",
+    "vendor",
+    ".tox",
+    ".eggs",
+    ".gradle",
+    ".nuget",
+    "Pods",
+    "Packages",
+    ".Rproj.user",
+    "models",
+    "model",
+    "checkpoints",
+    "artifacts",
+    "data",
+    "datasets",
+    "samples",
+    "sample-data",
+    "logs",
+    "log",
+    "tmp",
+    "temp",
+    ".coverage",
+    "coverage",
+}
+
+# File-name endings that mark a file as binary; _is_binary_path() passes this
+# tuple to str.endswith() on the lower-cased file name.
+BINARY_MULTI_EXTS = (
+    ".tar.gz",
+    ".tgz",
+    ".tar.bz2",
+    ".tbz2",
+    ".tar.xz",
+    ".txz",
+    ".nii.gz",
+    ".ome.tif",
+    ".ome.tiff",
+)
+# Per-extension size ceilings in bytes: iter_text_files() skips a file with one
+# of these suffixes when its size is above the listed limit.
+EXT_SIZE_LIMITS = {
+    ".json": 2_000_000,  # 2 MB
+    ".md": 1_500_000,
+    ".xml": 2_000_000,
+    ".svg": 800_000,
+    ".html": 1_500_000,
+}
+
+# Chunking and traversal limits used by chunk_file_bytes() / iter_text_files().
+TEXT_MAX_BYTES_PER_CHUNK = 16384  # per chunk (bytes)
+CHUNKS_PER_FILE_CAP = 64  # hard cap per file to avoid huge files
+FILES_PER_DIR_CAP = 300  # safety: extremely large dirs will cap at this
+
+
+def delete_clone_path(p: pathlib.Path):
+    """Remove a previously cloned repo folder; ignore if missing."""
+    try:
+        if p.exists():
+            shutil.rmtree(p)
+    except FileNotFoundError:
+        pass
+    except Exception as e:
+        print(f"[warn] failed to delete clone at {p}: {e}")
+
+
+def _run(cmd: list[str], cwd: Optional[pathlib.Path] = None) -> str:
+    p = subprocess.run(
+        cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
+    if p.returncode != 0:
+        raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{p.stderr.strip()}")
+    return p.stdout
+
+
+def shallow_clone(owner: str, repo: str, dest_root: pathlib.Path) -> pathlib.Path:
+    """
+    Clone or refresh a shallow checkout of https://github.com/{owner}/{repo}.git
+    into dest_root/owner/repo. Returns the repo path.
+    """
+    url = f"https://github.com/{owner}/{repo}.git"
+    dest = dest_root / owner / repo
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    if (dest / ".git").exists():
+        _run(["git", "fetch", "--depth", "1", "origin"], cwd=dest)
+        _run(["git", "reset", "--hard", "origin/HEAD"], cwd=dest)
+    else:
+        _run(["git", "clone", "--depth", "1", "--single-branch", url, str(dest)])
+    return dest
+
+
+def _is_binary_path(p: pathlib.Path) -> bool:
+    name_l = p.name.lower()
+    if name_l.endswith(BINARY_MULTI_EXTS):
+        return True
+    if p.suffix.lower() in BINARY_EXTS:
+        return True
+    try:
+        return p.stat().st_size > 2_000_000
+    except Exception:
+        return True
+
+
+def _should_skip_dir(p: pathlib.Path) -> bool:
+    name = p.name.lower()
+    return name in IGNORE_DIRS
+
+
+def iter_text_files(root: pathlib.Path) -> Iterable[pathlib.Path]:
+    """
+    Yield only code/config/text files under root, skipping obvious binary/vendor dirs.
+    """
+    for dirpath, dirnames, filenames in os.walk(root):
+        # prune ignored dirs
+        dirnames[:] = [
+            d for d in dirnames if not _should_skip_dir(pathlib.Path(dirpath) / d)
+        ]
+
+        # dir-level cap to avoid explosion
+        if len(filenames) > FILES_PER_DIR_CAP:
+            filenames = sorted(filenames)[:FILES_PER_DIR_CAP]
+
+        for fn in filenames:
+            p = pathlib.Path(dirpath) / fn
+
+            # skip files inside .git (belt & suspenders)
+            if ".git" in p.parts:
+                continue
+
+            # skip binary/huge files first
+            if _is_binary_path(p):
+                continue
+
+            ext = p.suffix.lower()
+            if p.name.startswith(".") and p.name not in CODE_BASENAMES:
+                continue
+            if ext not in CODE_EXTS and p.name not in CODE_BASENAMES:
+                continue
+            # cap oversized text-like files that aren’t useful for “goal” synthesis
+            name = p.name.lower()
+            if name.endswith(".min.js"):
+                continue
+            if name.endswith(".map"):
+                continue
+            lim = EXT_SIZE_LIMITS.get(ext)
+            if lim is not None:
+                try:
+                    if p.stat().st_size > lim:
+                        continue
+                except Exception:
+                    pass
+            yield p
+
+
+def chunk_file_bytes(
+    p: pathlib.Path,
+    max_chunk_bytes: int = TEXT_MAX_BYTES_PER_CHUNK,
+    max_chunks: int = CHUNKS_PER_FILE_CAP,
+    huge_threshold: Optional[int] = None,
+    windows: int = 8,
+):
+    """
+    Yield text chunks from a file with these rules:
+      - ≤ max_chunk_bytes: one chunk (whole file)
+      - (max_chunk_bytes, huge_threshold]: split into N near-equal chunks (N = ceil(size/max_chunk_bytes), capped)
+      - > huge_threshold: sample `windows` spans of size max_chunk_bytes (head/tail/middles)
+    """
+    if huge_threshold is None:
+        huge_threshold = max_chunk_bytes * max_chunks
+
+    try:
+        n = p.stat().st_size
+    except Exception:
+        return
+
+    if n == 0:
+        return
+
+    if n <= max_chunk_bytes:
+        try:
+            yield p.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            try:
+                yield p.read_bytes().decode("utf-8", errors="ignore")
+            except Exception:
+                return
+        return
+
+    if n > huge_threshold:
+        win_size = max_chunk_bytes
+        spans = []
+        spans.append((0, min(win_size, n)))
+        spans.append((max(0, n - win_size), n))
+        if windows > 2:
+            step = max((n - 2 * win_size) // (windows - 2), 1)
+            pos = win_size
+            for _ in range(windows - 2):
+                start = min(max(pos, 0), max(0, n - win_size))
+                spans.append((start, min(start + win_size, n)))
+                pos += step
+        try:
+            with p.open("rb") as f:
+                for s, e in spans[:max_chunks]:
+                    f.seek(s)
+                    chunk = f.read(e - s)
+                    yield chunk.decode("utf-8", errors="ignore")
+        except Exception:
+            return
+        return
+
+    # Medium: read once, but only as much as needed
+    try:
+        data = p.read_bytes()
+    except Exception:
+        return
+    num_chunks = min(max_chunks, max(2, math.ceil(n / max_chunk_bytes)))
+    chunk_size = math.ceil(n / num_chunks)
+    start = 0
+    for _ in range(num_chunks):
+        end = min(start + chunk_size, n)
+        if end <= start:
+            break
+        yield data[start:end].decode("utf-8", errors="ignore")
+        start = end
+
+
+def summarize_file_chunks(
+    path: str, chunks: List[str], model_low: str, model_medium: str, call_llm_fn
+) -> str:
+    """
+    Map step: summarize a single file's purpose-only signals from all its chunks.
+    """
+    bullets: List[str] = []
+    for i, ch in enumerate(chunks, 1):
+        prompt = textwrap.dedent(
+            f"""
+        From the file below, write 2-12 bullets capturing a summary that has ONLY purpose/intent (not usage/install/code minutiae).
+
+        FILE: {path} (chunk {i}/{len(chunks)})
+        ---
+        {ch}
+        """
+        ).strip()
+        msg = [
+            {
+                "role": "system",
+                "content": "Extract purpose-only bullets from file text. Keep it minimal.",
+            },
+            {"role": "user", "content": prompt},
+        ]
+        bullets.append(call_llm_fn(msg, model=model_low))
+    # reduce bullets -> one-liner for file
+    reduce_prompt = textwrap.dedent(
+        f"""
+    Combine the bullets below into a single 2-12 sentence purpose statement/summary for this file.
+    Avoid implementation details. If purpose is unclear, say 'unclear'.
+
+    FILE: {path}
+    BULLETS:
+    {chr(10).join(bullets)}
+    """
+    ).strip()
+    return call_llm_fn(
+        [
+            {
+                "role": "system",
+                "content": "Distill bullets into one short purpose sentence.",
+            },
+            {"role": "user", "content": reduce_prompt},
+        ],
+        model=model_medium,
+    )
+
+
+def summarize_directory(
+    file_summaries: List[Tuple[str, str]], model: str, call_llm_fn
+) -> str:
+    """
+    Reduce step: combine many file-purpose sentences into a directory-level 2-12 sentence purpose.
+    """
+    body = "\n".join([f"- {p}: {s}" for p, s in file_summaries])
+    prompt = textwrap.dedent(
+        f"""
+    Summarize the unified purpose for this collection of files in **2-12 sentences**.
+    Focus on what the code aims to do. Avoid lists and specifics.
+
+    FILE PURPOSES:
+    {body[:18000]}
+    """
+    ).strip()
+    return call_llm_fn(
+        [
+            {
+                "role": "system",
+                "content": "Synthesize a concise, faithful purpose across files.",
+            },
+            {"role": "user", "content": prompt},
+        ],
+        model=model,
+    )
+
+
+def synthesize_repo_goal_from_code(
+    repo_root: pathlib.Path,
+    model_medium: str,
+    model_low: str,
+    model_high: str,
+    call_llm_fn,
+) -> str:
+    """
+    Full map-reduce across ALL text files:
+      file chunks -> file purpose -> repo purpose (final goal).
+    """
+    # 1) Map: per-file purpose
+    file_purposes: List[Tuple[str, str]] = []
+    for p in iter_text_files(repo_root):
+        rel = str(p.relative_to(repo_root))
+        chunks = list(chunk_file_bytes(p))
+        if not chunks:
+            continue
+        try:
+            purpose = summarize_file_chunks(
+                rel,
+                chunks,
+                model_low=model_low,
+                model_medium=model_medium,
+                call_llm_fn=call_llm_fn,
+            )
+            if purpose and purpose.strip():
+                file_purposes.append((rel, purpose.strip()))
+        except Exception:
+            # skip noisy failures, continue
+            continue
+
+    if not file_purposes:
+        return "Goal not explicitly stated."
+
+    # 2) Reduce: repo-level purpose
+    return summarize_directory(file_purposes, model=model_high, call_llm_fn=call_llm_fn)
diff --git a/src/rollup_projects.py b/src/rollup_projects.py
new file mode 100644
index 0000000..354978b
--- /dev/null
+++ b/src/rollup_projects.py
@@ -0,0 +1,600 @@
+# src/rollup_projects.py
+"""
+Aggregate per-repo activity tables into per-project summaries.
+"""
+
+import os, json
+from datetime import datetime, timezone
+import numpy as np
+import pandas as pd
+from collections import Counter
+
+
+# ----------- Paths -----------
+CLEAN_DIR = "data/clean"  # where normalize_activity.py wrote the _all_*.parquet files
+OUT_DIR = "data/summary"  # where we’ll write per-project JSON and CSV
+os.makedirs(OUT_DIR, exist_ok=True)
+SEED_CSV = "data/projects_seed.csv"
+RAW_DIR = "data/raw/github"
+
+
+# ----------- Small utilities -----------
+def _jsonable(v):
+    """Convert pandas/NumPy/time values to JSON-safe Python types/strings."""
+    # pandas/pyarrow timestamps -> ISO8601 (UTC)
+    if isinstance(v, pd.Timestamp):
+        if v.tzinfo is None:
+            v = v.tz_localize("UTC")
+        else:
+            v = v.tz_convert("UTC")
+        return v.isoformat()
+    # python datetime -> ISO8601 (UTC)
+    if isinstance(v, datetime):
+        if v.tzinfo is None:
+            v = v.replace(tzinfo=timezone.utc)
+        else:
+            v = v.astimezone(timezone.utc)
+        return v.isoformat()
+    # NumPy scalars -> native Python
+    if isinstance(v, np.generic):
+        return v.item()
+    return v
+
+
+def load_or_empty(path: str) -> pd.DataFrame:
+    """Read a Parquet if it exists; else return an empty DataFrame (so code can proceed)."""
+    return pd.read_parquet(path) if os.path.exists(path) else pd.DataFrame()
+
+
+def load_seed(path: str = SEED_CSV) -> pd.DataFrame:
+    df = pd.read_csv(path)
+    # normalize columns we care about
+    for col in ["project_id", "project_name", "owner", "repo"]:
+        if col not in df.columns:
+            df[col] = None
+    # enforce string where applicable
+    if "project_id" in df.columns:
+        df["project_id"] = df["project_id"].astype("string")
+    if "project_name" in df.columns:
+        df["project_name"] = df["project_name"].astype("string")
+    return df
+
+
+def read_raw_json(owner: str, repo: str) -> dict | None:
+    path = os.path.join(RAW_DIR, f"{owner}__{repo}.json")
+    if not os.path.exists(path):
+        return None
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def top_n(counter_like, n=5):
+    """
+    Accepts:
+      - a list of values (we'll count them), or
+      - a Counter
+    Returns [{'value': <thing>, 'count': <int>}, ...] top n by frequency.
+    """
+    return [
+        {"value": k, "count": int(v)} for k, v in Counter(counter_like).most_common(n)
+    ]
+
+
+def explode_tags(series: pd.Series) -> list[str]:
+    """
+    Intent tags are stored like 'feature|fix|docs'.
+    Turn a series of such strings into a flat list of tag tokens.
+    """
+    vals = []
+    for s in series.dropna():
+        vals.extend([t for t in str(s).split("|") if t])
+    return vals
+
+
+def collect_examples(
+    df: pd.DataFrame, cols: list[str], n=5, sort_col: str | None = None
+) -> list[dict]:
+    """
+    Take the first n rows of df (optionally after sorting by sort_col DESC),
+    extract a subset of fields named in `cols`, converting values to JSON-safe types/strings.
+    """
+    out = []
+    if df is None or df.empty:
+        return out
+
+    sub = df
+    if sort_col and sort_col in df.columns:
+        try:
+            sub = df.sort_values(sort_col, ascending=False)
+        except Exception:
+            # If dtype is mixed/invalid for sorting, fall back to original order
+            sub = df
+
+    for _, r in sub.head(n).iterrows():
+        item = {}
+        for c in cols:
+            v = r.get(c)
+            item[c] = _jsonable(v)
+        out.append(item)
+    return out
+
+
+from collections import Counter
+
+
+def _iter_project_repos(
+    per_project_dfs: dict[str, pd.DataFrame], seed_slice: pd.DataFrame | None = None
+) -> list[tuple[str, str]]:
+    """
+    Return sorted list of (owner, repo) pairs present in activity tables OR listed in the seed.
+    """
+    pairs = set()
+    for df in per_project_dfs.values():
+        if (
+            isinstance(df, pd.DataFrame)
+            and not df.empty
+            and {"owner", "repo"}.issubset(df.columns)
+        ):
+            sub = df[["owner", "repo"]].dropna().astype(str)
+            pairs.update(map(tuple, sub.values))
+    if isinstance(seed_slice, pd.DataFrame) and not seed_slice.empty:
+        for _, r in seed_slice.dropna(subset=["owner", "repo"]).astype(str).iterrows():
+            pairs.add((r["owner"], r["repo"]))
+    return sorted(pairs)
+
+
+def _extract_single_repo_fields(
+    per_project_dfs: dict[str, pd.DataFrame], owner: str, repo: str
+) -> dict:
+    """
+    Pull context for one repo from any activity table row; fallback to raw JSON if needed.
+    Returns fields: description, homepage, topics[], primary_language, languages[], readme (str|None).
+    """
+    fields = {
+        "description": None,
+        "homepage": None,
+        "topics": [],
+        "primary_language": None,
+        "languages": [],
+        "readme": None,
+    }
+
+    def _split_csv(val):
+        if val is None or (isinstance(val, float) and pd.isna(val)):
+            return []
+        return [x.strip() for x in str(val).split(",") if x.strip()]
+
+    # Try to find a row for this repo in any table (prefer one with readme_text)
+    tables = [
+        "commits",
+        "issues",
+        "pull_requests",
+        "releases",
+        "pr_files",
+        "stargazers",
+        "forks",
+    ]
+    best = None
+    for t in tables:
+        df = per_project_dfs.get(t)
+        if not (isinstance(df, pd.DataFrame) and not df.empty):
+            continue
+        sub = df[(df.get("owner") == owner) & (df.get("repo") == repo)]
+        if sub.empty:
+            continue
+        with_readme = (
+            sub[~sub.get("readme_text").isna()]
+            if "readme_text" in sub.columns
+            else pd.DataFrame()
+        )
+        best = with_readme.iloc[0] if not with_readme.empty else sub.iloc[0]
+        break
+
+    if best is not None:
+        fields["description"] = best.get("repo_description") or None
+        fields["homepage"] = best.get("repo_homepage") or None
+        fields["topics"] = _split_csv(best.get("repo_topics"))
+        fields["primary_language"] = best.get("repo_primary_language") or None
+        fields["languages"] = _split_csv(best.get("repo_languages"))
+        fields["readme"] = best.get("readme_text") or None
+        # early return if we already have a README
+        if fields["readme"]:
+            return fields
+
+    # Fallback to raw JSON (helps for inactive repos)
+    raw = read_raw_json(owner, repo) or {}
+    if raw:
+        # topics
+        topics = []
+        try:
+            nodes = (raw.get("repositoryTopics") or {}).get("nodes", []) or []
+            topics = [
+                n["topic"]["name"]
+                for n in nodes
+                if n and n.get("topic") and n["topic"].get("name")
+            ]
+        except Exception:
+            pass
+        # languages
+        langs = []
+        try:
+            nodes = (raw.get("languages") or {}).get("nodes", []) or []
+            langs = [n.get("name") for n in nodes if n and n.get("name")]
+        except Exception:
+            pass
+        fields["description"] = fields["description"] or raw.get("description")
+        fields["homepage"] = fields["homepage"] or raw.get("homepageUrl")
+        fields["topics"] = fields["topics"] or topics
+        fields["primary_language"] = fields["primary_language"] or (
+            (raw.get("primaryLanguage") or {}).get("name")
+        )
+        fields["languages"] = fields["languages"] or langs
+        fields["readme"] = fields["readme"] or raw.get("__readme_text")
+
+    return fields
+
+
+def build_repo_context_all(
+    per_project_dfs: dict[str, pd.DataFrame],
+    seed_slice: pd.DataFrame | None = None,
+    readme_chars: int = 20000,
+) -> dict:
+    """
+    Aggregate context across ALL repos in the project (including inactive ones listed in the seed).
+    - description: most common non-empty; if multiple distinct, join a few unique variants (<=500 chars).
+    - homepage: most common non-empty
+    - topics: frequency-sorted union
+    - primary_language: most common
+    - languages: frequency-sorted union
+    - readme: concatenation of short per-repo README excerpts with 'owner/repo' headers (truncated to readme_chars)
+    """
+    pairs = _iter_project_repos(per_project_dfs, seed_slice=seed_slice)
+    if not pairs:
+        return {
+            "description": None,
+            "homepage": None,
+            "topics": [],
+            "primary_language": None,
+            "languages": [],
+            "readme": None,
+        }
+
+    descs, homes = [], []
+    topic_ctr, lang_ctr, primary_ctr = Counter(), Counter(), Counter()
+    parts = []
+
+    for owner, repo in pairs:
+        f = _extract_single_repo_fields(per_project_dfs, owner, repo)
+        if f["description"]:
+            descs.append(f["description"].strip())
+        if f["homepage"]:
+            homes.append(f["homepage"].strip())
+        topic_ctr.update([t for t in f["topics"] if t])
+        lang_ctr.update([l for l in f["languages"] if l])
+        if f["primary_language"]:
+            primary_ctr.update([f["primary_language"]])
+        if f["readme"]:
+            excerpt = f["readme"].strip()
+            # 2K per-repo excerpt to keep the total bounded
+            parts.append(f"### {owner}/{repo}\n{excerpt[:2000]}")
+
+    # Choose description/homepage by frequency; if many distinct descriptions, join a few
+    description = None
+    if descs:
+        desc_counts = Counter(descs).most_common()
+        description = desc_counts[0][0]
+        if len(desc_counts) > 1:
+            uniq = []
+            seen = set()
+            for d, _ in desc_counts:
+                if d not in seen:
+                    seen.add(d)
+                    uniq.append(d)
+                if len(" | ".join(uniq)) > 500:
+                    break
+            description = " | ".join(uniq)
+
+    homepage = Counter(homes).most_common(1)[0][0] if homes else None
+    topics = [t for t, _ in topic_ctr.most_common(50)]
+    languages = [l for l, _ in lang_ctr.most_common(50)]
+    primary_language = primary_ctr.most_common(1)[0][0] if primary_ctr else None
+    readme = (("\n\n").join(parts)[:readme_chars]) if parts else None
+
+    return {
+        "description": description,
+        "homepage": homepage,
+        "topics": topics,
+        "primary_language": primary_language,
+        "languages": languages,
+        "readme": readme,
+    }
+
+
+# ----------- Per-project summarization -----------
+
+
+def summarize_project(
+    project_id: str, project_name: str | None, per_project_dfs: dict[str, pd.DataFrame]
+) -> dict:
+    """
+    Build a single JSON-serializable summary for ONE project_id from its sliced tables.
+    Expects per_project_dfs to contain DataFrames for:
+      commits, issues, pull_requests, stargazers, forks, releases, pr_files
+    (Any may be empty DataFrames.)
+    """
+    # Pull per-table slices with defaults
+    commits = per_project_dfs.get("commits", pd.DataFrame())
+    issues = per_project_dfs.get("issues", pd.DataFrame())
+    prs = per_project_dfs.get("pull_requests", pd.DataFrame())
+    stars = per_project_dfs.get("stargazers", pd.DataFrame())
+    forks = per_project_dfs.get("forks", pd.DataFrame())
+    releases = per_project_dfs.get("releases", pd.DataFrame())
+    pr_files = per_project_dfs.get("pr_files", pd.DataFrame())
+
+    # Consider project "active" if any table has ≥1 row (forks are weak activity but still signal)
+    active = any(len(df) > 0 for df in [commits, issues, prs, releases, stars, forks])
+
+    # Aggregate intent tags across commits/issues/PRs/releases to top themes
+    theme_tags = explode_tags(
+        pd.concat(
+            [
+                commits.get("intent_tags", pd.Series(dtype=str)),
+                issues.get("intent_tags", pd.Series(dtype=str)),
+                prs.get("intent_tags", pd.Series(dtype=str)),
+                releases.get("intent_tags", pd.Series(dtype=str)),
+            ],
+            ignore_index=True,
+        )
+    )
+    themes = top_n(theme_tags, 6)
+
+    # Build example lists (recent first, robust datetime sort)
+    # Include owner/repo so higher-level summaries can balance across repos
+    release_examples = collect_examples(
+        releases,
+        ["owner", "repo", "published_at", "release_name", "release_tag", "release_url"],
+        n=5,
+        sort_col="published_at",
+    )
+    commit_examples = collect_examples(
+        commits,
+        [
+            "owner",
+            "repo",
+            "committed_at",
+            "author_login",
+            "message_headline",
+            "commit_url",
+        ],
+        n=5,
+        sort_col="committed_at",
+    )
+    pr_examples = collect_examples(
+        prs,
+        ["owner", "repo", "created_at", "author_login", "title", "pr_url", "state"],
+        n=5,
+        sort_col="created_at",
+    )
+    issue_examples = collect_examples(
+        issues,
+        ["owner", "repo", "created_at", "author_login", "title", "issue_url", "labels"],
+        n=5,
+        sort_col="created_at",
+    )
+
+    # “Areas touched” = frequent top-level directories and/or file extensions from PR files
+    areas = []
+    if (
+        isinstance(pr_files, pd.DataFrame)
+        and not pr_files.empty
+        and "path" in pr_files.columns
+    ):
+        paths = pr_files["path"].dropna().astype(str).tolist()
+        top_dirs = [
+            p.split("/")[0] for p in paths if "/" in p
+        ]  # e.g., 'api', 'src', 'docs'
+        exts = [
+            p.rsplit(".", 1)[-1] for p in paths if "." in p
+        ]  # e.g., 'py', 'md', 'yaml'
+        areas = top_n(top_dirs + exts, 10)
+
+    # Contributors = commit authors + PR authors (by login)
+    commit_authors = commits.get("author_login", pd.Series(dtype=str)).dropna().tolist()
+    pr_authors = prs.get("author_login", pd.Series(dtype=str)).dropna().tolist()
+    contributors = top_n(commit_authors + pr_authors, 8)
+
+    # Issue filers + issue label themes
+    issue_authors = issues.get("author_login", pd.Series(dtype=str)).dropna().tolist()
+    issue_labels = []
+    if "labels" in issues.columns:
+        for lbls in issues["labels"].dropna():
+            issue_labels.extend([x.strip() for x in str(lbls).split(",") if x.strip()])
+    issue_themes = top_n(
+        issue_labels + theme_tags, 8
+    )  # fuse label tokens with heuristic tags
+    issue_filers = top_n(issue_authors, 8)
+
+    # “Interest” signals (who starred, who forked)
+    stargazers = top_n(
+        stars.get("stargazer_login", pd.Series(dtype=str)).dropna().tolist(), 8
+    )
+    fork_owners = top_n(
+        forks.get("fork_owner_login", pd.Series(dtype=str)).dropna().tolist(), 8
+    )
+
+    # Context multi-repo aware
+    seed_slice = globals().get("_SEED_BY_PID", {}).get(project_id, pd.DataFrame())
+    repo_ctx = build_repo_context_all(per_project_dfs, seed_slice=seed_slice)
+
+    # Build the dict to return (JSON-serializable)
+    return {
+        "project_id": project_id,
+        "project_name": project_name,
+        "active_in_window": bool(active),
+        "repo_context": repo_ctx,  # representative repo (not arbitrary first table)
+        "areas_touched": areas,  # [{value:'api', count:7}, {value:'py', count:5}, …]
+        "themes": themes,  # heuristic tags aggregated
+        "contributors": contributors,  # top commit/PR authors
+        "issue_filers": issue_filers,  # top issue creators
+        "issue_themes": issue_themes,  # label tokens + intent tags
+        "recent_examples": {  # concrete, clickable traceability
+            "commits": commit_examples,
+            "pull_requests": pr_examples,
+            "issues": issue_examples,
+            "releases": release_examples,
+        },
+        "interest_signals": {
+            "stargazers": stargazers,
+            "fork_owners": fork_owners,
+        },
+        "notes": "Heuristic tags; examples sampled from the window. Repo context chosen by activity+README heuristic.",
+    }
+
+
+# ----------- Main orchestration -----------
+
+
+def main():
+    """
+    Build one machine-readable summary per project plus a portfolio index.
+
+    Steps:
+      1) Load the combined "_all_*.parquet" tables from CLEAN_DIR.
+      2) Union the project ids found in those tables with the ids in the seed CSV.
+      3) For each project id: slice every table down to that project, call
+         summarize_project(), attach the repo list, and write
+         <OUT_DIR>/<pid>.json plus, when there are people rows,
+         <OUT_DIR>/<pid>__people.csv.
+      4) Write <OUT_DIR>/_portfolio.json holding every project summary.
+
+    Side effect: stores the per-project seed slices in the module-level
+    _SEED_BY_PID dict so summarize_project() can look them up.
+    """
+    global _SEED_BY_PID
+
+    # Load ALL-REPO combined tables (may be empty if no rows were written for that table)
+    commits = load_or_empty(os.path.join(CLEAN_DIR, "_all_commits.parquet"))
+    issues = load_or_empty(os.path.join(CLEAN_DIR, "_all_issues.parquet"))
+    prs = load_or_empty(os.path.join(CLEAN_DIR, "_all_pull_requests.parquet"))
+    stars = load_or_empty(os.path.join(CLEAN_DIR, "_all_stargazers.parquet"))
+    forks = load_or_empty(os.path.join(CLEAN_DIR, "_all_forks.parquet"))
+    releases = load_or_empty(os.path.join(CLEAN_DIR, "_all_releases.parquet"))
+    pr_files = load_or_empty(os.path.join(CLEAN_DIR, "_all_pr_files.parquet"))
+    all_tables = (commits, issues, prs, releases, pr_files, stars, forks)
+
+    # Normalize key string columns if present (helps avoid dtype mismatches)
+    for df in all_tables:
+        if isinstance(df, pd.DataFrame) and not df.empty:
+            for col in ("project_id", "project_name"):
+                if col in df.columns:
+                    df[col] = df[col].astype("string")
+
+    # Project ids that appear in at least one table
+    projects_found = set(
+        pd.concat(
+            [df.get("project_id", pd.Series(dtype="string")) for df in all_tables],
+            ignore_index=True,
+        )
+        .dropna()
+        .unique()
+    )
+
+    # Load the seed and union it with the found projects, so seeded projects
+    # without any rows still get a (mostly empty) summary.
+    seed_df = load_seed(SEED_CSV)
+    seed_df["project_id"] = seed_df["project_id"].astype("string")
+    seed_df["project_name"] = seed_df["project_name"].astype("string")
+    projects_seed = set(seed_df["project_id"].dropna().unique())
+
+    # Expose a handy index for summarize_project()
+    _SEED_BY_PID = {pid: seed_df[seed_df["project_id"] == pid] for pid in projects_seed}
+
+    projects = sorted(projects_found | projects_seed)
+
+    def _rows_for(df, pid):
+        """Return the rows of df that belong to pid (an empty frame stays as is)."""
+        if df.empty:
+            return df
+        if "project_id" not in df.columns:
+            # Rows that cannot be attributed to a project are left out rather
+            # than raising KeyError or being counted for every project.
+            return df.iloc[0:0]
+        return df[df["project_id"] == pid]
+
+    # The output directory is not created anywhere in this function; make sure
+    # it exists before writing (no-op when it is already there).
+    os.makedirs(OUT_DIR, exist_ok=True)
+
+    # We'll accumulate machine-readable summaries here so summarize_portfolio.py can read them
+    portfolio = {"projects": [], "generated_from": CLEAN_DIR}
+
+    for pid in projects:
+        seed_slice = _SEED_BY_PID.get(pid, pd.DataFrame())
+
+        # Try to recover a human-friendly project_name from any table that has it for this pid
+        pname = None
+        for df in all_tables:
+            if (
+                isinstance(df, pd.DataFrame)
+                and "project_id" in df.columns
+                and "project_name" in df.columns
+            ):
+                vals = df[df["project_id"] == pid]["project_name"].dropna().unique()
+                if len(vals):
+                    pname = vals[0]
+                    break
+        if pname is None:
+            # Seed-only projects have no table rows; fall back to the seed's name.
+            seed_names = (
+                seed_slice.get("project_name", pd.Series(dtype="string"))
+                .dropna()
+                .unique()
+            )
+            if len(seed_names):
+                pname = seed_names[0]
+
+        # Slice each table down to this project id (or keep the empty DataFrame)
+        per_project_dfs = {
+            "commits": _rows_for(commits, pid),
+            "issues": _rows_for(issues, pid),
+            "pull_requests": _rows_for(prs, pid),
+            "releases": _rows_for(releases, pid),
+            "pr_files": _rows_for(pr_files, pid),
+            "stargazers": _rows_for(stars, pid),
+            "forks": _rows_for(forks, pid),
+        }
+
+        # Compute distinct repos (and reuse to form a stable list)
+        pairs = _iter_project_repos(per_project_dfs, seed_slice=seed_slice)
+        repo_count = len(pairs)
+
+        # Produce the actual summary payload
+        summary = summarize_project(pid, pname, per_project_dfs)
+        summary["repo_count"] = repo_count
+        summary["repos"] = [{"owner": o, "repo": r} for (o, r) in pairs]
+
+        # Write per-project machine JSON (consumed by summarize_projects.py and summarize_portfolio.py)
+        out_json = os.path.join(OUT_DIR, f"{pid}.json")
+        with open(out_json, "w", encoding="utf-8") as f:
+            json.dump(summary, f, indent=2, ensure_ascii=False)
+        print(f"Wrote {out_json}")
+
+        # Also write a small people CSV to quickly see top participants
+        people_rows = []
+        for role, key in (("contributor", "contributors"), ("issue_filer", "issue_filers")):
+            for row in summary[key]:
+                people_rows.append(
+                    {
+                        "project_id": pid,
+                        "role": role,
+                        "login": row["value"],
+                        "count": row["count"],
+                    }
+                )
+        if people_rows:
+            pd.DataFrame(people_rows).to_csv(
+                os.path.join(OUT_DIR, f"{pid}__people.csv"), index=False
+            )
+
+        # Add to the machine portfolio index
+        portfolio["projects"].append(summary)
+
+    # Write a simple portfolio JSON index listing all project summaries
+    # (summarize_portfolio.py will turn THIS into a narrative report)
+    portfolio_json = os.path.join(OUT_DIR, "_portfolio.json")
+    with open(portfolio_json, "w", encoding="utf-8") as f:
+        json.dump(portfolio, f, indent=2, ensure_ascii=False)
+    print(f"Wrote {portfolio_json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/summarize_portfolio.py b/src/summarize_portfolio.py
new file mode 100644
index 0000000..e475cd6
--- /dev/null
+++ b/src/summarize_portfolio.py
@@ -0,0 +1,625 @@
+# src/summarize_portfolio.py
+"""
+Generate a portfolio-level executive summary report in Markdown, synthesizing information.
+"""
+
+import os, json, argparse, textwrap, time
+from typing import List, Dict, Any
+from collections import Counter
+from dotenv import load_dotenv
+from openai import OpenAI
+from datetime import datetime, timezone
+import re
+
+# ------------------ Environment & Client Setup ------------------
+# NOTE: everything in this section runs at import time, in this order.
+load_dotenv()  # pull OPENAI_API_KEY / OPENAI_MODEL from .env if present
+
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    # Fail fast if the key is missing: nothing below can work without it.
+    raise SystemExit("Missing OPENAI_API_KEY in .env")
+
+# Default model can be overridden by --model at runtime
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
+
+# Initialize OpenAI client (reads API key from env).
+# SDK-level retries are switched off (max_retries=0); call_llm() in this
+# module runs its own bounded retry loop instead.
+DEFAULT_HTTP_TIMEOUT = float(os.environ.get("OPENAI_HTTP_TIMEOUT", "60"))  # seconds
+client = OpenAI(
+    timeout=DEFAULT_HTTP_TIMEOUT, max_retries=0
+)  # applies connect/read/write timeouts
+
+# Where we read portfolio JSON and write the final Markdown
+REPORTS_DIR = "reports"
+SUMMARY_DIR = "data/summary"
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+# --- Project MD: extract "## Recent Developments (...)" ---
+# Template, not a compiled pattern: "{label}" is filled with the re.escape()d
+# window label by _read_project_activity_from_md(). Group 1 is the section
+# body, which ends at the next "## " heading or at end of text.
+_PROJECT_ACTIVITY_RE_TMPL = (
+    r"^##\s*Recent Developments\s*\(\s*{label}\s*\)\s*\n(.*?)(?:\n##\s+|\Z)"
+)
+
+# --- Project/repo MD: extract "## Summary and Goal" (or "## Summary & Goal") ---
+# Case-insensitive. Group 1 is the section body, up to the next line that
+# starts a "## " heading or to end of text.
+_GOAL_RE = re.compile(
+    r"^\s*##\s*Summary\s*(?:and|&)\s*Goal\s*\n(.*?)(?=^\s*##\s|\Z)",
+    flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
+)
+
+
+def _read_project_activity_from_md(project_id: str, window_label: str) -> str | None:
+    """
+    Return the body of '## Recent Developments (<window_label>)' from
+    reports/<PROJECT_ID>.md.
+
+    The window label is matched literally. Returns the stripped section text,
+    or None when the report file is missing, the heading is not found, or the
+    section is empty.
+    """
+    md_file = os.path.join(REPORTS_DIR, f"{project_id}.md")
+    if os.path.exists(md_file):
+        with open(md_file, "r", encoding="utf-8") as handle:
+            contents = handle.read()
+        # Escape the label so characters such as parentheses are taken literally
+        heading_pattern = _PROJECT_ACTIVITY_RE_TMPL.format(
+            label=re.escape(window_label)
+        )
+        found = re.search(heading_pattern, contents, flags=re.DOTALL | re.MULTILINE)
+        if found:
+            section = (found.group(1) or "").strip()
+            if section:
+                return section
+    return None
+
+
+def build_portfolio_activity_corpus_from_project_mds(
+    projects: List[Dict[str, Any]], window_label: str
+) -> str:
+    """
+    Join every project's '## Recent Developments (<window_label>)' section into one text.
+
+    Projects are visited in project_id order. Each section is prefixed with a
+    '[PROJECT <id> — <name>]' label and cut to a per-project character cap so
+    no single project dominates. Projects without an id, without a section,
+    or whose section is only the 'No changes in <window_label>' boilerplate
+    are left out.
+    """
+    # Aim for ~12k chars in total, but keep each project's share within 300..900
+    per_cap = max(300, min(900, 12000 // max(1, len(projects))))
+    boilerplate = f"**No changes in {window_label}**"
+
+    sections: list[str] = []
+    for proj in sorted(projects, key=lambda x: (x.get("project_id") or "")):
+        pid = (proj.get("project_id") or "").strip()
+        if not pid:
+            continue
+        block = _read_project_activity_from_md(pid, window_label)
+        if not block or block.strip() == boilerplate:
+            continue
+        display_name = (proj.get("project_name") or pid).strip()
+        excerpt = block.strip()[:per_cap]
+        sections.append(f"[PROJECT {pid} — {display_name}]\n{excerpt}")
+    return "\n\n".join(sections).strip()
+
+
+def _goal_from_project_md(project_id: str) -> str | None:
+    """
+    Fallback: read 'reports/<PROJECT_ID>.md' and return the text under the
+    '## Summary and Goal' heading, up to the next '## ' heading or end-of-file.
+
+    The heading is matched exactly as written (case-sensitive). Returns the
+    stripped text, or None when the file is missing, the heading is not
+    found, or the section is empty.
+    """
+    report_path = os.path.join(REPORTS_DIR, f"{project_id}.md")
+    if not os.path.exists(report_path):
+        return None
+    with open(report_path, "r", encoding="utf-8") as fh:
+        contents = fh.read()
+    section = re.search(
+        r"^## Summary and Goal\s*\n(.*?)(?:\n## |\Z)",
+        contents,
+        flags=re.DOTALL | re.MULTILINE,
+    )
+    goal = section.group(1).strip() if section else ""
+    return goal or None
+
+
+# ------------------ Utilities ------------------
+
+
+def _footer():
+    """Return the Markdown footer line stamped with today's UTC date (DD/MM/YYYY)."""
+    today_utc = datetime.now(timezone.utc)
+    return "\n\n*Report generated using A.I. on " + today_utc.strftime("%d/%m/%Y") + "*"
+
+
+def read_json(path: str) -> Dict[str, Any]:
+    """Read the UTF-8 JSON file at *path* and return the parsed content."""
+    with open(path, "r", encoding="utf-8") as fh:
+        payload = json.loads(fh.read())
+    return payload
+
+
+def call_llm(messages, model: str, max_retries: int = 4) -> str:
+    """
+    Send prebuilt chat messages to the model and return the stripped reply text.
+
+    - Makes at most `max_retries` attempts, sleeping min(2**attempt, 10)
+      seconds between attempts (no sleep after the final one).
+    - An empty completion counts as a failed attempt.
+    - Raises RuntimeError once every attempt has failed; the last underlying
+      error is named in the message and chained as __cause__.
+    """
+    last_err = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            resp = client.chat.completions.create(
+                model=model,
+                messages=messages,
+            )
+            text = (resp.choices[0].message.content or "").strip()
+            if text:
+                return text
+            last_err = RuntimeError("Empty completion")
+        except Exception as e:
+            # Deliberately broad: this is the retry boundary, and the error is
+            # re-surfaced below if no attempt succeeds.
+            last_err = e
+        if attempt < max_retries:
+            # Back off only when another attempt follows; sleeping before the
+            # final raise would just delay the failure.
+            time.sleep(min(2**attempt, 10))
+    raise RuntimeError(
+        f"LLM failed after {max_retries} attempts: {last_err}"
+    ) from last_err
+
+
+def _chunk_text(s: str, chunk_chars: int = 12000, overlap: int = 500) -> List[str]:
+    """
+    Split a long string into overlapping chunks.
+    We use this when synthesizing a clean goal statement from a long README.
+
+    Each chunk holds at most `chunk_chars` characters and starts `overlap`
+    characters before the end of the previous one. Text that already fits
+    (including None or "") comes back as a single-element list.
+
+    `overlap` is clamped to the range 0..chunk_chars-1 so every chunk starts
+    strictly after the previous one; without the clamp an overlap greater
+    than or equal to chunk_chars would never advance and loop forever.
+
+    Raises ValueError when the text needs splitting but `chunk_chars` is not
+    positive.
+    """
+    s = s or ""
+    if len(s) <= chunk_chars:
+        return [s]
+    if chunk_chars <= 0:
+        raise ValueError("chunk_chars must be a positive integer")
+    overlap = max(0, min(overlap, chunk_chars - 1))
+    chunks, start, n = [], 0, len(s)
+    while start < n:
+        end = min(start + chunk_chars, n)
+        chunks.append(s[start:end])
+        if end == n:
+            break
+        start = end - overlap  # overlap keeps continuity between chunks
+    return chunks
+
+
+def compute_portfolio_metrics(projects: List[Dict[str, Any]]):
+    """
+    Aggregate simple portfolio-wide numbers from the project summaries.
+
+    Returns a dict with:
+      - projects / active: total projects and those flagged 'active_in_window'
+      - commits / prs / issues / releases: how many entries the projects list
+        under 'recent_examples' for each kind
+      - top_themes / top_areas: up to six 'value (n)' strings, where n is how
+        many times the value appears across the projects' 'themes' and
+        'areas_touched' lists
+    """
+    # output key -> key used inside each project's 'recent_examples'
+    example_sources = {
+        "commits": "commits",
+        "prs": "pull_requests",
+        "issues": "issues",
+        "releases": "releases",
+    }
+    example_counts = dict.fromkeys(example_sources, 0)
+    theme_ctr, area_ctr = Counter(), Counter()
+
+    for proj in projects:
+        examples = proj.get("recent_examples") or {}
+        for out_key, src_key in example_sources.items():
+            example_counts[out_key] += len(examples.get(src_key) or [])
+        theme_ctr.update(
+            t["value"] for t in (proj.get("themes") or []) if t and t.get("value")
+        )
+        area_ctr.update(
+            a["value"]
+            for a in (proj.get("areas_touched") or [])
+            if a and a.get("value")
+        )
+
+    totals = {
+        "projects": len(projects),
+        "active": sum(1 for proj in projects if proj.get("active_in_window")),
+    }
+    totals.update(example_counts)
+    totals["top_themes"] = [f"{name} ({hits})" for name, hits in theme_ctr.most_common(6)]
+    totals["top_areas"] = [f"{name} ({hits})" for name, hits in area_ctr.most_common(6)]
+    return totals
+
+
+# ------------------ README → Goal (helper) ------------------
+def summarize_readme_goal(readme_text: str, model: str) -> str:
+    """
+    Distill a potentially long README into a crisp purpose statement.
+    Strategy:
+      1) Summarize each chunk into "purpose-only" bullets.
+      2) Synthesize a final goal from all bullets.
+
+    Makes one LLM call per README chunk plus one synthesis call, all through
+    call_llm(), and returns the stripped synthesis reply.
+
+    The prompt templates are dedented BEFORE the README text / bullets are
+    inserted. Dedenting afterwards is unreliable: inserted lines that start at
+    column 0 leave no common indentation, so the instructions would keep the
+    indentation of this source file.
+    """
+    chunk_template = textwrap.dedent(
+        """
+        From the README chunk below, write 3–5 ultra-concise bullets capturing the repository's PURPOSE only.
+        Avoid installation/usage details, badges, and marketing language.
+
+        --- README CHUNK {i}/{total} ---
+        {chunk}
+        """
+    ).strip()
+    synth_template = textwrap.dedent(
+        """
+        Combine the bullets below into a single 1–2 sentence statement describing the repository/project goal.
+        Do not invent details.
+
+        BULLETS:
+        {bullets}
+        """
+    ).strip()
+
+    chunks = _chunk_text(readme_text, chunk_chars=12000, overlap=500)
+    bullets = []
+    for i, ch in enumerate(chunks, 1):
+        messages = [
+            {
+                "role": "system",
+                "content": "You extract the core PURPOSE of a repository from README text.",
+            },
+            {
+                "role": "user",
+                "content": chunk_template.format(
+                    i=i, total=len(chunks), chunk=ch
+                ).strip(),
+            },
+        ]
+        bullets.append(call_llm(messages, model=model))
+
+    synth_messages = [
+        {
+            "role": "system",
+            "content": "You distill bullets into a faithful, succinct purpose statement.",
+        },
+        {
+            "role": "user",
+            "content": synth_template.format(bullets="\n".join(bullets)).strip(),
+        },
+    ]
+    return call_llm(synth_messages, model=model).strip()
+
+
+# ------------------ Render helpers ------------------
+def safe_kv_list(items: List[Dict[str, Any]], k="value", c="count", top=6) -> List[str]:
+    """
+    Render the first `top` entries of [{'value': 'foo', 'count': 7}, ...] as
+    ['foo (7)', ...].
+
+    Entries that are empty/None or whose `k` value is missing or falsy are
+    skipped. An entry without a usable `c` value is rendered as the bare value.
+    A None `items` yields an empty list.
+    """
+    rendered = []
+    for entry in (items or [])[:top]:
+        if not entry or not entry.get(k):
+            continue
+        has_count = c in entry and entry.get(c) is not None
+        rendered.append(f"{entry[k]} ({entry[c]})" if has_count else str(entry[k]))
+    return rendered
+
+
+def ex_lines(items: List[Dict[str, Any]], fields: List[str], n=4) -> List[str]:
+    """
+    Render up to the first `n` dicts as 'field1 - field2 - field3' lines.
+
+    Only truthy field values are included, in the order given by `fields`.
+    A dict with no truthy field produces no line.
+    """
+    if not items:
+        return []
+    rendered = []
+    for entry in items[:n]:
+        present = [str(value) for name in fields if (value := entry.get(name))]
+        if present:
+            rendered.append(" - ".join(present))
+    return rendered
+
+
+# ------------------ Prompt builders ------------------
+def _extract_goal_from_md(text: str) -> str | None:
+    """
+    Return the body of the 'Summary and Goal' section found by _GOAL_RE.
+
+    A leading BOM is dropped and line endings are normalized to '\\n' before
+    matching. Returns the stripped section text, or None when the heading is
+    missing or the section is empty.
+    """
+    normalized = (text or "").lstrip("\ufeff")
+    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")
+    hit = _GOAL_RE.search(normalized)
+    if hit is None:
+        return None
+    goal = (hit.group(1) or "").strip()
+    return goal if goal else None
+
+
+def _collect_project_goal(project_id: str) -> str | None:
+    """
+    Read reports/<PROJECT_ID>.md and extract its 'Summary and Goal' section
+    via _extract_goal_from_md().
+
+    Best effort: returns None when the file is missing, cannot be read, or
+    has no such section.
+    """
+    report = os.path.join(REPORTS_DIR, f"{project_id}.md")
+    if os.path.exists(report):
+        try:
+            with open(report, "r", encoding="utf-8") as fh:
+                return _extract_goal_from_md(fh.read())
+        except Exception:
+            # Unreadable report: treat it like a missing goal
+            pass
+    return None
+
+
+def _collect_repo_goals_for_project(project_id: str) -> list[str]:
+    """
+    Collect the goal text of every repo-level report that belongs to a project.
+
+    Repo-level reports are the files in REPORTS_DIR named
+    '<PROJECT_ID>__<owner>__<repo>.md'; each one's 'Summary and Goal' section
+    is extracted with _extract_goal_from_md().
+
+    Files are visited in sorted name order so the result is the same on every
+    run (os.listdir() returns entries in arbitrary order). Returns the
+    non-empty goal texts; the list is empty when REPORTS_DIR does not exist or
+    nothing matches.
+    """
+    if not os.path.isdir(REPORTS_DIR):
+        return []
+    prefix = f"{project_id}__"
+    goals: list[str] = []
+    for fname in sorted(os.listdir(REPORTS_DIR)):
+        # The project-level file '<PROJECT_ID>.md' never starts with
+        # '<PROJECT_ID>__', so this filter already excludes it.
+        if not (fname.startswith(prefix) and fname.endswith(".md")):
+            continue
+        try:
+            with open(os.path.join(REPORTS_DIR, fname), "r", encoding="utf-8") as f:
+                goal = _extract_goal_from_md(f.read())
+        except (OSError, UnicodeError):
+            # Ignore unreadable or undecodable files; keep going
+            continue
+        if goal:
+            goals.append(goal)
+    return goals
+
+
+def build_balanced_goal_corpus(projects: List[Dict[str, Any]]) -> str:
+    """
+    Create a single corpus representing the portfolio mission.
+
+    For each project (visited in project_id order) the text comes from the
+    first source that yields something:
+      1) Project-level MD goal section (reports/<PROJECT_ID>.md)
+      2) Fallback: all repo-level MD goal sections for that project, joined
+      3) Last resort: project.repo_context (readme -> description -> topics)
+
+    Each project's text is prefixed with '[PROJECT <id> — <name>]' and cut to
+    a per-project character cap (about 10k chars shared across projects,
+    clamped to 250..800 each) so no single project dominates. Projects with
+    no text are left out.
+
+    A project without an id skips the report lookups (there is no meaningful
+    report file name for it) and goes straight to repo_context.
+    """
+    per_cap = max(250, min(800, 10000 // max(1, len(projects))))
+    parts: list[str] = []
+
+    for p in sorted(projects, key=lambda x: (x.get("project_id") or "")):
+        pid = (p.get("project_id") or "").strip()
+        project_text = ""
+
+        if pid:
+            # (1) Prefer the project-level goal, (2) else the aggregated repo goals
+            project_text = _collect_project_goal(pid) or ""
+            if not project_text:
+                project_text = "\n\n".join(_collect_repo_goals_for_project(pid))
+
+        if not project_text:
+            # (3) Last resort: repo_context
+            ctx = p.get("repo_context") or {}
+            readme = (ctx.get("readme") or "").strip()
+            desc = (ctx.get("description") or "").strip()
+            topics = ctx.get("topics") or []
+            if readme:
+                project_text = readme
+            elif desc:
+                project_text = desc
+            elif topics:
+                project_text = "Topics: " + ", ".join(map(str, topics[:6]))
+
+        project_text = project_text.strip()
+        if project_text:
+            pname = (p.get("project_name") or pid).strip()
+            parts.append(f"[PROJECT {pid} — {pname}]\n{project_text[:per_cap]}")
+
+    return "\n\n".join(parts).strip()
+
+
+def build_portfolio_overview_prompt(
+    projects: List[Dict[str, Any]], window_label: str, model: str
+) -> str:
+    """
+    Build a prompt for a half-page portfolio summary with two sections:
+      1) Portfolio Goal — one unified mission synthesized across ALL projects (from project/repo MDs/readme fallbacks).
+      2) Recent Developments — PRIMARY: concatenation of 'Recent Developments' from project MDs; FALLBACK: rollup metrics.
+
+    `model` is accepted for interface compatibility; nothing in this function
+    reads it. Returns the complete user prompt as a string.
+    """
+    m = compute_portfolio_metrics(projects)
+
+    # 1) Unified goal corpus (one labelled, length-capped entry per project)
+    goal_corpus = build_balanced_goal_corpus(projects)
+    has_goal = bool(goal_corpus.strip())
+
+    # 2) Primary activity source: concatenate each project's Recent Developments from its MD
+    activity_corpus = build_portfolio_activity_corpus_from_project_mds(
+        projects, window_label
+    )
+    has_activity_md = bool(activity_corpus.strip())
+
+    # 3) Small set of inlineable examples (used only if we fall back)
+    def pick_inline_examples(ps: List[Dict[str, Any]], max_n: int = 4) -> List[str]:
+        """
+        Pick at most one Markdown-linked example per project (projects in
+        project_id order), preferring releases -> PRs -> commits -> issues,
+        and return the first `max_n` of them. Only the first item of each
+        bucket is considered, and it is used only if it carries a URL.
+        """
+        buckets = ["releases", "pull_requests", "commits", "issues"]
+        by_project = []
+        for p in sorted(ps, key=lambda x: (x.get("project_id") or "")):
+            ex = p.get("recent_examples") or {}
+            cand = None
+            for b in buckets:
+                arr = ex.get(b) or []
+                if arr:
+                    it = arr[0]
+                    name = (
+                        it.get("title")
+                        or it.get("release_name")
+                        or it.get("message_headline")
+                        or b
+                    )
+                    url = (
+                        it.get("pr_url")
+                        or it.get("release_url")
+                        or it.get("commit_url")
+                        or it.get("issue_url")
+                    )
+                    if url:
+                        cand = f"[{name}]({url})"
+                        break
+            if cand:
+                by_project.append(cand)
+        return by_project[:max_n]
+
+    examples = pick_inline_examples(projects, max_n=4)
+    examples_str = "; ".join(examples) if examples else "-"
+    # 4) Project list (context only; do not require the model to list them)
+    proj_list = (
+        ", ".join(
+            [
+                f"{p.get('project_id')}"
+                for p in sorted(projects, key=lambda x: (x.get("project_id") or ""))
+            ]
+        )
+        or "-"
+    )
+
+    # 5) Final prompt
+    # NOTE: the GOOD examples below must be well-formed Markdown links
+    # ("[text](url)"), since the model imitates them.
+    return textwrap.dedent(
+        f"""
+You are writing an **executive summary** for a research **portfolio** (multiple projects involving multiple repos possible).
+
+STRICT RULES
+- Use ONLY the GOAL CORPUS and, if present, the ACTIVITY CORPUS below; no outside knowledge.
+- If ACTIVITY CORPUS is empty, use PORTFOLIO METRICS (fallback) for 'Recent Developments'.
+- Do not output bullet lists of dated events. Synthesize **what actually changed**.
+- Use ONLY the information below. Do not invent anything. Do **not** list individual project names in the output.
+- Balance coverage across projects; all projects inform the narrative.
+- Do not let a single project dominate more than ~40% of sentences; highlight cross-cutting themes spanning multiple projects when possible, but all projects must inform narrative.
+- Inline links are allowed when they aid the narrative.
+- Support claims with inline Markdown links **only** within the sentence/statement within the narrative and prose. The paragraphs must flow.
+- No dated bullet lists or lists; synthesize into concise paragraphs.
+- Do not write bracketed anchors like “[commit …]”, “[PR …]”, or “[issue …]” under any circumstance.
+- Do not name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.) under any circumstance.
+- Do not add a link without it being hyperlinked under any circumstance.
+- This is how to include inline links:
+EXAMPLE 1:
+GOOD: A [consolidating pull request](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1) further structured these changes.
+BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+EXAMPLE 2:
+GOOD: A  [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows.
+BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268).
+EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). Instead use inline links within the narrative):
+GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503)
+BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37).
+
+- Headings must appear exactly as below. 
+- Keep the whole report under ~250 words. Use these exact sections:
+
+# Executive Summary                          
+## Portfolio Summary and Goal
+Write a single, unified 2-5 sentence mission that captures the overall summary of the projects and what ALL projects together aim to achieve.
+Base this ONLY on GOAL CORPUS. You MUST synthesize across ALL content, not a subset.
+Do not name repositories, and only name projects if it aids in the narrative. {"Do NOT write 'Not stated'." if has_goal else "If the corpus is empty, write 'Not stated'."}
+Also identify the scientific communities or users who benefit if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals);
+otherwise do not include this sentence at all. Keep this brief (1–2 sentences).
+
+## Recent Developments ({window_label})
+If ACTIVITY CORPUS is present, synthesize from it. Otherwise, use PORTFOLIO METRICS (fallback).
+Explain substantive work (features, fixes, refactors, tests, infra, docs), issues addressed, and progress made.
+Think big picture: are multiple issues or commits working towards the same goal? Use that goal in the narrative rather than specifics about the code change.
+Tie claims to cross-cutting themes/areas when evident.
+Avoid dates, project/repo lists, and changelog-style enumeration. Focus on work progress towards the overall goal.
+If one repository has no changes, simply do not include in the narrative, do not state anything.
+
+ACTIVITY CORPUS (from project MD '## Recent Developments ({window_label})'; primary source):
+{activity_corpus if has_activity_md else "(empty)"}
+
+PORTFOLIO METRICS (fallback; for reasoning only; do not include directly):
+- Projects: {m["projects"]} total; {m["active"]} active
+- Activity in {window_label}: {m["commits"]} commits, {m["prs"]} PRs, {m["issues"]} issues, {m["releases"]} releases
+- Top themes: {", ".join(m["top_themes"]) if m["top_themes"] else "-"}
+- Top areas touched: {", ".join(m["top_areas"]) if m["top_areas"] else "-"}
+- Inline examples: {examples_str}
+
+GOAL CORPUS (distill into a unified purpose; do not copy verbatim):
+{goal_corpus if goal_corpus else "(empty)"}
+CONTEXT (do not echo; for balance only):
+PROJECTS: {proj_list}
+    """
+    ).strip()
+
+
+# ------------------ Main (CLI) ------------------
+def main():
+    """
+    CLI entry point: write the portfolio executive summary as Markdown.
+
+    Reads <SUMMARY_DIR>/_portfolio.json, optionally keeps only the project
+    ids given with --only, builds the overview prompt, asks the model for the
+    two-section summary, and writes title + summary + footer to --out.
+
+    Exits with SystemExit when the portfolio JSON is missing or when no
+    project is left after filtering.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate a portfolio executive report for ALL projects."
+    )
+    parser.add_argument(
+        "--model",
+        default=DEFAULT_MODEL,
+        help="OpenAI model name (default from env or gpt-4o-mini)",
+    )
+    parser.add_argument(
+        "--window-label",
+        default="the last 90 days",
+        help='Label for the time window, e.g., "May–July 2025"',
+    )
+    parser.add_argument(
+        "--only",
+        nargs="*",
+        default=None,
+        help="Optional list of project IDs to include",
+    )
+    parser.add_argument(
+        "--out",
+        default=os.path.join(REPORTS_DIR, "_portfolio_full.md"),
+        help="Output Markdown path (default: reports/_portfolio_full.md)",
+    )
+    args = parser.parse_args()
+
+    # 1) Load the machine portfolio JSON created by rollup_projects.py
+    portfolio_path = os.path.join(SUMMARY_DIR, "_portfolio.json")
+    if not os.path.exists(portfolio_path):
+        raise SystemExit(f"Missing {portfolio_path}. Run rollup_projects.py first.")
+    portfolio = read_json(portfolio_path)
+
+    # Extract the list of project dicts
+    projects = portfolio.get("projects") or []
+
+    # Optional: filter to a subset of project IDs
+    if args.only:
+        keep = set(args.only)
+        projects = [p for p in projects if p.get("project_id") in keep]
+
+    # Ensure we have something to summarize
+    if not projects:
+        raise SystemExit("No projects to summarize (after filtering).")
+
+    # 2) Build the portfolio-level "Executive Overview" text
+    overview_prompt = build_portfolio_overview_prompt(
+        projects, args.window_label, args.model
+    )
+    # Adjacent string literals are concatenated as-is, so every piece except
+    # the last ends with a space to keep sentences apart.
+    system_prompt = (
+        "You are a careful, evidence-bound summarizer that follows directions exactly. "
+        "You take information from multiple projects and summarize it into a cohesive and succinct executive summary, highlighting key themes. "
+        "Your summaries on the project's activity highlight the overall scope of the work done and the work progress. "
+        "You are very observant and are able to take multiple project's progress and identify general trends of 'what work has been done across all projects'. "
+        "Use ONLY the information in the user message; no external knowledge. "
+        "Output exactly two Markdown sections with these headings and nothing else: "
+        f"'## Portfolio Summary and Goal' and '## Recent Developments ({args.window_label})'. "
+        "No bullets. No owners. No generic KPIs. No fluff. "
+    )
+    overview_text = call_llm(
+        [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": overview_prompt},
+        ],
+        model=args.model,
+    )
+
+    project_count = len(projects)
+    md = (
+        f"# Portfolio Summary - {project_count} projects ({args.window_label})\n\n"
+        f"{overview_text}\n"
+        f"{_footer()}\n"
+    )
+
+    # --out may point outside REPORTS_DIR; create its folder if needed so the
+    # LLM result is not lost to a missing-directory error.
+    out_dir = os.path.dirname(args.out)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(args.out, "w", encoding="utf-8") as f:
+        f.write(md)
+    print(f"Wrote {args.out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/summarize_projects.py b/src/summarize_projects.py
new file mode 100644
index 0000000..47e9db4
--- /dev/null
+++ b/src/summarize_projects.py
@@ -0,0 +1,440 @@
+# src/summarize_projects.py
+"""
+Generate per-project executive summaries by calling an LLM.
+"""
+
+import os, json, glob, argparse, time, re
+from datetime import datetime, timezone
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# -------- Environment & client setup --------
+# Load variables from a local .env file (if any) into the process environment.
+load_dotenv()
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    # Fail fast if API key is missing so the user knows to fix .env
+    # NOTE: this runs at import time, so importing this module without a key exits.
+    raise SystemExit("Missing OPENAI_API_KEY in .env")
+
+
+# Select model from env or default to a small, cost-effective model
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
+
+# Initialize the OpenAI client (reads key from env automatically)
+# Per-request HTTP timeout, overridable through OPENAI_HTTP_TIMEOUT.
+DEFAULT_HTTP_TIMEOUT = float(os.environ.get("OPENAI_HTTP_TIMEOUT", "60"))  # seconds
+# max_retries=0: presumably so that call_llm()'s own retry loop is the only
+# retry layer -- TODO confirm against the OpenAI client documentation.
+client = OpenAI(
+    timeout=DEFAULT_HTTP_TIMEOUT, max_retries=0
+)  # applies connect/read/write timeouts
+
+
+# -------- Paths --------
+REPORTS_DIR = "reports"  # where we write the per-project markdown reports
+SUMMARY_DIR = "data/summary"  # where per-project JSONs live (from rollup_projects.py)
+# Create the reports directory up front (import-time side effect).
+os.makedirs(REPORTS_DIR, exist_ok=True)
+# Captures the body under a "## Summary and Goal" heading, up to the next
+# "## " heading or the end of the text.
+_GOAL_RE = re.compile(
+    r"^##\s*Summary and Goal\s*\n(.*?)(?:\n##\s+|\Z)", re.DOTALL | re.MULTILINE
+)
+# Captures the body under a "## Recent Developments..." heading (any suffix on
+# the heading line is skipped), up to the next "## " heading or the end of the text.
+_ACTIVITY_RE = re.compile(
+    r"^##\s*Recent Developments.*?\n(.*?)(?:\n##\s+|\Z)", re.DOTALL | re.MULTILINE
+)
+
+
+# -------- Small helpers --------
+def _read_repo_activity_from_md(project_id: str, owner: str, repo: str) -> str | None:
+    """
+    Return the stripped body of the '## Recent Developments' section found in
+    reports/<project_id>__<owner>__<repo>.md.
+
+    Returns None when the report file does not exist, when the section is not
+    present, or when the section body is empty after stripping.
+    """
+    report_path = os.path.join(REPORTS_DIR, f"{project_id}__{owner}__{repo}.md")
+    if os.path.exists(report_path):
+        with open(report_path, "r", encoding="utf-8") as handle:
+            found = _ACTIVITY_RE.search(handle.read())
+        if found:
+            section = found.group(1).strip()
+            if section:
+                return section
+    return None
+
+
+def read_json(p: str) -> dict:
+    """Read the UTF-8 file at path ``p`` and return its parsed JSON content."""
+    with open(p, mode="r", encoding="utf-8") as handle:
+        raw_text = handle.read()
+    return json.loads(raw_text)
+
+
+def _footer():
+    """Return the italic report footer stamped with today's UTC date (DD/MM/YYYY)."""
+    today_utc = datetime.now(timezone.utc)
+    return "\n\n*Report generated using A.I. on " + f"{today_utc:%d/%m/%Y}" + "*"
+
+
+def _read_repo_goal_from_md(project_id: str, owner: str, repo: str) -> str | None:
+    """
+    Return the stripped body of the '## Summary and Goal' section found in
+    reports/<project_id>__<owner>__<repo>.md.
+
+    Returns None when the report file does not exist, when the section is not
+    present, or when the section body is empty after stripping.
+    """
+    path = os.path.join(REPORTS_DIR, f"{project_id}__{owner}__{repo}.md")
+    if not os.path.exists(path):
+        return None
+    # Use a context manager so the handle is closed deterministically instead of
+    # being left to the garbage collector.
+    with open(path, "r", encoding="utf-8") as f:
+        text = f.read()
+    m = _GOAL_RE.search(text)
+    if not m:
+        return None
+    g = m.group(1).strip()
+    return g or None
+
+
+def build_project_goal_corpus_from_repo_mds(
+    project: dict, per_repo_char_cap: int = 600
+) -> str:
+    """
+    Collect the '## Summary and Goal' text from the repo-level MD files of a project.
+
+    Repos listed under project["repos"] are tried first, sorted by (owner, repo).
+    If none of them yields goal text, files named
+    reports/<PROJECT_ID>__<owner>__<repo>.md are scanned instead.
+
+    Every repo that has goal text contributes one "[REPO owner/repo]" block,
+    truncated to a cap derived from the number of repos being considered so the
+    corpus stays balanced. Blocks are joined by blank lines; "" is returned when
+    no goal text is found.
+
+    Note: ``per_repo_char_cap`` is accepted for backward compatibility but is not
+    applied; the effective cap always comes from ``_cap_for``.
+    """
+    pid = str(project.get("project_id") or "").strip()
+
+    # small fairness cap per repo; keeps the corpus balanced
+    def _cap_for(n, soft_total=8000, min_cap=250, max_cap=900):
+        return max(min_cap, min(max_cap, soft_total // max(1, n)))
+
+    def _collect(candidates, n_repos):
+        """Read each candidate's goal text and format it as a capped, labelled block."""
+        cap = _cap_for(n_repos)
+        blocks = []
+        for owner, repo in candidates:
+            g = _read_repo_goal_from_md(pid, owner, repo)
+            if g:
+                blocks.append(f"[REPO {owner}/{repo}]\n{g.strip()[:cap]}")
+        return blocks
+
+    # 1) If project JSON lists repos, use that (best)
+    repos = sorted(
+        project.get("repos") or [],
+        key=lambda r: (r.get("owner") or "", r.get("repo") or ""),
+    )
+    listed = [
+        (r.get("owner"), r.get("repo"))
+        for r in repos
+        if r.get("owner") and r.get("repo")
+    ]
+    parts = _collect(listed, len(repos))
+
+    # 2) Fallback: scan reports for any repo files that match this project id
+    if not parts:
+        # filename pattern: <pid>__<owner>__<repo>.md
+        name_re = re.compile(rf"^{re.escape(pid)}__([^_]+)__(.+)\.md$")
+        scanned = []
+        for path in sorted(glob.glob(os.path.join(REPORTS_DIR, f"{pid}__*__*.md"))):
+            m = name_re.match(os.path.basename(path))
+            if m:
+                scanned.append((m.group(1), m.group(2)))
+        # Size the cap from the files actually found; the original reused the cap
+        # computed from the (empty or unhelpful) repo list, so it never scaled here.
+        parts = _collect(scanned, len(scanned))
+
+    return ("\n\n".join(parts)).strip()
+
+
+def build_project_activity_corpus_from_repo_mds(
+    project: dict, window_label: str, per_repo_char_cap: int = 900
+) -> str:
+    """
+    Gather the '## Recent Developments' text of every repo-level MD file that
+    belongs to this project into one corpus.
+
+    Repos come from project["repos"] when that list is non-empty; otherwise the
+    reports directory is scanned for <PROJECT_ID>__<owner>__<repo>.md files.
+    A repo whose section is exactly the "**No changes in <window_label>**"
+    boilerplate is left out. Each remaining repo yields one "[REPO owner/repo]"
+    block truncated to a cap derived from the number of candidates.
+
+    ``per_repo_char_cap`` is not consulted by this function.
+    """
+    pid = str(project.get("project_id") or "").strip()
+    listed = project.get("repos") or []
+
+    if listed:
+        # Prefer the explicit repo list from the project JSON.
+        pairs = [
+            (entry.get("owner"), entry.get("repo"))
+            for entry in listed
+            if entry.get("owner") and entry.get("repo")
+        ]
+    else:
+        # No explicit list: derive (owner, repo) from matching report filenames.
+        name_re = re.compile(rf"^{re.escape(pid)}__([^_]+)__(.+)\.md$")
+        report_glob = os.path.join(REPORTS_DIR, f"{pid}__*__*.md")
+        hits = (
+            name_re.match(os.path.basename(fp)) for fp in sorted(glob.glob(report_glob))
+        )
+        pairs = [(hit.group(1), hit.group(2)) for hit in hits if hit]
+
+    # Per-repo character budget, bounded to the 300..900 range.
+    limit = max(300, min(900, 12000 // max(1, len(pairs) or 1)))
+    boilerplate = f"**No changes in {window_label}**"
+
+    blocks = []
+    for owner, repo in pairs:
+        activity = _read_repo_activity_from_md(pid, owner, repo)
+        if not activity or activity.strip() == boilerplate:
+            continue
+        blocks.append(f"[REPO {owner}/{repo}]\n{activity.strip()[:limit]}")
+
+    return ("\n\n".join(blocks)).strip()
+
+
+def call_llm(prompt: str, model: str, max_retries: int = 4) -> str:
+    """
+    Send ``prompt`` as the user message (after a fixed system prompt) to the chat
+    completions endpoint and return the stripped completion text.
+
+    Up to ``max_retries`` attempts are made. Any exception, including an empty
+    completion, triggers a retry after a backoff of min(2**attempt, 10) seconds.
+
+    Raises:
+        RuntimeError: if no attempt produced non-empty text (also when
+            ``max_retries`` is 0 or negative, in which case no call is made).
+    """
+    # Adjacent pieces are concatenated verbatim, so each must end with its own
+    # separator. The original was missing the space after "exactly." and the
+    # closing quote/period after the second heading.
+    system_prompt = (
+        "You are a careful, evidence-bound summarizer that follows directions exactly. "
+        "You take information from multiple repositories and summarize it into a cohesive and succint excecutive summary, highlighting key themes. "
+        "Your summaries on the project's activity highlight the overall scope of the work done and the work progress across ALL repositories. "
+        "You are very observant and are able to take multiple respoitories' progress and identify general trends of 'what work has been done across all repositories'. "
+        "Use ONLY the information in the user message; no external knowledge. "
+        "Output exactly two Markdown sections with these headings and nothing else: "
+        "'## Summary and Goal' and '## Recent Developments (<window label>)'. "
+        "No bullets. No owners. No generic KPIs. No fluff. "
+    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt},
+    ]
+
+    # Initialised so the final raise works even when the loop body never runs.
+    last_err = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            resp = client.chat.completions.create(model=model, messages=messages)
+            text = (resp.choices[0].message.content or "").strip()
+            if not text:
+                raise RuntimeError("Empty completion")
+            return text
+        except Exception as e:
+            last_err = e
+            # Back off only when another attempt follows; sleeping after the
+            # last failure just delays the error.
+            if attempt < max_retries:
+                time.sleep(min(2**attempt, 10))
+    raise RuntimeError(
+        f"LLM failed after {max_retries} attempts: {last_err}"
+    ) from last_err
+
+
+def make_project_prompt(project_summary: dict, window_label: str) -> str:
+    """
+    Build the user prompt for one project's executive summary.
+
+    The prompt combines:
+      - the goal corpus and the activity corpus read from the repo-level reports,
+      - rollup evidence from project_summary["recent_examples"] (commits, pull
+        requests, issues, releases), balanced round-robin across repositories,
+      - identity signals from project_summary["interest_signals"]
+        ("stargazers" and "fork_owners").
+
+    Returns the prompt text with surrounding whitespace stripped.
+    """
+    pid = project_summary.get("project_id", "")
+    pname = project_summary.get("project_name") or pid
+    interest = project_summary.get("interest_signals", {}) or {}
+    examples = project_summary.get("recent_examples", {}) or {}
+
+    # Both corpora are built exactly once and reused in the template below;
+    # building them reads every repo report from disk.
+    goal_corpus = build_project_goal_corpus_from_repo_mds(project_summary)
+    has_goal = bool(goal_corpus)
+
+    # Activity corpus from repo MD "## Recent Developments" (primary activity source)
+    activity_corpus = build_project_activity_corpus_from_repo_mds(
+        project_summary, window_label
+    )
+    has_activity_md = bool(activity_corpus)
+
+    # Rollup evidence (fallback if no activity corpus available)
+    def kv_list(pairs):
+        return [
+            f"{p['value']} ({p['count']})"
+            for p in (pairs or [])
+            if p and p.get("value") is not None
+        ]
+
+    sg_k = kv_list(interest.get("stargazers"))
+    fk_k = kv_list(interest.get("fork_owners"))
+
+    def ex_lines_grouped(items, fields, n_total=10, per_repo=3):
+        """
+        Build lines like 'field1 — field2 — ...' but balance across repos.
+        Items are bucketed by their ('owner', 'repo') values and consumed
+        round-robin, at most per_repo items per repo and n_total lines overall.
+        """
+        if not items:
+            return []
+        buckets = {}
+        for it in items:
+            if not isinstance(it, dict):
+                continue
+            key = (it.get("owner"), it.get("repo"))
+            buckets.setdefault(key, []).append(it)
+        # round-robin across repos
+        lines, took = [], {k: 0 for k in buckets}
+        while len(lines) < n_total:
+            progressed = False
+            for k, arr in buckets.items():
+                if took[k] >= min(per_repo, len(arr)):
+                    continue
+                it = arr[took[k]]
+                took[k] += 1
+                # Consuming an item counts as progress even when it yields no
+                # line; otherwise a round of empty items would end the loop
+                # while later items are still unread.
+                progressed = True
+                parts = [str(it.get(f)) for f in fields if it.get(f)]
+                if parts:
+                    # prefix repo for even clearer balance to the model:
+                    lines.append(f"[{k[0]}/{k[1]}] — " + " — ".join(parts))
+                if len(lines) >= n_total:
+                    break
+            if not progressed:
+                break
+        return lines
+
+    def bullets(lines):
+        """Render evidence lines as '- ' bullets, or a single '(none)' bullet."""
+        return "\n".join("- " + line for line in (lines or ["(none)"]))
+
+    commit_lines = ex_lines_grouped(
+        examples.get("commits"),
+        ["committed_at", "author_login", "message_headline", "commit_url"],
+    )
+    pr_lines = ex_lines_grouped(
+        examples.get("pull_requests"),
+        ["created_at", "author_login", "title", "pr_url", "state"],
+    )
+    issue_lines = ex_lines_grouped(
+        examples.get("issues"),
+        ["created_at", "author_login", "title", "issue_url", "labels"],
+    )
+    release_lines = ex_lines_grouped(
+        examples.get("releases"),
+        ["published_at", "release_name", "release_tag", "release_url"],
+    )
+
+    # Activity-present flag: prefer the repo-MD view; else infer from rollup examples
+    activity_present = bool(has_activity_md) or any(
+        bool(examples.get(k))
+        for k in ["commits", "pull_requests", "issues", "releases"]
+    )
+    # The rules in the prompt refer to ACTIVITY_PRESENT=yes/no, so emit those
+    # exact tokens rather than true/false.
+    activity_flag = "yes" if activity_present else "no"
+
+    # Skip incomplete repo entries instead of raising KeyError, matching how the
+    # corpus builders treat them.
+    repo_list = (
+        ", ".join(
+            f"{r.get('owner')}/{r.get('repo')}"
+            for r in (project_summary.get("repos") or [])
+            if r.get("owner") and r.get("repo")
+        )
+        or "(unknown)"
+    )
+
+    not_stated_rule = (
+        "Do NOT write 'Not stated'."
+        if has_goal
+        else "If the corpus is empty, write 'Not stated'."
+    )
+    goal_block = goal_corpus if has_goal else "(empty)"
+    activity_block = activity_corpus if has_activity_md else "(empty)"
+    stars_block = ", ".join(sg_k) if sg_k else "(none)"
+    forks_block = ", ".join(fk_k) if fk_k else "(none)"
+
+    return f"""
+You are writing an **executive summary** for a research **project** (multiple repos possible).
+
+STRICT RULES
+- Use ONLY the GOAL CORPUS and (if present) the ACTIVITY CORPUS from repo-level .md files; no outside knowledge.
+- If ACTIVITY CORPUS is empty, use the EVIDENCE block as a fallback for "Recent Developments".
+- Do not output bullet lists of dated events. Synthesize **what actually changed** given the changes observed.
+- If ACTIVITY_PRESENT=no, under “Recent Developments” write exactly: **No changes in {window_label}**. Do not write ACTIVITY_PRESENT=yes or no.
+- Use inline links inside the prose only when it adds to the narrative or when the example is truly informative (e.g., "...includes [work to fix X](link to where X is fixed)). 
+- Inline links to commit/change/issue are only present when it's very representative of the point you are trying to make.        
+- Avoid letting any single repository account for most of the narrative.
+- Balance coverage across repositories; ensure all repositories are represented where possible.
+- Do not under any circumstance name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.)
+- Do not let a single repository dominate the narrative; integrate themes spanning multiple repos.
+- Do not add a link without it being hyperlinked under any circumstance.
+- This is how to include incline links:
+EXAMPLE 1:
+GOOD: A [consolidating pull request]((https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1)) further structured these changes.
+BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+EXAMPLE 2:
+GOOD: A  [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows.
+BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268).
+EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). Instead use inline links within the narrative):
+GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503)
+BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37).
+- Keep the whole report under ~250 words. Use these 2 exact sections:
+
+## Summary and Goal
+Write 2–8 sentences that **synthesize a single project summary and goal** across ALL repositories.
+Base ONLY on GOAL CORPUS. Do **not** list repository names, but you can reference repositories if needed in the narrative. {not_stated_rule}
+Also identify the scientific communities or users who benefit if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals);
+otherwise do not include this sentence at all. Keep this brief (1–2 sentences).
+
+## Recent Developments ({window_label})
+If ACTIVITY CORPUS is present, synthesize from it. Otherwise use EVIDENCE as the source.
+Explain the **substance** of changes across repos (features/fixes/docs/refactor/tests/infra/deps),
+what areas of the codebase were touched (infer from titles/file cues if present), issues addressed and the scope of issues opened, and releases. 
+Support claims with **inline links** to specific commits/PRs/issues/releases only when it fits the narrative. Do not list dates or create a timeline. 
+The paragraph should be in prose narrative form, with at most 6 links total, if any. 
+Think big picture: are multiple issues or commits working towards the same goal? Use that goal in the narrative rather than specifics about the code change.
+Avoid counts without explanation and only mention counts when they aid the narrative. 
+If one repository has no changes, simply do not include in the narrative: do not state anything.
+
+ACTIVITY_PRESENT: {activity_flag}
+
+CONTEXT (INPUT; do not echo verbatim)
+PROJECT: {pname} ({pid})
+REPOSITORIES: {repo_list}
+GOAL CORPUS (from repo MD '## Goal' sections):
+{goal_block}
+
+ACTIVITY CORPUS (from repo MD '## Recent Developments' sections; primary source):
+{activity_block}
+
+EVIDENCE (fallback source if ACTIVITY CORPUS is empty — for reasoning; do not list verbatim and do not use the names, only hyperlink URL)
+Commits:
+{bullets(commit_lines)}
+Pull Requests:
+{bullets(pr_lines)}
+Issues:
+{bullets(issue_lines)}
+Releases:
+{bullets(release_lines)}
+
+STARS (identity signals): {stars_block}
+FORKS (identity signals): {forks_block}
+""".strip()
+
+
+def write_report(
+    project_id: str, project_name: str, repo_count: int, window_label: str, body_md: str
+):
+    """
+    Write the project report to REPORTS_DIR/<project_id>.md and print its path.
+
+    The file holds a title line (project name, falling back to the id, plus repo
+    count and window label), a blank line, the stripped body, the footer, and a
+    trailing newline.
+    """
+    display_name = project_name or project_id
+    heading = f"# Executive Summary: Project {display_name} — {repo_count} repositories — {window_label}"
+    destination = os.path.join(REPORTS_DIR, f"{project_id}.md")
+    with open(destination, "w", encoding="utf-8") as out:
+        out.write("".join((heading, "\n\n", body_md.strip(), _footer(), "\n")))
+    print(f"Wrote {destination}")
+
+
+def summarize_project_file(path: str, window_label: str, model: str):
+    """
+    Produce the Markdown report for the project JSON stored at ``path``.
+
+    The project JSON is turned into a prompt, sent to the LLM, and the answer is
+    written with write_report(). If the LLM call raises (or returns nothing), a
+    short placeholder body describing the failure is written instead.
+    """
+    project = read_json(path)
+    project_id = project.get("project_id", "UNKNOWN")
+    project_name = project.get("project_name") or project_id
+    n_repos = int(project.get("repo_count") or 0)
+
+    # Prompt is derived from the repo-level reports plus the rollup evidence.
+    llm_input = make_project_prompt(project, window_label)
+
+    try:
+        body = call_llm(llm_input, model)
+        if not body:
+            raise RuntimeError("LLM returned empty content")
+    except Exception as err:
+        # Still emit a report so the failure is visible in the output directory.
+        body = "".join(
+            [
+                "# Executive Summary\n\n",
+                f"_LLM call failed: {err}_\n\n",
+                f"- Project ID: {project_id}\n",
+                f"- Project Name: {project_name}\n",
+                f"- Window: {window_label}\n",
+            ]
+        )
+
+    write_report(project_id, project_name, n_repos, window_label, body)
+
+
+def main():
+    """
+    CLI entrypoint:
+      - loads all per-project JSONs (excluding _portfolio.json),
+      - optionally filters by --only project IDs,
+      - generates one Markdown file per remaining project.
+
+    Exits with a message when SUMMARY_DIR holds no project JSON files, and with a
+    different message when --only matches none of the files that do exist.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate per-project executive summaries."
+    )
+    parser.add_argument(
+        "--model",
+        default=DEFAULT_MODEL,
+        help="OpenAI model (default from env or gpt-4o-mini)",
+    )
+    parser.add_argument(
+        "--window-label",
+        default="the last 90 days",
+        help='Human label for the window, e.g., "May–Jul 2025"',
+    )
+    parser.add_argument(
+        "--only",
+        nargs="*",
+        default=None,
+        help="Optional list of project IDs to include",
+    )
+    args = parser.parse_args()
+
+    # Find all project JSONs written by rollup (ignore the portfolio index file)
+    paths = sorted(glob.glob(os.path.join(SUMMARY_DIR, "*.json")))
+    paths = [p for p in paths if os.path.basename(p) != "_portfolio.json"]
+
+    if not paths:
+        raise SystemExit(
+            f"No project JSON files found in {SUMMARY_DIR}. Run rollup first."
+        )
+
+    # Optional filter: only summarize specified project IDs
+    if args.only:
+        ids = set(args.only)
+        # Note: we need to peek to get IDs; small and fine for POC
+        paths = [p for p in paths if read_json(p).get("project_id") in ids]
+        if not paths:
+            # Files exist but the filter excluded all of them; report that
+            # instead of suggesting the rollup has not been run.
+            raise SystemExit(
+                f"No project JSON files in {SUMMARY_DIR} match --only: "
+                + ", ".join(sorted(ids))
+            )
+
+    # Generate the per-project reports
+    for p in paths:
+        summarize_project_file(p, args.window_label, args.model)
+
+
+# Script entry point: run the CLI only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/src/summarize_repos.py b/src/summarize_repos.py
new file mode 100644
index 0000000..7b988ab
--- /dev/null
+++ b/src/summarize_repos.py
@@ -0,0 +1,539 @@
+# src/summarize_repos.py
+"""
+Generate repository-level summaries grouped by project.
+"""
+
+import os, argparse, textwrap, time
+import pandas as pd
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+from datetime import datetime, timezone
+from goal_from_code import (
+    shallow_clone,
+    synthesize_repo_goal_from_code,
+    delete_clone_path,
+)
+import pathlib
+import shutil
+
+# ---------- Setup: env + OpenAI client ----------
+load_dotenv()  # pulls OPENAI_API_KEY/OPENAI_MODEL from .env, if present
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    # fail fast; nothing will work without a key
+    # NOTE: this runs at import time, so importing this module without a key exits.
+    raise SystemExit("Missing OPENAI_API_KEY in .env")
+
+# Default model can be overridden via --model
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5-nano")
+# Per-request HTTP timeout, overridable through OPENAI_HTTP_TIMEOUT.
+DEFAULT_HTTP_TIMEOUT = float(os.environ.get("OPENAI_HTTP_TIMEOUT", "60"))  # seconds
+# max_retries=0: presumably so that call_llm()'s own retry loop is the only
+# retry layer -- TODO confirm against the OpenAI client documentation.
+client = OpenAI(
+    timeout=DEFAULT_HTTP_TIMEOUT, max_retries=0
+)  # applies connect/read/write timeouts
+
+# Paths
+CLEAN_DIR = "data/clean"  # combined "_all_<table>.parquet" files (see load_repo_frames)
+REPORTS_DIR = "reports"
+# Create the reports directory up front (import-time side effect).
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+SEED_CSV = "data/projects_seed.csv"  # default input of load_seed()
+RAW_DIR = "data/raw/github"  # "<owner>__<repo>.json" files (see read_raw_json)
+
+
+# ---------- Report footer + LLM helper with simple retries ----------
+def _footer():
+    """Return the italic report footer stamped with today's UTC date (DD/MM/YYYY)."""
+    stamp = format(datetime.now(timezone.utc), "%d/%m/%Y")
+    return "".join(["\n\n", "*Report generated using A.I. on ", stamp, "*"])
+
+
+def call_llm(messages, model: str, max_retries: int = 4) -> str:
+    """
+    Call chat.completions.create with ``messages`` and return the stripped text
+    of the first choice.
+
+    The call is attempted up to ``max_retries`` times. Every failure, including
+    an empty completion, is followed by a sleep of min(2**attempt, 10) seconds.
+    Once the attempts are used up a RuntimeError carrying the last error is raised.
+    """
+    failure = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            completion = client.chat.completions.create(model=model, messages=messages)
+            content = (completion.choices[0].message.content or "").strip()
+            if content:
+                return content
+            raise RuntimeError("Empty completion")
+        except Exception as exc:
+            failure = exc
+            # Backoff grows as 2, 4, 8 seconds and is capped at 10.
+            delay = min(2**attempt, 10)
+            time.sleep(delay)
+    raise RuntimeError(f"LLM failed after {max_retries} attempts: {failure}")
+
+
+# ---------- Load the combined per-table Parquets ----------
+def load_repo_frames():
+    """
+    Load every combined table from CLEAN_DIR into a name -> DataFrame mapping.
+
+    Each table is looked up as "_all_<name>.parquet". A table whose file does
+    not exist is represented by an empty DataFrame, so every expected key is
+    always present in the result.
+    """
+    expected = (
+        "commits",
+        "issues",
+        "pull_requests",
+        "releases",
+        "stargazers",
+        "forks",
+        "pr_files",
+    )
+    frames = {}
+    for table_name in expected:
+        parquet_file = os.path.join(CLEAN_DIR, f"_all_{table_name}.parquet")
+        if not os.path.exists(parquet_file):
+            frames[table_name] = pd.DataFrame()
+            continue
+        frames[table_name] = pd.read_parquet(parquet_file)
+    return frames
+
+
+def load_seed(path: str = SEED_CSV) -> pd.DataFrame:
+    """
+    Read the seed CSV at ``path`` and make sure the project_id, project_name,
+    owner and repo columns exist, adding any missing one filled with None.
+    """
+    seed = pd.read_csv(path)
+    absent = [
+        column
+        for column in ("project_id", "project_name", "owner", "repo")
+        if column not in seed.columns
+    ]
+    for column in absent:
+        seed[column] = None
+    return seed
+
+
+def read_raw_json(owner: str, repo: str) -> dict | None:
+    """
+    Return the parsed content of RAW_DIR/<owner>__<repo>.json, or None when that
+    file does not exist.
+    """
+    raw_path = os.path.join(RAW_DIR, f"{owner}__{repo}.json")
+    if os.path.exists(raw_path):
+        with open(raw_path, "r", encoding="utf-8") as handle:
+            return json.loads(handle.read())
+    return None
+
+
+# ---------- Determine the universe of repos to summarize ----------
+def group_by_repo(tables, seed_df: pd.DataFrame):
+    """
+    Union of repos seen in the tables AND listed in the seed.
+
+    Table rows contribute when the table has all of project_id, project_name,
+    owner and repo, and the row has no null among them. Seed rows contribute
+    when project_id, owner and repo are present and non-empty; missing cells
+    (None or NaN) count as absent, and a missing project_name becomes None.
+
+    All values are normalised to ``str``. When the same (project_id, owner, repo)
+    is seen both with and without a project name, only the named entry is kept.
+
+    Returns a sorted list of (project_id, project_name, owner, repo) tuples;
+    entries whose project_name is None sort as if the name were "".
+    """
+    required = ["project_id", "project_name", "owner", "repo"]
+
+    def _clean(value):
+        """Return ``value`` as a string, or None when it is missing (None/NaN)."""
+        if value is None or pd.isna(value):
+            return None
+        return str(value)
+
+    # (project_id, owner, repo) -> set of project names seen (may include None)
+    names_by_repo = {}
+
+    # from tables
+    for df in tables.values():
+        if not isinstance(df, pd.DataFrame) or df.empty:
+            continue
+        if not set(required).issubset(df.columns):
+            continue
+        rows = df[required].dropna().drop_duplicates()
+        for pid, pname, owner, repo in rows.itertuples(index=False, name=None):
+            # Coerce to str like the seed branch does, so the same repo cannot
+            # show up twice under differently-typed ids.
+            names_by_repo.setdefault((str(pid), str(owner), str(repo)), set()).add(
+                str(pname)
+            )
+
+    # from seed
+    for r in seed_df.itertuples(index=False):
+        pid = _clean(getattr(r, "project_id", None))
+        pname = _clean(getattr(r, "project_name", None))
+        owner = _clean(getattr(r, "owner", None))
+        repo = _clean(getattr(r, "repo", None))
+        # NaN is truthy, so missing cells must be filtered by _clean() first;
+        # otherwise they would end up as the literal string "nan".
+        if pid and owner and repo:
+            names_by_repo.setdefault((pid, owner, repo), set()).add(pname)
+
+    keys = []
+    for (pid, owner, repo), names in names_by_repo.items():
+        named = [n for n in names if n is not None]
+        if named:
+            keys.extend((pid, n, owner, repo) for n in named)
+        else:
+            keys.append((pid, None, owner, repo))
+
+    # None cannot be ordered against str, so substitute "" in the sort key only.
+    return sorted(keys, key=lambda k: (k[0], k[1] or "", k[2], k[3]))
+
+
+# ---------- Pull a single repo's context (goal sources) ----------
+def extract_repo_context(tables, owner, repo):
+    """
+    Collect description/homepage/readme for one repo.
+
+    Each combined table that has rows for (owner, repo) is consulted in turn;
+    for every field the first non-null value found wins (columns
+    repo_description, repo_homepage, readme_text). Fields still missing after
+    all tables are filled from the raw JSON fetched earlier (keys description,
+    homepageUrl, __readme_text), when that file exists.
+
+    Returns a dict with keys "description", "homepage" and "readme"; a field
+    that no source provides is None.
+    """
+    table_cols = {
+        "description": "repo_description",
+        "homepage": "repo_homepage",
+        "readme": "readme_text",
+    }
+    raw_keys = {
+        "description": "description",
+        "homepage": "homepageUrl",
+        "readme": "__readme_text",
+    }
+    ctx = {"description": None, "homepage": None, "readme": None}
+
+    def first_non_null(sub, col):
+        if col in sub.columns:
+            s = sub[col].dropna()
+            if not s.empty:
+                return s.iloc[0]
+        return None
+
+    # Try tables first
+    for df in tables.values():
+        if not isinstance(df, pd.DataFrame) or df.empty:
+            continue
+        if "owner" not in df.columns or "repo" not in df.columns:
+            continue
+        mask = (df["owner"].astype(str) == str(owner)) & (
+            df["repo"].astype(str) == str(repo)
+        )
+        sub = df.loc[mask]
+        if sub.empty:
+            continue
+        # A table may have rows for the repo without carrying the metadata
+        # columns, so keep looking instead of returning on the first match.
+        for field, col in table_cols.items():
+            if ctx[field] is None:
+                ctx[field] = first_non_null(sub, col)
+        if all(v is not None for v in ctx.values()):
+            return ctx
+
+    # Fallback: raw JSON, used only for the fields the tables did not provide
+    raw = read_raw_json(owner, repo) or {}
+    for field, key in raw_keys.items():
+        if ctx[field] is None:
+            ctx[field] = raw.get(key)
+
+    return ctx
+
+
+# ---------- Robust datetime sorting helper ----------
+def _sorted_desc(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
+    """
+    Return a copy of ``df`` ordered newest-first by ``time_col``.
+
+    The column is parsed to UTC datetimes on the copy (unparseable values become
+    NaT) and a stable sort is used. An empty DataFrame is returned when ``df`` is
+    not a DataFrame, is empty, or lacks ``time_col``.
+    """
+    if isinstance(df, pd.DataFrame) and not df.empty and time_col in df.columns:
+        ordered = df.copy()
+        ordered[time_col] = pd.to_datetime(ordered[time_col], errors="coerce", utc=True)
+        return ordered.sort_values(by=time_col, ascending=False, kind="stable")
+    return pd.DataFrame()
+
+
+# ---------- Build the LLM prompt for ONE repo ----------
+
+
+def _urls_only(df: pd.DataFrame, url_col: str, max_n: int = 12) -> list[str]:
+    """
+    Collect unique URLs from df[url_col], keeping first-seen order.
+
+    Values starting with "github.com/" get an "https://" prefix; values that do
+    not start with "http" afterwards are dropped. Collection stops as soon as
+    ``max_n`` URLs have been gathered. Returns [] when ``df`` is not a DataFrame,
+    is empty, or lacks ``url_col``.
+    """
+    usable = isinstance(df, pd.DataFrame) and not df.empty and url_col in df.columns
+    if not usable:
+        return []
+    collected = []
+    visited = set()
+    for raw in df[url_col].dropna().astype(str):
+        url = "https://" + raw if raw.startswith("github.com/") else raw
+        if not url.startswith("http") or url in visited:
+            continue
+        visited.add(url)
+        collected.append(url)
+        if len(collected) >= max_n:
+            break
+    return collected
+
+
+def _identity_signals(tables, owner, repo, max_items=6):
+    """
+    Return two short lists describing stargazer/fork identities for this repo.
+
+    Rows of the "stargazers" and "forks" tables matching (owner, repo) are taken
+    newest-first (by starred_at / fork_created_at when that column exists), at
+    most ``max_items`` each. Every row becomes "login (name) — meta", where the
+    name and meta parts appear only when present:
+      - stargazers: meta joins company, location and orgs with "; ";
+      - forks: meta joins owner type, location and, for Organization owners
+        only, the org description.
+
+    A table that is missing, empty, or lacks the owner/repo columns yields
+    ["(none)"], as does a repo with no matching rows.
+    """
+
+    def _safe(s):
+        # Only non-blank strings count; NaN/None and whitespace map to None.
+        return s if isinstance(s, str) and s.strip() else None
+
+    def _recent(df, time_col):
+        """Rows for this repo, newest first when time_col exists, capped at max_items."""
+        if not isinstance(df, pd.DataFrame) or df.empty:
+            return pd.DataFrame()
+        # Without both key columns no row can be attributed to the repo; the
+        # original raised KeyError here by indexing the frame with a bare bool.
+        if "owner" not in df.columns or "repo" not in df.columns:
+            return pd.DataFrame()
+        sub = df[(df["owner"] == owner) & (df["repo"] == repo)]
+        if time_col in sub.columns:
+            sub = sub.sort_values(time_col, ascending=False)
+        return sub.head(max_items)
+
+    def _line(login, name, meta_parts):
+        """Format one identity as 'login (name) — meta', omitting absent parts."""
+        bits = [f"{login}" + (f" ({name})" if name else "")]
+        meta = "; ".join(x for x in meta_parts if x)
+        if meta:
+            bits.append(meta)
+        return " — ".join(bits)
+
+    star_lines = []
+    for _, r in _recent(tables.get("stargazers", pd.DataFrame()), "starred_at").iterrows():
+        star_lines.append(
+            _line(
+                _safe(r.get("stargazer_login")) or "unknown",
+                _safe(r.get("stargazer_name")),
+                [
+                    _safe(r.get("stargazer_company")),
+                    _safe(r.get("stargazer_location")),
+                    _safe(r.get("stargazer_orgs")),
+                ],
+            )
+        )
+
+    fork_lines = []
+    for _, r in _recent(tables.get("forks", pd.DataFrame()), "fork_created_at").iterrows():
+        typ = _safe(r.get("fork_owner_type"))  # User/Organization
+        orgd = (
+            _safe(r.get("fork_owner_org_description"))
+            if typ == "Organization"
+            else None
+        )
+        fork_lines.append(
+            _line(
+                _safe(r.get("fork_owner_login")) or "unknown",
+                _safe(r.get("fork_owner_name")),
+                [typ, _safe(r.get("fork_owner_location")), orgd],
+            )
+        )
+
+    return (star_lines[:max_items] or ["(none)"], fork_lines[:max_items] or ["(none)"])
+
+
+def build_repo_prompt(project_id, project_name, owner, repo, ctx, tables, window_label):
+    """
+    Build a compact, strictly evidence-grounded prompt that yields a short,
+    synthesized repo summary with headings. No bullet lists or dated timelines.
+    """
+
+    commits = tables.get("commits", pd.DataFrame())
+    issues = tables.get("issues", pd.DataFrame())
+    prs = tables.get("pull_requests", pd.DataFrame())
+    rels = tables.get("releases", pd.DataFrame())
+
+    c_sub = _sorted_desc(
+        (
+            commits[(commits.get("owner") == owner) & (commits.get("repo") == repo)]
+            if isinstance(commits, pd.DataFrame) and not commits.empty
+            else pd.DataFrame()
+        ),
+        "committed_at",
+    )
+    i_sub = _sorted_desc(
+        (
+            issues[(issues.get("owner") == owner) & (issues.get("repo") == repo)]
+            if isinstance(issues, pd.DataFrame) and not issues.empty
+            else pd.DataFrame()
+        ),
+        "created_at",
+    )
+    pr_sub = _sorted_desc(
+        (
+            prs[(prs.get("owner") == owner) & (prs.get("repo") == repo)]
+            if isinstance(prs, pd.DataFrame) and not prs.empty
+            else pd.DataFrame()
+        ),
+        "created_at",
+    )
+    r_sub = _sorted_desc(
+        (
+            rels[(rels.get("owner") == owner) & (rels.get("repo") == repo)]
+            if isinstance(rels, pd.DataFrame) and not rels.empty
+            else pd.DataFrame()
+        ),
+        "published_at",
+    )
+
+    def _has_activity(c_sub, pr_sub, i_sub, r_sub) -> bool:
+        return any(
+            [
+                isinstance(c_sub, pd.DataFrame) and not c_sub.empty,
+                isinstance(pr_sub, pd.DataFrame) and not pr_sub.empty,
+                isinstance(i_sub, pd.DataFrame) and not i_sub.empty,
+                isinstance(r_sub, pd.DataFrame) and not r_sub.empty,
+            ]
+        )
+
+    activity_present = _has_activity(c_sub, pr_sub, i_sub, r_sub)
+
+    goal_text = (ctx.get("readme") or ctx.get("description") or "Not stated.").strip()
+
+    # Identity signals for grounded beneficiaries
+    star_lines, fork_lines = _identity_signals(tables, owner, repo)
+
+    # Minimal evidence buffers (for inline linking; NOT to be printed as lists)
+    commit_urls = _urls_only(c_sub, "commit_url", max_n=8)
+    pr_urls = _urls_only(pr_sub, "pr_url", max_n=8)
+    issue_urls = _urls_only(i_sub, "issue_url", max_n=8)
+    release_urls = _urls_only(r_sub, "release_url", max_n=8)
+
+    return textwrap.dedent(
+        f"""
+You are writing an **executive summary** for ONE repository.
+
+STRICT RULES
+- Write succinct language and do not repeat yourself.
+- Keep total under ~250 words.
+- Use ONLY the facts below (GOAL SOURCE, EVIDENCE, IDENTITY SIGNALS). No outside knowledge.
+- Only inline link a commit/change/issue when it's very representative of the point you are trying to make or when it adds to the narrative.
+- If ACTIVITY_PRESENT=no, or there is no recent activity, under “Recent Developments” write EXACTLY: **No changes in {window_label}**. Do not write ACTIVITY_PRESENT=yes or no.
+- Support claims with inline Markdown links **only** within the sentence/statement within the narrative and prose. The paragraphs must flow.
+- No dated bullet lists or lists; synthesize into concise paragraphs.
+- Do not write bracketed anchors like “[commit …]”, “[PR …]”, or “[issue …]” under any circumstance.
+- Do not name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.) under any circumstance.
+EXAMPLE 1:
+GOOD: A [consolidating pull request]((https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1)) further structured these changes.
+BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1).
+EXAMPLE 2:
+GOOD: A  [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows.
+BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268).
+EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). Instead use inline links within the narrative):
+GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503)
+BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37).
+- Keep the whole report under ~250 words. Use these 2 exact sections:
+
+## Summary and Goal
+Write 2–8 crisp sentences describing the repo’s summary and purpose from GOAL SOURCE only. You do not need to state which repository you are describing.
+Evaluate what the codebase aims to do as a whole, big picture, not just within the code. 
+Like what are the researchers aiming to do through this code? What are they researching? How are they researching it, and what is the goal?
+Also identify the scientific communities or users who benefit from the research if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals);
+otherwise do not include this sentence at all. Keep this brief (1–2 sentences).
+
+## Recent Developments ({window_label})
+Write 2–10 crisp sentences that explains **what changed**, not when: summarize the scope and the substance of changes (features/fixes/docs/refactor/tests/infra/deps),
+what parts of the codebase were affected (infer from file names or titles if apparent), and any issues/release outcomes, what work has been done. 
+Think big picture: are multiple issues or commits working towards the same goal? Use that goal in the narrative rather than specifics about the code change.
+Do not list dates or create a timeline. The paragraph should be in prose narrative form, with at most 6 links total, if any. 
+Avoid counts without explanation and only mention counts when they aid the narrative. 
+
+CONTEXT (INPUT; do not echo verbatim):
+Project: {project_name} ({project_id})
+Repository: {owner}/{repo}
+
+ACTIVITY_PRESENT: {str(bool(activity_present)).lower()}
+
+GOAL SOURCE:
+{goal_text}
+
+EVIDENCE (links for reasoning only; do not echo raw URLs, and do not use the names, only hyperlink URL):
+Commits:
+{chr(10).join("- " + u for u in (commit_urls or ["(none)"]))}
+Pull Requests:
+{chr(10).join("- " + u for u in (pr_urls or ["(none)"]))}
+Issues:
+{chr(10).join("- " + u for u in (issue_urls or ["(none)"]))}
+Releases:
+{chr(10).join("- " + u for u in (release_urls or ["(none)"]))}
+
+IDENTITY SIGNALS (for grounding beneficiaries):
+Stargazers:
+{chr(10).join("- " + s for s in star_lines)}
+Fork owners:
+{chr(10).join("- " + s for s in fork_lines)}
+    """
+    ).strip()
+
+
+# ---------- CLI entrypoint ----------
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate repository-level summaries grouped by project."
+    )
+    parser.add_argument(
+        "--model",
+        default=DEFAULT_MODEL,
+        help="OpenAI model (default from env or gpt-5-nano)",
+    )
+    parser.add_argument(
+        "--model-high",
+        default=DEFAULT_MODEL,
+        help="OpenAI model for higher analysis (default: value of --model/DEFAULT_MODEL)"
+    )
+    parser.add_argument(
+        "--model-low",
+        default=DEFAULT_MODEL,
+        help="OpenAI model for lower analysis (default: value of --model/DEFAULT_MODEL)"
+    )
+    parser.add_argument(
+        "--model-medium",
+        default=DEFAULT_MODEL,
+        help="OpenAI model for medium analysis (default: value of --model/DEFAULT_MODEL)"
+    )
+    parser.add_argument(
+        "--window-label", 
+        default="the last 90 days",
+        help='Label for the time window (e.g. "May–July 2025")',
+    )
+    parser.add_argument(
+        "--out-dir", default=REPORTS_DIR, help="Directory to write repo-level reports"
+    )
+    args = parser.parse_args()
+
+    tables = load_repo_frames()
+    seed_df = load_seed(SEED_CSV)
+    clone_root_torm = pathlib.Path("data/clones_goals")
+
+    repos = group_by_repo(tables, seed_df)
+    if not repos:
+        raise SystemExit(
+            "No repositories found in clean tables. Run fetch/normalize first."
+        )
+
+    # Emit one Markdown per repo
+    for pid, pname, owner, repo in repos:
+        # Extract README/description/homepage context for this repo
+        ctx = extract_repo_context(tables, owner, repo)
+
+        # ALWAYS derive the Goal from the full repo code via shallow clone + map-reduce
+        try:
+            clone_root = pathlib.Path("data/clones_goals")
+            repo_path = shallow_clone(owner, repo, clone_root)
+            code_goal = synthesize_repo_goal_from_code(
+                repo_path,
+                model_high=args.model_high,
+                model_low=args.model_low,
+                model_medium=args.model_medium,
+                call_llm_fn=call_llm,
+            )
+            # Inject into ctx so build_repo_prompt uses it as GOAL SOURCE
+            # build_repo_prompt already prefers ctx["readme"] over description
+            ctx = dict(ctx)
+            ctx["readme"] = code_goal
+        except Exception as e:
+            print(f"[warn] Code-derived goal failed for {owner}/{repo}: {e}")
+            # Fallback: keep whatever extract_repo_context found (README/description/raw)
+
+        # Build the LLM prompt for this repo
+        prompt = build_repo_prompt(
+            pid, pname, owner, repo, ctx, tables, args.window_label
+        )
+
+        # Call the LLM with a brief, consistent system instruction
+        try:
+            summary = call_llm(
+                [
+                    {
+                        "role": "system",
+                        "content": (
+                            f"You are a careful, evidence-bound summarizer that follows directions exactly."
+                            f"You take information from a repository (code and activity) and summarize it into a cohesive and succinct excecutive summary, "
+                            f"highlighting key themes in issues, pull requests, users, etc. "
+                            f"You are very observant and are able to take multiple issues, pull requests, etc. "
+                            "and identify general trends of 'what work has been done' and 'what key issues or work pop up consistenly'. "
+                            f"Use ONLY the information in the user message; no external knowledge. "
+                            f"Output exactly two Markdown sections with these headings and nothing else: "
+                            f"'## Summary and Goal' and '## Recent Developments ({args.window_label})'. "
+                            f"No bullets. No owners. No generic KPIs. No fluff. "
+                        ),
+                    },
+                    {"role": "user", "content": prompt},
+                ],
+                model=args.model,
+            )
+        except Exception as e:
+            # On failure, write a minimal stub so the pipeline still produces files
+            summary = f"_LLM call failed: {e}_"
+
+        # Format the Markdown with a clear title
+        title = f"# Executive Summary: {owner}/{repo} — {pname} ({pid}) — {args.window_label}"
+        md = title + "\n\n" + summary + _footer() + "\n"
+
+        # reports/<PROJECT_ID>__<owner>__<repo>.md
+        out_path = os.path.join(args.out_dir, f"{pid}__{owner}__{repo}.md")
+        with open(out_path, "w", encoding="utf-8") as f:
+            f.write(md)
+        print(f"Wrote {out_path}")
+
+        # Clean up the clone to avoid disk growth (Option A)
+        try:
+            delete_clone_path(repo_path)  # repo_path came from shallow_clone(...)
+        except Exception as e:
+            print(f"[warn] cleanup failed for {owner}/{repo}: {e}")
+
+    try:
+        if clone_root_torm.exists():
+            shutil.rmtree(clone_root_torm)
+        clone_root_torm.mkdir(
+            parents=True, exist_ok=True
+        )  # leave an empty folder for next run
+    except Exception as e:
+        print(f"[warn] failed to clear clone cache at {clone_root_torm}: {e}")
+
+
+# Script guard: run the CLI only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()