diff --git a/src/goal_from_code.py b/src/goal_from_code.py new file mode 100644 index 0000000..05afd08 --- /dev/null +++ b/src/goal_from_code.py @@ -0,0 +1,644 @@ +# src/goal_from_code.py + +""" +Extract the "goal" or purpose of a code repository by analyzing its code/config files. +""" + +import pathlib, subprocess, os, textwrap +from typing import Iterable, List, Tuple, Optional +import shutil +import math + +BINARY_EXTS = { + # images & raster + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".ico", + ".svs", + ".webp", + # docs/binaries + ".pdf", + ".xlsx", + ".docx", + ".pptx", + # archives (single-suffix forms) + ".zip", + ".gz", + ".bz2", + ".xz", + ".7z", + ".rar", + # 3D / meshes (single-suffix) + ".fbx", + ".glb", + ".gltf", + ".stl", + ".ply", + ".las", + ".objz", + ".3ds", + # med/geo (single-suffix; multi-suffix handled above) + ".nii", + ".nrrd", + ".mhd", + ".mha", + ".geotiff", + # audio/video + ".mp4", + ".mp3", + ".wav", + ".avi", + ".mov", + ".webm", + ".m4a", + ".aac", + ".flac", + # fonts / wasm + ".woff", + ".woff2", + ".ttf", + ".otf", + ".wasm", + # design + ".psd", + ".ai", + ".xcf", + # db / sqlite + ".sqlite", + ".db", + ".db3", + # native libs / executables + ".so", + ".dylib", + ".dll", + ".exe", + ".bin", + ".obj", + # columnar / arrays / hdf + ".parquet", + ".feather", + ".h5", + ".hdf5", + ".npz", + ".npy", + # ML / notebooks / checkpoints + ".ipynb", + ".tfrecord", + ".pb", + ".onnx", + ".safetensors", + ".ckpt", + ".pt", + ".pth", + ".pkl", + ".pickle", + ".joblib", + # chunked array stores + ".zarr", +} + +# Code/config extensions we will consider for goal synthesis +CODE_EXTS = { + ".py", + ".pyi", + ".r", + ".rmd", + ".jl", + ".m", + ".c", + ".h", + ".hpp", + ".hxx", + ".hh", + ".cc", + ".cpp", + ".cxx", + ".cu", + ".cuh", + ".ino", + ".java", + ".scala", + ".kt", + ".kts", + ".groovy", + ".go", + ".rs", + ".swift", + ".php", + ".rb", + ".pl", + ".pm", + ".t", + ".lua", + ".fs", + ".fsx", + ".f90", + 
".f95", + ".f03", + ".f08", + ".for", + ".ftn", + ".f", + ".cs", + ".vb", + ".vbs", + ".js", + ".mjs", + ".cjs", + ".jsx", + ".ts", + ".tsx", + ".json", + ".yml", + ".yaml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".properties", + ".sh", + ".bash", + ".zsh", + ".fish", + ".bat", + ".cmd", + ".ps1", + ".psm1", + ".psd1", + ".html", + ".htm", + ".xhtml", + ".xml", + ".xsl", + ".xslt", + ".svg", + ".sql", + ".psql", + ".mysql", + ".pgsql", + ".hql", + ".cmake", + ".ninja", + ".bazel", + ".bzl", + ".gradle", + ".mk", + ".tex", + ".sty", + ".cls", + ".bib", + ".rst", + ".md", + ".markdown", + ".txt", + ".proto", + ".thrift", + ".avdl", + ".graphql", + ".gql", + ".sol", + ".asm", + ".s", + ".v", + ".vh", + ".sv", + ".svh", + ".vhdl", + ".vhd", + ".dart", + ".coffee", + ".erl", + ".hrl", + ".ex", + ".exs", + ".nim", + ".clj", + ".cljs", + ".edn", + ".lisp", + ".el", + ".scm", + ".ss", + ".cr", + ".mli", + ".ml", + ".re", + ".rei", + ".hx", + ".hxml", + ".wgsl", + ".metal", + ".glsl", + ".vert", + ".frag", + ".shader", +} + +# Code/config files that often have no extension but are meaningful +CODE_BASENAMES = { + "Dockerfile", + "Makefile", + "CMakeLists.txt", + "WORKSPACE", + "BUILD", + "BUILD.bazel", + "Gemfile", + "Rakefile", + "Procfile", + ".env", + ".env.example", + ".envrc", + ".gitignore", + ".gitattributes", + ".editorconfig", + "Pipfile", + "requirements.txt", + "pyproject.toml", + "setup.cfg", + "setup.py", + "package.json", + "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", + "tsconfig.json", + ".babelrc", + ".eslintrc", + ".prettierrc", + ".prettierignore", + ".ruff.toml", + ".flake8", +} + +# Common dirs we gnore even if texty (not the project’s purpose): +IGNORE_DIRS = { + ".git", + ".github", + ".gitlab", + ".svn", + ".hg", + "assets", + "static", + "public", + "media", + "images", + "img", + "figures", + "screenshots", + "thumbnails", + "downloads", + "node_modules", + "dist", + "build", + "out", + "target", + "__pycache__", + 
def delete_clone_path(p: pathlib.Path):
    """Delete the cloned repository directory at `p`, if there is one.

    A path that does not exist is ignored silently. Any other failure while
    removing the tree is reported as a `[warn]` line on stdout and is not
    re-raised, so cleanup problems never abort the caller.
    """
    try:
        if not p.exists():
            return
        shutil.rmtree(p)
    except FileNotFoundError:
        # The directory vanished between the existence check and the removal.
        return
    except Exception as err:
        print(f"[warn] failed to delete clone at {p}: {err}")
def _should_skip_dir(p: pathlib.Path) -> bool:
    """Return True when directory `p` should be pruned from the repository walk.

    Only the final path component is considered, and it is matched
    case-insensitively against the entries of IGNORE_DIRS.

    Both sides of the comparison are lower-cased. IGNORE_DIRS contains
    mixed-case entries ("Pods", "Packages", ".Rproj.user"); comparing a
    lower-cased directory name against the set verbatim could never match
    those entries, so such directories were walked instead of skipped.
    """
    name = p.name.lower()
    return any(name == ignored.lower() for ignored in IGNORE_DIRS)
def summarize_file_chunks(
    path: str, chunks: List[str], model_low: str, model_medium: str, call_llm_fn
) -> str:
    """
    Map step: summarize a single file's purpose-only signals from all its chunks.

    Each chunk is sent to `call_llm_fn(messages, model=model_low)` to obtain
    purpose bullets; the collected bullets are then sent to
    `call_llm_fn(messages, model=model_medium)` to be merged into one purpose
    statement, which is returned as-is.

    Parameters:
        path: display path of the file, used only inside the prompts.
        chunks: text chunks of the file, in the order they should be numbered.
        model_low: model name passed for the per-chunk bullet calls.
        model_medium: model name passed for the final reduce call.
        call_llm_fn: callable taking (messages, model=...) and returning text.

    The prompts are assembled by plain concatenation rather than by running
    textwrap.dedent over an f-string: once multi-line chunk or bullet text is
    interpolated, any line starting at column 0 makes the common margin empty,
    so dedent becomes a no-op and the prompt keeps the source indentation.
    """
    total = len(chunks)
    bullets: List[str] = []
    for i, ch in enumerate(chunks, 1):
        prompt = (
            "From the file below, write 2-12 bullets capturing a summary that has "
            "ONLY purpose/intent (not usage/install/code minutiae).\n"
            "\n"
            f"FILE: {path} (chunk {i}/{total})\n"
            "---\n"
            f"{ch}"
        ).strip()
        msg = [
            {
                "role": "system",
                "content": "Extract purpose-only bullets from file text. Keep it minimal.",
            },
            {"role": "user", "content": prompt},
        ]
        reply = call_llm_fn(msg, model=model_low)
        # An empty/None reply carries no signal and would break the join below.
        if reply:
            bullets.append(str(reply))

    # Reduce: bullets from every chunk -> one purpose statement for the file.
    reduce_prompt = (
        "Combine the bullets below into a single 2-12 sentence purpose "
        "statement/summary for this file.\n"
        "Avoid implementation details. If purpose is unclear, say 'unclear'.\n"
        "\n"
        f"FILE: {path}\n"
        "BULLETS:\n" + "\n".join(bullets)
    ).strip()
    return call_llm_fn(
        [
            {
                "role": "system",
                "content": "Distill bullets into one short purpose sentence.",
            },
            {"role": "user", "content": reduce_prompt},
        ],
        model=model_medium,
    )
def _jsonable(v):
    """Convert pandas/NumPy/time values to JSON-safe Python types/strings.

    Conversions, in order:
      - NumPy arrays              -> list, with each element converted recursively
      - missing scalars           -> None (None, NaN, pd.NaT, pd.NA)
      - pandas Timestamp          -> ISO8601 string in UTC (naive values are
                                     taken to be UTC)
      - python datetime           -> ISO8601 string in UTC (naive values are
                                     taken to be UTC)
      - NumPy scalars             -> native Python scalar via .item()
      - anything else             -> returned unchanged

    Missing values are mapped to None before the timestamp checks because
    pd.NA is not JSON-serializable, NaN serializes to the non-standard token
    `NaN`, and pd.NaT would otherwise come out as the string "NaT".
    """
    # Array-valued cells: json cannot serialize ndarray.
    if isinstance(v, np.ndarray):
        return [_jsonable(x) for x in v.tolist()]
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(v) and pd.isna(v):
        return None
    # pandas/pyarrow timestamps -> ISO8601 (UTC)
    if isinstance(v, pd.Timestamp):
        if v.tzinfo is None:
            v = v.tz_localize("UTC")
        else:
            v = v.tz_convert("UTC")
        return v.isoformat()
    # python datetime -> ISO8601 (UTC)
    if isinstance(v, datetime):
        if v.tzinfo is None:
            v = v.replace(tzinfo=timezone.utc)
        else:
            v = v.astimezone(timezone.utc)
        return v.isoformat()
    # NumPy scalars -> native Python
    if isinstance(v, np.generic):
        return v.item()
    return v
def _extract_single_repo_fields(
    per_project_dfs: dict[str, pd.DataFrame], owner: str, repo: str
) -> dict:
    """
    Pull context for one repo from an activity table row; fall back to raw JSON if needed.

    The tables are scanned in a fixed order. The first row found for
    (owner, repo) is remembered, but scanning continues until a row with a
    non-null `readme_text` turns up, so a README present only in a later table
    is still used. If no README was obtained from the tables, the raw JSON
    returned by read_raw_json(owner, repo) fills in whatever is still empty.

    Missing cell values (None, NaN, pd.NaT, pd.NA) and blank strings are
    normalized to None / [] so callers can safely call string methods on the
    results.

    Returns a dict with keys: description, homepage, topics (list),
    primary_language, languages (list), readme (str | None).
    """
    fields = {
        "description": None,
        "homepage": None,
        "topics": [],
        "primary_language": None,
        "languages": [],
        "readme": None,
    }

    def _is_missing(val) -> bool:
        # is_scalar guards pd.isna, which returns an array for list-like input.
        return bool(pd.api.types.is_scalar(val) and pd.isna(val))

    def _text(val):
        # `val or None` is not enough: NaN is truthy and bool(pd.NA) raises.
        if _is_missing(val):
            return None
        if isinstance(val, str):
            return val if val.strip() else None
        return str(val)

    def _split_csv(val):
        if isinstance(val, (list, tuple, np.ndarray)):
            items = [str(x) for x in val if not _is_missing(x)]
        elif _is_missing(val):
            items = []
        else:
            items = str(val).split(",")
        return [x.strip() for x in items if x.strip()]

    # Try to find a row for this repo in any table (prefer one with readme_text)
    tables = [
        "commits",
        "issues",
        "pull_requests",
        "releases",
        "pr_files",
        "stargazers",
        "forks",
    ]
    best = None
    for t in tables:
        df = per_project_dfs.get(t)
        if not isinstance(df, pd.DataFrame) or df.empty:
            continue
        # A table without these columns cannot be filtered by repo.
        if not {"owner", "repo"}.issubset(df.columns):
            continue
        sub = df[(df["owner"] == owner) & (df["repo"] == repo)]
        if sub.empty:
            continue
        if "readme_text" in sub.columns:
            with_readme = sub[sub["readme_text"].notna()]
            if not with_readme.empty:
                best = with_readme.iloc[0]
                break
        if best is None:
            # Keep as fallback, but keep looking for a row carrying a README.
            best = sub.iloc[0]

    if best is not None:
        fields["description"] = _text(best.get("repo_description"))
        fields["homepage"] = _text(best.get("repo_homepage"))
        fields["topics"] = _split_csv(best.get("repo_topics"))
        fields["primary_language"] = _text(best.get("repo_primary_language"))
        fields["languages"] = _split_csv(best.get("repo_languages"))
        fields["readme"] = _text(best.get("readme_text"))
        # early return if we already have a README
        if fields["readme"]:
            return fields

    # Fallback to raw JSON (helps for inactive repos)
    raw = read_raw_json(owner, repo) or {}
    if raw:
        # topics
        topics = []
        try:
            nodes = (raw.get("repositoryTopics") or {}).get("nodes", []) or []
            topics = [
                n["topic"]["name"]
                for n in nodes
                if n and n.get("topic") and n["topic"].get("name")
            ]
        except (AttributeError, KeyError, TypeError):
            # Unexpected JSON shape: leave topics empty rather than fail.
            pass
        # languages
        langs = []
        try:
            nodes = (raw.get("languages") or {}).get("nodes", []) or []
            langs = [n.get("name") for n in nodes if n and n.get("name")]
        except (AttributeError, KeyError, TypeError):
            # Unexpected JSON shape: leave languages empty rather than fail.
            pass
        fields["description"] = fields["description"] or raw.get("description")
        fields["homepage"] = fields["homepage"] or raw.get("homepageUrl")
        fields["topics"] = fields["topics"] or topics
        fields["primary_language"] = fields["primary_language"] or (
            (raw.get("primaryLanguage") or {}).get("name")
        )
        fields["languages"] = fields["languages"] or langs
        fields["readme"] = fields["readme"] or raw.get("__readme_text")

    return fields
def summarize_project(
    project_id: str, project_name: str | None, per_project_dfs: dict[str, pd.DataFrame]
) -> dict:
    """
    Build a single JSON-serializable summary for ONE project_id from its sliced tables.

    Expects per_project_dfs to contain DataFrames for:
      commits, issues, pull_requests, stargazers, forks, releases, pr_files
    (Any may be absent or empty; absent tables are treated as empty.)

    The result contains: project id/name, an activity flag, repo context
    aggregated by build_repo_context_all, areas touched (top-level directories
    and file extensions from PR files), intent-tag themes, top contributors,
    issue filers and issue themes, recent example rows per table, and
    stargazer/fork-owner interest signals.

    The per-project seed rows are read from the module global `_SEED_BY_PID`
    when it exists (main() populates it); otherwise an empty frame is used.
    """
    # Pull per-table slices with defaults
    commits = per_project_dfs.get("commits", pd.DataFrame())
    issues = per_project_dfs.get("issues", pd.DataFrame())
    prs = per_project_dfs.get("pull_requests", pd.DataFrame())
    stars = per_project_dfs.get("stargazers", pd.DataFrame())
    forks = per_project_dfs.get("forks", pd.DataFrame())
    releases = per_project_dfs.get("releases", pd.DataFrame())
    pr_files = per_project_dfs.get("pr_files", pd.DataFrame())

    def _column_values(df: pd.DataFrame, col: str) -> list:
        # Non-null values of `col`, or [] when the column is absent.
        return df.get(col, pd.Series(dtype=str)).dropna().tolist()

    # Consider project "active" if any table has ≥1 row (forks are weak activity but still signal)
    active = any(len(df) > 0 for df in [commits, issues, prs, releases, stars, forks])

    # Aggregate intent tags across commits/issues/PRs/releases to top themes
    theme_tags = explode_tags(
        pd.concat(
            [
                commits.get("intent_tags", pd.Series(dtype=str)),
                issues.get("intent_tags", pd.Series(dtype=str)),
                prs.get("intent_tags", pd.Series(dtype=str)),
                releases.get("intent_tags", pd.Series(dtype=str)),
            ],
            ignore_index=True,
        )
    )
    themes = top_n(theme_tags, 6)

    # Build example lists (most recent first, via collect_examples' sort_col)
    # Include owner/repo so higher-level summaries can balance across repos
    release_examples = collect_examples(
        releases,
        ["owner", "repo", "published_at", "release_name", "release_tag", "release_url"],
        n=5,
        sort_col="published_at",
    )
    commit_examples = collect_examples(
        commits,
        [
            "owner",
            "repo",
            "committed_at",
            "author_login",
            "message_headline",
            "commit_url",
        ],
        n=5,
        sort_col="committed_at",
    )
    pr_examples = collect_examples(
        prs,
        ["owner", "repo", "created_at", "author_login", "title", "pr_url", "state"],
        n=5,
        sort_col="created_at",
    )
    issue_examples = collect_examples(
        issues,
        ["owner", "repo", "created_at", "author_login", "title", "issue_url", "labels"],
        n=5,
        sort_col="created_at",
    )

    # “Areas touched” = frequent top-level directories and/or file extensions from PR files
    areas = []
    if (
        isinstance(pr_files, pd.DataFrame)
        and not pr_files.empty
        and "path" in pr_files.columns
    ):
        paths = pr_files["path"].dropna().astype(str).tolist()
        top_dirs = [
            p.split("/")[0] for p in paths if "/" in p
        ]  # e.g., 'api', 'src', 'docs'
        exts = []
        for p in paths:
            # Take the extension from the file name only: a dot in a directory
            # name (".github/CODEOWNERS", "v1.2/Makefile") is not an extension.
            basename = p.rsplit("/", 1)[-1]
            if "." in basename:
                ext = basename.rsplit(".", 1)[-1]
                if ext:
                    exts.append(ext)  # e.g., 'py', 'md', 'yaml'
        areas = top_n(top_dirs + exts, 10)

    # Contributors = commit authors + PR authors (by login)
    commit_authors = _column_values(commits, "author_login")
    pr_authors = _column_values(prs, "author_login")
    contributors = top_n(commit_authors + pr_authors, 8)

    # Issue filers + issue label themes
    issue_authors = _column_values(issues, "author_login")
    issue_labels = []
    if "labels" in issues.columns:
        for lbls in issues["labels"].dropna():
            issue_labels.extend([x.strip() for x in str(lbls).split(",") if x.strip()])
    # NOTE(review): theme_tags spans commits/PRs/releases as well as issues, so
    # non-issue tags contribute to "issue_themes" — confirm this is intended.
    issue_themes = top_n(
        issue_labels + theme_tags, 8
    )  # fuse label tokens with heuristic tags
    issue_filers = top_n(issue_authors, 8)

    # “Interest” signals (who starred, who forked)
    stargazers = top_n(_column_values(stars, "stargazer_login"), 8)
    fork_owners = top_n(_column_values(forks, "fork_owner_login"), 8)

    # Context multi-repo aware
    seed_slice = globals().get("_SEED_BY_PID", {}).get(project_id, pd.DataFrame())
    repo_ctx = build_repo_context_all(per_project_dfs, seed_slice=seed_slice)

    # Build the dict to return (JSON-serializable)
    return {
        "project_id": project_id,
        "project_name": project_name,
        "active_in_window": bool(active),
        "repo_context": repo_ctx,  # aggregated across all repos of the project
        "areas_touched": areas,  # [{value:'api', count:7}, {value:'py', count:5}, …]
        "themes": themes,  # heuristic tags aggregated
        "contributors": contributors,  # top commit/PR authors
        "issue_filers": issue_filers,  # top issue creators
        "issue_themes": issue_themes,  # label tokens + intent tags
        "recent_examples": {  # concrete, clickable traceability
            "commits": commit_examples,
            "pull_requests": pr_examples,
            "issues": issue_examples,
            "releases": release_examples,
        },
        "interest_signals": {
            "stargazers": stargazers,
            "fork_owners": fork_owners,
        },
        "notes": "Heuristic tags; examples sampled from the window. Repo context chosen by activity+README heuristic.",
    }
forks.get("project_id", pd.Series(dtype="string")), + ], + ignore_index=True, + ) + .dropna() + .unique() + ) + + # load seed and union with found projects + seed_df = load_seed(SEED_CSV) + seed_df["project_id"] = seed_df["project_id"].astype("string") + seed_df["project_name"] = seed_df["project_name"].astype("string") + projects_seed = set(seed_df["project_id"].dropna().unique()) + + # expose a handy index for summarize_project() + global _SEED_BY_PID + _SEED_BY_PID = {pid: seed_df[seed_df["project_id"] == pid] for pid in projects_seed} + + projects = sorted(projects_found | projects_seed) + + # We'll accumulate machine-readable summaries here so summarize_portfolio.py can read them + portfolio = {"projects": [], "generated_from": CLEAN_DIR} + + for pid in projects: + # Try to recover a human-friendly project_name from any table that has it for this pid + pname = None + for df in (commits, issues, prs, releases, pr_files, stars, forks): + if ( + isinstance(df, pd.DataFrame) + and "project_id" in df.columns + and "project_name" in df.columns + ): + vals = df[df["project_id"] == pid]["project_name"].dropna().unique() + if len(vals): + pname = vals[0] + break + + # Slice each table down to this project id (or keep the empty DataFrame) + per_project_dfs = { + "commits": ( + commits[commits["project_id"] == pid] if not commits.empty else commits + ), + "issues": ( + issues[issues["project_id"] == pid] if not issues.empty else issues + ), + "pull_requests": prs[prs["project_id"] == pid] if not prs.empty else prs, + "releases": ( + releases[releases["project_id"] == pid] + if not releases.empty + else releases + ), + "pr_files": ( + pr_files[pr_files["project_id"] == pid] + if not pr_files.empty + else pr_files + ), + "stargazers": ( + stars[stars["project_id"] == pid] if not stars.empty else stars + ), + "forks": forks[forks["project_id"] == pid] if not forks.empty else forks, + } + + # Compute distinct repos (and reuse to form a stable list) + seed_slice = 
_SEED_BY_PID.get(pid, pd.DataFrame()) + pairs = _iter_project_repos(per_project_dfs, seed_slice=seed_slice) + repo_count = len(pairs) + + # Produce the actual summary payload + summary = summarize_project(pid, pname, per_project_dfs) + summary["repo_count"] = repo_count + summary["repos"] = [{"owner": o, "repo": r} for (o, r) in pairs] + + # Write per-project machine JSON (consumed by summarize_projects.py and summarize_portfolio.py) + out_json = os.path.join(OUT_DIR, f"{pid}.json") + with open(out_json, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + print(f"Wrote {out_json}") + + # Also write a small people CSV to quickly see top participants + people_rows = [] + for row in summary["contributors"]: + people_rows.append( + { + "project_id": pid, + "role": "contributor", + "login": row["value"], + "count": row["count"], + } + ) + for row in summary["issue_filers"]: + people_rows.append( + { + "project_id": pid, + "role": "issue_filer", + "login": row["value"], + "count": row["count"], + } + ) + if people_rows: + pd.DataFrame(people_rows).to_csv( + os.path.join(OUT_DIR, f"{pid}__people.csv"), index=False + ) + + # Add to the machine portfolio index + portfolio["projects"].append(summary) + + # Write a simple portfolio JSON index listing all project summaries + # (summarize_portfolio.py will turn THIS into a narrative report) + with open(os.path.join(OUT_DIR, "_portfolio.json"), "w", encoding="utf-8") as f: + json.dump(portfolio, f, indent=2, ensure_ascii=False) + print(f"Wrote {os.path.join(OUT_DIR, '_portfolio.json')}") + + +if __name__ == "__main__": + main() diff --git a/src/summarize_portfolio.py b/src/summarize_portfolio.py new file mode 100644 index 0000000..e475cd6 --- /dev/null +++ b/src/summarize_portfolio.py @@ -0,0 +1,625 @@ +# src/summarize_portfolio.py +""" +Generate a portfolio-level executive summary report in Markdown, synthesizing information. 
+""" + +import os, json, argparse, textwrap, time +from typing import List, Dict, Any +from collections import Counter +from dotenv import load_dotenv +from openai import OpenAI +from datetime import datetime, timezone +import re + +# ------------------ Environment & Client Setup ------------------ +load_dotenv() # pull OPENAI_API_KEY / OPENAI_MODEL from .env if present + +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +if not OPENAI_API_KEY: + # Fail fast if the key is missing-nothing will work without it. + raise SystemExit("Missing OPENAI_API_KEY in .env") + +# Default model can be overridden by --model at runtime +DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") + +# Initialize OpenAI client (reads API key from env) +DEFAULT_HTTP_TIMEOUT = float(os.environ.get("OPENAI_HTTP_TIMEOUT", "60")) # seconds +client = OpenAI( + timeout=DEFAULT_HTTP_TIMEOUT, max_retries=0 +) # applies connect/read/write timeouts + +# Where we read portfolio JSON and write the final Markdown +REPORTS_DIR = "reports" +SUMMARY_DIR = "data/summary" +os.makedirs(REPORTS_DIR, exist_ok=True) + +# --- Project MD: extract "## Recent Developments (...)" --- +_PROJECT_ACTIVITY_RE_TMPL = ( + r"^##\s*Recent Developments\s*\(\s*{label}\s*\)\s*\n(.*?)(?:\n##\s+|\Z)" +) + +# put near the top with the other regexes +_GOAL_RE = re.compile( + r"^\s*##\s*Summary\s*(?:and|&)\s*Goal\s*\n(.*?)(?=^\s*##\s|\Z)", + flags=re.DOTALL | re.MULTILINE | re.IGNORECASE, +) + + +def _read_project_activity_from_md(project_id: str, window_label: str) -> str | None: + """ + Read reports/.md and extract the '## Recent Developments ()' block. + Returns stripped text or None. 
+ """ + path = os.path.join(REPORTS_DIR, f"{project_id}.md") + if not os.path.exists(path): + return None + with open(path, "r", encoding="utf-8") as f: + text = f.read() + # Make a regex that matches the exact window label literally + pat = re.compile( + _PROJECT_ACTIVITY_RE_TMPL.format(label=re.escape(window_label)), + flags=re.DOTALL | re.MULTILINE, + ) + m = pat.search(text) + if not m: + return None + body = (m.group(1) or "").strip() + return body or None + + +def build_portfolio_activity_corpus_from_project_mds( + projects: List[Dict[str, Any]], window_label: str +) -> str: + """ + Concatenate the '## Recent Developments ()' section from each project's MD, + with clear project labels and a per-project character cap to prevent domination. + Skips boilerplate 'No changes in ' lines. + """ + + def _cap_for(n, soft_total=12000, min_cap=300, max_cap=900): + # Aim for ~12k chars total; clamp to keep useful signal + return max(min_cap, min(max_cap, soft_total // max(1, n))) + + parts: list[str] = [] + n = max(1, len(projects)) + per_cap = _cap_for(n) + for p in sorted(projects, key=lambda x: (x.get("project_id") or "")): + pid = (p.get("project_id") or "").strip() + if not pid: + continue + block = _read_project_activity_from_md(pid, window_label) + if not block: + continue + if block.strip() == f"**No changes in {window_label}**": + continue + pname = (p.get("project_name") or pid).strip() + parts.append(f"[PROJECT {pid} — {pname}]\n{block.strip()[:per_cap]}") + return ("\n\n".join(parts)).strip() + + +def _goal_from_project_md(project_id: str) -> str | None: + """ + Fallback: read 'reports/.md' and extract the text under '## Goal' + up to the next '## ' or end-of-file. Returns stripped text or None. 
+ """ + md_path = os.path.join(REPORTS_DIR, f"{project_id}.md") + if not os.path.exists(md_path): + return None + with open(md_path, "r", encoding="utf-8") as f: + md = f.read() + m = re.search( + r"^## Summary and Goal\s*\n(.*?)(?:\n## |\Z)", + md, + flags=re.DOTALL | re.MULTILINE, + ) + if not m: + return None + text = m.group(1).strip() + return text or None + + +# ------------------ Utilities ------------------ + + +def _footer(): + dmy = datetime.now(timezone.utc).strftime("%d/%m/%Y") + return f"\n\n*Report generated using A.I. on {dmy}*" + + +def read_json(path: str) -> Dict[str, Any]: + """Load JSON file from disk into a Python dict.""" + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def call_llm(messages, model: str, max_retries: int = 4) -> str: + """ + Robust LLM call for prebuilt messages: + - bounded retries with backoff, + - treats empty content as an error, + - surfaces the last error clearly. + """ + last_err = None + for attempt in range(1, max_retries + 1): + try: + resp = client.chat.completions.create( + model=model, + messages=messages, + ) + text = (resp.choices[0].message.content or "").strip() + if not text: + raise RuntimeError("Empty completion") + return text + except Exception as e: + last_err = e + time.sleep(min(2**attempt, 10)) + raise RuntimeError(f"LLM failed after {max_retries} attempts: {last_err}") + + +def _chunk_text(s: str, chunk_chars: int = 12000, overlap: int = 500) -> List[str]: + """ + Split a long string into overlapping chunks. + We use this when synthesizing a clean goal statement from a long README. 
+ """ + s = s or "" + if len(s) <= chunk_chars: + return [s] + chunks, start, n = [], 0, len(s) + while start < n: + end = min(start + chunk_chars, n) + chunks.append(s[start:end]) + if end == n: + break + start = max(0, end - overlap) # overlap keeps continuity between chunks + return chunks + + +def compute_portfolio_metrics(projects: List[Dict[str, Any]]): + totals = dict( + projects=len(projects), + active=sum(1 for p in projects if p.get("active_in_window")), + ) + commits = prs = issues = releases = 0 + theme_ctr = Counter() + area_ctr = Counter() + + for p in projects: + ex = p.get("recent_examples") or {} + commits += len(ex.get("commits") or []) + prs += len(ex.get("pull_requests") or []) + issues += len(ex.get("issues") or []) + releases += len(ex.get("releases") or []) + for t in p.get("themes") or []: + if t and t.get("value"): + theme_ctr.update([t["value"]]) + for a in p.get("areas_touched") or []: + if a and a.get("value"): + area_ctr.update([a["value"]]) + + totals.update( + { + "commits": commits, + "prs": prs, + "issues": issues, + "releases": releases, + "top_themes": [f"{k} ({v})" for k, v in theme_ctr.most_common(6)], + "top_areas": [f"{k} ({v})" for k, v in area_ctr.most_common(6)], + } + ) + return totals + + +# ------------------ README → Goal (helper) ------------------ +def summarize_readme_goal(readme_text: str, model: str) -> str: + """ + Distill a potentially long README into a crisp purpose statement. + Strategy: + 1) Summarize each chunk into "purpose-only" bullets. + 2) Synthesize a final goal from all bullets. + """ + chunks = _chunk_text(readme_text, chunk_chars=12000, overlap=500) + bullets = [] + for i, ch in enumerate(chunks, 1): + messages = [ + { + "role": "system", + "content": "You extract the core PURPOSE of a repository from README text.", + }, + { + "role": "user", + "content": textwrap.dedent( + f""" + From the README chunk below, write 3–5 ultra-concise bullets capturing the repository's PURPOSE only. 
def ex_lines(items: List[Dict[str, Any]], fields: List[str], n=4) -> List[str]:
    """
    Render the first ``n`` dicts as 'field1 - field2 - field3' lines.

    For each item, the values of ``fields`` are looked up in order; falsy or
    missing values are dropped and the rest are stringified and joined with
    " - ". Items that yield no values produce no line.

    Args:
        items: Dicts to render; ``None`` or empty gives an empty result.
        fields: Keys to read from each dict, in output order.
        n: Maximum number of leading items to consider.

    Returns:
        One string per item that had at least one truthy field value.
    """
    rendered: List[str] = []
    for record in items[:n] if items else []:
        cells = [str(value) for name in fields if (value := record.get(name))]
        if cells:
            rendered.append(" - ".join(cells))
    return rendered
def _collect_repo_goals_for_project(project_id: str) -> list[str]:
    """
    Collect the goal text from every repo-level report of one project.

    Repo-level reports are the files in ``REPORTS_DIR`` whose names start with
    ``"<project_id>__"`` and end with ``".md"``. The project-level report
    (``"<project_id>.md"``) lacks the double-underscore prefix, so it is never
    picked up here. Each file's goal section is extracted with
    ``_extract_goal_from_md``.

    Args:
        project_id: Project identifier used as the filename prefix.

    Returns:
        The non-empty goal texts, ordered by filename so the result is
        reproducible across runs and platforms. Empty when ``REPORTS_DIR``
        does not exist or no file qualifies.
    """
    if not os.path.isdir(REPORTS_DIR):
        return []

    prefix = f"{project_id}__"
    goals: list[str] = []
    # os.listdir() returns entries in arbitrary order; sort for determinism,
    # since callers truncate the joined text and order decides what survives.
    for fname in sorted(os.listdir(REPORTS_DIR)):
        if not (fname.startswith(prefix) and fname.endswith(".md")):
            continue
        path = os.path.join(REPORTS_DIR, fname)
        try:
            with open(path, "r", encoding="utf-8") as f:
                md = f.read()
        except (OSError, UnicodeError):
            # Best effort: an unreadable or undecodable report is skipped.
            continue
        goal = _extract_goal_from_md(md)
        if goal:
            goals.append(goal)
    return goals
+ """ + + def _cap_for(n, soft_total=10000, min_cap=250, max_cap=800): + return max(min_cap, min(max_cap, soft_total // max(1, n))) + + parts: list[str] = [] + n = max(1, len(projects)) + per_cap = _cap_for(n) + + for p in sorted(projects, key=lambda x: (x.get("project_id") or "")): + pid = (p.get("project_id") or "").strip() + project_text = "" + + # (1) Prefer project-level Goal + g_project = _collect_project_goal(pid) + if g_project: + project_text = g_project + else: + # (2) Fallback to aggregated repo Goals + repo_goals = _collect_repo_goals_for_project(pid) + if repo_goals: + project_text = "\n\n".join(repo_goals) + else: + # (3) Last resort: repo_context + ctx = p.get("repo_context") or {} + readme = (ctx.get("readme") or "").strip() + desc = (ctx.get("description") or "").strip() + topics = ctx.get("topics") or [] + if readme: + project_text = readme + elif desc: + project_text = desc + elif topics: + project_text = "Topics: " + ", ".join(map(str, topics[:6])) + else: + project_text = "" + + project_text = project_text.strip() + if project_text: + pname = (p.get("project_name") or pid).strip() + parts.append(f"[PROJECT {pid} — {pname}]\n{project_text[:per_cap]}") + + return "\n\n".join(parts).strip() + + +def build_portfolio_overview_prompt( + projects: List[Dict[str, Any]], window_label: str, model: str +) -> str: + """ + Build a prompt for a half-page portfolio summary with two sections: + 1) Portfolio Goal — one unified mission synthesized across ALL projects (from project/repo MDs/readme fallbacks). + 2) Recent Developments — PRIMARY: concatenation of 'Recent Developments' from project MDs; FALLBACK: rollup metrics. 
+ """ + m = compute_portfolio_metrics(projects) + + # 1) Unified goal corpus (all available text; uncapped) + goal_corpus = build_balanced_goal_corpus(projects) + has_goal = bool(goal_corpus.strip()) + + # 2) Primary activity source: concatenate each project's Recent Developments from its MD + activity_corpus = build_portfolio_activity_corpus_from_project_mds( + projects, window_label + ) + has_activity_md = bool(activity_corpus.strip()) + + # 3) Small set of inlineable examples (used only if we fall back) + def pick_inline_examples(ps: List[Dict[str, Any]], max_n: int = 4) -> List[str]: + """ + Choose at most one example per project before repeating (round-robin across projects), + preferring releases -> PRs -> commits -> issues. + """ + buckets = ["releases", "pull_requests", "commits", "issues"] + # Pre-extract first candidate per bucket per project + by_project = [] + for p in sorted(ps, key=lambda x: (x.get("project_id") or "")): + ex = p.get("recent_examples") or {} + cand = None + for b in buckets: + arr = ex.get(b) or [] + if arr: + it = arr[0] + name = ( + it.get("title") + or it.get("release_name") + or it.get("message_headline") + or b + ) + url = ( + it.get("pr_url") + or it.get("release_url") + or it.get("commit_url") + or it.get("issue_url") + ) + if url: + cand = f"[{name}]({url})" + break + if cand: + by_project.append(cand) + # Round-robin: one per project, then stop at max_n + return by_project[:max_n] + + examples = pick_inline_examples(projects, max_n=4) + examples_str = "; ".join(examples) if examples else "-" + # 4) Project list (context only; do not require the model to list them) + proj_list = ( + ", ".join( + [ + f"{p.get('project_id')}" + for p in sorted(projects, key=lambda x: (x.get("project_id") or "")) + ] + ) + or "-" + ) + + # 5) Final prompt + return textwrap.dedent( + f""" +You are writing an **executive summary** for a research **portfolio** (multiple projects involving multiple repos possible). 
+ +STRICT RULES +- Use ONLY the GOAL CORPUS and, if present, the ACTIVITY CORPUS below; no outside knowledge. +- If ACTIVITY CORPUS is empty, use PORTFOLIO METRICS (fallback) for 'Recent Developments'. +- Do not output bullet lists of dated events. Synthesize **what actually changed**. +- Use ONLY the information below. Do not invent anything. Do **not** list individual project names in the output. +- Balance coverage across projects; all projects inform the narrative. +- Do not let a single project dominate more than ~40% of sentences; highlight cross-cutting themes spanning multiple projects when possible, but all projects must inform narrative. +- Inline links are allowed when they aid the narrative. +- Support claims with inline Markdown links **only** within the sentence/statement within the narrative and prose. The paragraphs must flow. +- No dated bullet lists or lists; synthesize into concise paragraphs. +- Do not write bracketed anchors like “[commit …]”, “[PR …]”, or “[issue …]” under any circumstance. +- Do not name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.) under any circumstance. +- Do not add a link without it being hyperlinked under any circumstance. +- This is how to include incline links: +EXAMPLE 1: +GOOD: A [consolidating pull request]((https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1)) further structured these changes. +BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +EXAMPLE 2: +GOOD: A [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows. +BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268). 
+EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). Instead use inline links within the narrative): +GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) +BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37). + +- Headings must appear exactly as below. +- Keep the whole report under ~250 words. Use these exact sections: + +# Executive Summary +## Portfolio Summary and Goal +Write a single, unified 2-5 sentence mission that captures the overall summary of the projects and what ALL projects together aim to achieve. +Base this ONLY on GOAL CORPUS. You MUST synthesize across ALL content, not a subset. +Do not name repositories, and only name projects if it aids in the narrative. {"Do NOT write 'Not stated'." if has_goal else "If the corpus is empty, write 'Not stated'."} +Also identify the scientific communities or users who benefit if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals); +otherwise do not include this sentence at all. Keep this brief (1–2 sentences). + +## Recent Developments ({window_label}) +If ACTIVITY CORPUS is present, synthesize from it. Otherwise, use PORTFOLIO METRICS (fallback). +Explain substantive work (features, fixes, refactors, tests, infra, docs), issues addressed, and progress made. +Think big picture: are multiple issues or commits working towards the same goal? 
def main():
    """
    CLI entry point: build the portfolio executive summary and write it as Markdown.

    Steps:
      1. Parse ``--model``, ``--window-label``, ``--only`` and ``--out``.
      2. Load ``_portfolio.json`` from ``SUMMARY_DIR`` and optionally keep only
         the project ids given via ``--only``.
      3. Build the overview prompt, send it to the LLM, and write the result
         (plus a generated-on footer) to ``--out``.

    Raises:
        SystemExit: If the portfolio JSON is missing or no project remains
            after filtering.
    """
    parser = argparse.ArgumentParser(
        description="Generate a portfolio executive report for ALL projects."
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="OpenAI model name (default from env or gpt-4o-mini)",
    )
    parser.add_argument(
        "--window-label",
        default="the last 90 days",
        help='Label for the time window, e.g., "May–July 2025"',
    )
    parser.add_argument(
        "--only",
        nargs="*",
        default=None,
        help="Optional list of project IDs to include",
    )
    parser.add_argument(
        "--out",
        default=os.path.join(REPORTS_DIR, "_portfolio_full.md"),
        help="Output Markdown path (default: reports/_portfolio_full.md)",
    )
    args = parser.parse_args()

    # 1) Load the machine portfolio JSON created by rollup_projects.py
    portfolio_path = os.path.join(SUMMARY_DIR, "_portfolio.json")
    if not os.path.exists(portfolio_path):
        raise SystemExit(f"Missing {portfolio_path}. Run rollup_projects.py first.")
    portfolio = read_json(portfolio_path)

    # Extract the list of project dicts
    projects = portfolio.get("projects") or []

    # Optional: filter to a subset of project IDs
    if args.only:
        keep = set(args.only)
        projects = [p for p in projects if p.get("project_id") in keep]

    # Ensure we have something to summarize
    if not projects:
        raise SystemExit("No projects to summarize (after filtering).")

    # Make sure the destination directory exists BEFORE the LLM call, so a
    # custom --out path cannot fail after the request has already been made.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # 2) Build the portfolio-level "Executive Overview" text
    overview_prompt = build_portfolio_overview_prompt(
        projects, args.window_label, args.model
    )
    system_prompt = (
        "You are a careful, evidence-bound summarizer that follows directions exactly."
        "You take information from multiple projects and summarize it into a cohesive and succint excecutive summary, highlighting key themes. "
        "Your summaries on the project's activity highlight the overall scope of the work done and the work progress. "
        "You are very observant and are able to take multiple project's progress and identify general trends of 'what work has been done across all projects'. "
        "Use ONLY the information in the user message; no external knowledge. "
        "Output exactly two Markdown sections with these headings and nothing else: "
        f"'## Portfolio Summary and Goal' and '## Recent Developments ({args.window_label})'. "
        "No bullets. No owners. No generic KPIs. No fluff. "
    )
    overview_text = call_llm(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": overview_prompt},
        ],
        model=args.model,
    )

    project_count = len(projects)
    md = (
        f"# Portfolio Summary - {project_count} projects ({args.window_label})\n\n"
        f"{overview_text}\n"
        f"{_footer()}\n"
    )

    with open(args.out, "w", encoding="utf-8") as f:
        f.write(md)
    print(f"Wrote {args.out}")
exist_ok=True) +_GOAL_RE = re.compile( + r"^##\s*Summary and Goal\s*\n(.*?)(?:\n##\s+|\Z)", re.DOTALL | re.MULTILINE +) +_ACTIVITY_RE = re.compile( + r"^##\s*Recent Developments.*?\n(.*?)(?:\n##\s+|\Z)", re.DOTALL | re.MULTILINE +) + + +# -------- Small helpers -------- +def _read_repo_activity_from_md(project_id: str, owner: str, repo: str) -> str | None: + path = os.path.join(REPORTS_DIR, f"{project_id}__{owner}__{repo}.md") + if not os.path.exists(path): + return None + with open(path, "r", encoding="utf-8") as f: + text = f.read() + m = _ACTIVITY_RE.search(text) + if not m: + return None + g = m.group(1).strip() + return g or None + + +def read_json(p: str) -> dict: + """Load a JSON file into a Python dict.""" + with open(p, "r", encoding="utf-8") as f: + return json.load(f) + + +def _footer(): + dmy = datetime.now(timezone.utc).strftime("%d/%m/%Y") + return f"\n\n*Report generated using A.I. on {dmy}*" + + +def _read_repo_goal_from_md(project_id: str, owner: str, repo: str) -> str | None: + path = os.path.join(REPORTS_DIR, f"{project_id}__{owner}__{repo}.md") + if not os.path.exists(path): + return None + text = open(path, "r", encoding="utf-8").read() + m = _GOAL_RE.search(text) + if not m: + return None + g = m.group(1).strip() + return g or None + + +def build_project_goal_corpus_from_repo_mds( + project: dict, per_repo_char_cap: int = 600 +) -> str: + """ + Collect the '## Goal' text from all repo-level MD files for this project. 
+ Prefer an explicit 'repos' list from the project JSON; otherwise, scan reports/* files + named like reports/____.md + """ + pid = str(project.get("project_id") or "").strip() + parts = [] + + # small fairness cap per repo; keeps the corpus balanced + def _cap_for(n, soft_total=8000, min_cap=250, max_cap=900): + return max(min_cap, min(max_cap, soft_total // max(1, n))) + + # 1) If project JSON lists repos, use that (best) + repos = sorted( + project.get("repos") or [], + key=lambda r: (r.get("owner") or "", r.get("repo") or ""), + ) + per_cap = _cap_for(max(1, len(repos) or 1)) + for r in repos: + owner, repo = r.get("owner"), r.get("repo") + if not (owner and repo): + continue + g = _read_repo_goal_from_md(pid, owner, repo) + if g: + parts.append(f"[REPO {owner}/{repo}]\n{g.strip()[:per_cap]}") + + # 2) Fallback: scan reports for any repo files that match this project id + if not parts: + # Fallback: scan and label explicitly + for path in sorted(glob.glob(os.path.join(REPORTS_DIR, f"{pid}__*__*.md"))): + # filename pattern: ____.md + m = re.match( + rf"^{re.escape(pid)}__([^_]+)__(.+)\.md$", os.path.basename(path) + ) + if not m: + continue + owner, repo = m.group(1), m.group(2) + g = _read_repo_goal_from_md(pid, owner, repo) + if g: + parts.append(f"[REPO {owner}/{repo}]\n{g.strip()[:per_cap]}") + + return ("\n\n".join(parts)).strip() + + +def build_project_activity_corpus_from_repo_mds( + project: dict, window_label: str, per_repo_char_cap: int = 900 +) -> str: + """ + Collect '## Recent Developments' blocks from all repo-level MD files for this project. + Skips the exact no-activity boilerplate line to avoid noise. + """ + pid = str(project.get("project_id") or "").strip() + parts = [] + + # If project JSON lists repos, prefer that; otherwise scan by prefix. 
def call_llm(prompt: str, model: str, max_retries: int = 4) -> str:
    """
    Send ``prompt`` to the chat-completions API with bounded retries.

    The request uses a fixed system message plus ``prompt`` as the user
    message, and is issued through the module-level ``client``. Any exception,
    including an empty completion, counts as a failed attempt. Between
    attempts the call sleeps ``min(2**attempt, 10)`` seconds.

    Args:
        prompt: User-message content.
        model: Model name passed to the API.
        max_retries: Maximum number of attempts.

    Returns:
        The stripped completion text of the first successful attempt.

    Raises:
        RuntimeError: If every attempt fails (or ``max_retries`` < 1); the
            last underlying error is attached as the cause.
    """
    # NOTE(review): the heading list below ends with "()" and no closing
    # quote/space before "No bullets." — confirm the intended wording.
    system_prompt = (
        "You are a careful, evidence-bound summarizer that follows directions exactly."
        "You take information from multiple repositories and summarize it into a cohesive and succint excecutive summary, highlighting key themes. "
        "Your summaries on the project's activity highlight the overall scope of the work done and the work progress across ALL repositories. "
        "You are very observant and are able to take multiple respoitories' progress and identify general trends of 'what work has been done across all repositories'. "
        "Use ONLY the information in the user message; no external knowledge. "
        "Output exactly two Markdown sections with these headings and nothing else: "
        "'## Summary and Goal' and '## Recent Developments ()"
        "No bullets. No owners. No generic KPIs. No fluff. "
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Initialised up front so the final raise works even if the loop never runs.
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=messages,
            )
            text = (resp.choices[0].message.content or "").strip()
            if not text:
                raise RuntimeError("Empty completion")
            return text
        except Exception as e:
            last_err = e
            # Back off only when another attempt will follow.
            if attempt < max_retries:
                time.sleep(min(2**attempt, 10))
    raise RuntimeError(
        f"LLM failed after {max_retries} attempts: {last_err}"
    ) from last_err
+ """ + if not items: + return [] + buckets = {} + for it in items: + if not isinstance(it, dict): + continue + key = (it.get("owner"), it.get("repo")) + buckets.setdefault(key, []).append(it) + # round-robin across repos + lines, took = [], {k: 0 for k in buckets} + while len(lines) < n_total: + progressed = False + for k, arr in buckets.items(): + if took[k] >= min(per_repo, len(arr)): + continue + it = arr[took[k]] + took[k] += 1 + parts = [str(it.get(f)) for f in fields if it.get(f)] + if parts: + # prefix repo for even clearer balance to the model: + lines.append(f"[{k[0]}/{k[1]}] — " + " — ".join(parts)) + progressed = True + if len(lines) >= n_total: + break + if not progressed: + break + return lines + + commit_lines = ex_lines_grouped( + examples.get("commits"), + ["committed_at", "author_login", "message_headline", "commit_url"], + ) + pr_lines = ex_lines_grouped( + examples.get("pull_requests"), + ["created_at", "author_login", "title", "pr_url", "state"], + ) + issue_lines = ex_lines_grouped( + examples.get("issues"), + ["created_at", "author_login", "title", "issue_url", "labels"], + ) + release_lines = ex_lines_grouped( + examples.get("releases"), + ["published_at", "release_name", "release_tag", "release_url"], + ) + + # Activity-present flag: prefer the repo-MD view; else infer from rollup examples + activity_present = bool(has_activity_md) or any( + bool(examples.get(k)) + for k in ["commits", "pull_requests", "issues", "releases"] + ) + + repo_list = ( + ", ".join( + [f"{r['owner']}/{r['repo']}" for r in (project_summary.get("repos") or [])] + ) + or "(unknown)" + ) + return f""" +You are writing an **executive summary** for a research **project** (multiple repos possible). + +STRICT RULES +- Use ONLY the GOAL CORPUS and (if present) the ACTIVITY CORPUS from repo-level .md files; no outside knowledge. +- If ACTIVITY CORPUS is empty, use the EVIDENCE block as a fallback for "Recent Developments". +- Do not output bullet lists of dated events. 
Synthesize **what actually changed** given the changes observed. +- If ACTIVITY_PRESENT=no, under “Recent Developments” write exactly: **No changes in {window_label}**. Do not write ACTIVITY_PRESENT=yes or no. +- Use inline links inside the prose only when it adds to the narrative or when the example is truly informative (e.g., "...includes [work to fix X](link to where X is fixed)). +- Inline links to commit/change/issue are only present when it's very representative of the point you are trying to make. +- Avoid letting any single repository account for most of the narrative. +- Balance coverage across repositories; ensure all repositories are represented where possible. +- Do not under any circumstance name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.) +- Do not let a single repository dominate the narrative; integrate themes spanning multiple repos. +- Do not add a link without it being hyperlinked under any circumstance. +- This is how to include incline links: +EXAMPLE 1: +GOOD: A [consolidating pull request]((https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1)) further structured these changes. +BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +EXAMPLE 2: +GOOD: A [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows. +BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268). +EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). 
Instead use inline links within the narrative): +GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) +BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37). +- Keep the whole report under ~250 words. Use these 2 exact sections: + +## Summary and Goal +Write 2–8 sentences that **synthesize a single project summary and goal** across ALL repositories. +Base ONLY on GOAL CORPUS. Do **not** list repository names, but you can reference repositories if needed in the narrative. {"Do NOT write 'Not stated'." if has_goal else "If the corpus is empty, write 'Not stated'."} +Also identify the scientific communities or users who benefit if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals); +otherwise do not include this sentence at all. Keep this brief (1–2 sentences). + +## Recent Developments ({window_label}) +If ACTIVITY CORPUS is present, synthesize from it. Otherwise use EVIDENCE as the source. +Explain the **substance** of changes across repos (features/fixes/docs/refactor/tests/infra/deps), +what areas of the codebase were touched (infer from titles/file cues if present), issues addressed and the scope of issues opened, and releases. +Support claims with **inline links** to specific commits/PRs/issues/releases only when it fits the narrative. Do not list dates or create a timeline. 
def write_report(
    project_id: str, project_name: str, repo_count: int, window_label: str, body_md: str
):
    """
    Write one project's executive summary to REPORTS_DIR/<project_id>.md.

    The file consists of an H1 title (project name or, if that is falsy, the
    project id; repository count; window label), a blank line, the stripped
    Markdown body, the shared footer, and a trailing newline.
    """
    heading = (
        f"# Executive Summary: Project {project_name or project_id}"
        f" — {repo_count} repositories — {window_label}"
    )
    target = os.path.join(REPORTS_DIR, f"{project_id}.md")
    with open(target, "w", encoding="utf-8") as fh:
        fh.write("".join((heading, "\n\n", body_md.strip(), _footer(), "\n")))
    print(f"Wrote {target}")


def summarize_project_file(path: str, window_label: str, model: str):
    """
    Summarize a single project rollup JSON and write its Markdown report.

    Reads the JSON at `path`, builds the project prompt, asks the LLM for the
    report body and hands the result to write_report(). If the LLM call raises
    (or yields empty text) a small stub describing the failure is written
    instead, so a report file is still produced for the project.
    """
    payload = read_json(path)
    project_id = payload.get("project_id", "UNKNOWN")
    project_name = payload.get("project_name") or project_id
    n_repos = int(payload.get("repo_count") or 0)

    # The prompt derives the Goal strictly from the repo-level MD Goals.
    llm_prompt = make_project_prompt(payload, window_label)

    try:
        body = call_llm(llm_prompt, model)
        if not body:
            raise RuntimeError("LLM returned empty content")
    except Exception as err:
        # Keep the pipeline producing files even when the model is unavailable.
        body = "\n".join(
            [
                "# Executive Summary",
                "",
                f"_LLM call failed: {err}_",
                "",
                f"- Project ID: {project_id}",
                f"- Project Name: {project_name}",
                f"- Window: {window_label}",
                "",
            ]
        )

    write_report(project_id, project_name, n_repos, window_label, body)
# ---------- LLM helper with simple retries ----------
def _footer():
    """Return the Markdown footer: an italic A.I. notice with today's UTC date (DD/MM/YYYY)."""
    dmy = datetime.now(timezone.utc).strftime("%d/%m/%Y")
    return f"\n\n*Report generated using A.I. on {dmy}*"


def call_llm(messages, model: str, max_retries: int = 4) -> str:
    """
    Thin wrapper around chat.completions.create with a bounded retry loop.

    Args:
        messages: chat messages passed straight through to the API.
        model: model name to request.
        max_retries: total number of attempts before giving up.

    Returns:
        The stripped text of the first choice.

    Raises:
        RuntimeError: if every attempt failed or returned empty content. The
            last underlying error is attached as ``__cause__``.

    The per-attempt timeout comes from the module-level client configuration.
    """
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.chat.completions.create(model=model, messages=messages)
            txt = (resp.choices[0].message.content or "").strip()
            if not txt:
                raise RuntimeError("Empty completion")
            return txt
        except Exception as e:
            last_err = e
            # Exponential backoff between attempts: 2, 4, 8, then capped at 10.
            # No sleep after the final attempt -- there is nothing left to wait for.
            if attempt < max_retries:
                time.sleep(min(2**attempt, 10))
    raise RuntimeError(
        f"LLM failed after {max_retries} attempts: {last_err}"
    ) from last_err


# ---------- Load the combined per-table Parquets ----------
def load_repo_frames():
    """
    Read combined tables from data/clean into a dict of DataFrames.
    If a table is missing, return an empty DataFrame for that key.
    This keeps downstream logic simple (no KeyErrors).
    """
    tables = {}
    for name in [
        "commits",
        "issues",
        "pull_requests",
        "releases",
        "stargazers",
        "forks",
        "pr_files",
    ]:
        path = os.path.join(CLEAN_DIR, f"_all_{name}.parquet")
        if os.path.exists(path):
            tables[name] = pd.read_parquet(path)
        else:
            tables[name] = pd.DataFrame()
    return tables


def load_seed(path: str = SEED_CSV) -> pd.DataFrame:
    """
    Load the project seed CSV.

    Any of the columns project_id, project_name, owner, repo that is absent
    from the file is added and filled with None, so callers can rely on all
    four being present.
    """
    df = pd.read_csv(path)
    for col in ["project_id", "project_name", "owner", "repo"]:
        if col not in df.columns:
            df[col] = None
    return df


def read_raw_json(owner: str, repo: str) -> dict | None:
    """Return the parsed RAW_DIR/<owner>__<repo>.json, or None if that file does not exist."""
    path = os.path.join(RAW_DIR, f"{owner}__{repo}.json")
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# ---------- Pull a single repo's context (goal sources) ----------
def extract_repo_context(tables, owner, repo):
    """
    Collect description/homepage/readme for one repository.

    Each field is taken from the first combined table that has a non-null
    value for it among the rows matching (owner, repo). Fields that no table
    can supply are then looked up in the raw JSON fetched earlier. A table
    that merely contains rows for the repo but lacks these columns no longer
    short-circuits the search with an all-None result.

    Args:
        tables: mapping of table name -> DataFrame (non-DataFrames are skipped).
        owner: repository owner; compared as a string.
        repo: repository name; compared as a string.

    Returns:
        dict with keys "description", "homepage", "readme"; a value is None
        when neither the tables nor the raw JSON provide it.
    """
    # context key -> column name in the combined tables
    table_cols = {
        "description": "repo_description",
        "homepage": "repo_homepage",
        "readme": "readme_text",
    }
    # context key -> key in the raw JSON payload
    raw_keys = {
        "description": "description",
        "homepage": "homepageUrl",
        "readme": "__readme_text",
    }
    ctx = dict.fromkeys(table_cols)

    def first_non_null(sub, col):
        if col in sub.columns:
            s = sub[col].dropna()
            if not s.empty:
                return s.iloc[0]
        return None

    # Try tables first
    for df in tables.values():
        if not isinstance(df, pd.DataFrame) or df.empty:
            continue
        if "owner" not in df.columns or "repo" not in df.columns:
            continue
        mask = (df["owner"].astype(str) == str(owner)) & (
            df["repo"].astype(str) == str(repo)
        )
        sub = df.loc[mask]
        if sub.empty:
            continue
        for key, col in table_cols.items():
            if ctx[key] is None:
                ctx[key] = first_non_null(sub, col)
        if all(value is not None for value in ctx.values()):
            return ctx

    # Fallback: raw JSON, only for the fields the tables could not provide
    raw = read_raw_json(owner, repo) or {}
    for key, raw_key in raw_keys.items():
        if ctx[key] is None:
            ctx[key] = raw.get(raw_key)
    return ctx
# ---------- Build the LLM prompt for ONE repo ----------


def _urls_only(df: pd.DataFrame, url_col: str, max_n: int = 12) -> list[str]:
    """
    Return a de-duped, order-preserving list of URLs from df[url_col].

    Values starting with "github.com/" get an "https://" prefix; anything that
    then does not start with "http" is dropped. At most `max_n` URLs are
    returned (none when `max_n` is zero or negative).
    """
    if not isinstance(df, pd.DataFrame) or df.empty or url_col not in df.columns:
        return []
    if max_n <= 0:
        return []
    seen, out = set(), []
    for u in df[url_col].dropna().astype(str):
        if u.startswith("github.com/"):
            u = "https://" + u
        if u.startswith("http") and u not in seen:
            seen.add(u)
            out.append(u)
            if len(out) >= max_n:
                break
    return out


def _identity_signals(tables, owner, repo, max_items=6):
    """
    Return two short lists describing recent stargazer/fork identities for this repo.

    Each entry looks like "login (name) — meta1; meta2", where the name and the
    metadata are included only when they are non-blank strings. A list with no
    entries is returned as ["(none)"]. Tables that are missing, empty, or lack
    the owner/repo columns contribute no entries instead of raising.

    Returns:
        (star_lines, fork_lines), each holding at most `max_items` strings.
    """

    def _safe(s):
        return s if isinstance(s, str) and s.strip() else None

    def _recent_rows(df, time_col):
        # Rows for this repo, newest first when the timestamp column exists.
        if not isinstance(df, pd.DataFrame) or df.empty:
            return pd.DataFrame()
        if "owner" not in df.columns or "repo" not in df.columns:
            return pd.DataFrame()
        sub = df[(df["owner"] == owner) & (df["repo"] == repo)].copy()
        if time_col in sub.columns:
            sub = sub.sort_values(time_col, ascending=False)
        return sub.head(max_items)

    def _format(login, name, meta_parts):
        bits = [f"{login}" + (f" ({name})" if name else "")]
        meta = "; ".join([x for x in meta_parts if x])
        if meta:
            bits.append(meta)
        return " — ".join(bits)

    star_lines = []
    for _, r in _recent_rows(tables.get("stargazers"), "starred_at").iterrows():
        star_lines.append(
            _format(
                _safe(r.get("stargazer_login")) or "unknown",
                _safe(r.get("stargazer_name")),
                [
                    _safe(r.get("stargazer_company")),
                    _safe(r.get("stargazer_location")),
                    _safe(r.get("stargazer_orgs")),
                ],
            )
        )

    fork_lines = []
    for _, r in _recent_rows(tables.get("forks"), "fork_created_at").iterrows():
        typ = _safe(r.get("fork_owner_type"))  # User/Organization
        # The org description is only meaningful for organization-owned forks.
        orgd = (
            _safe(r.get("fork_owner_org_description"))
            if typ == "Organization"
            else None
        )
        fork_lines.append(
            _format(
                _safe(r.get("fork_owner_login")) or "unknown",
                _safe(r.get("fork_owner_name")),
                [typ, _safe(r.get("fork_owner_location")), orgd],
            )
        )

    return (star_lines or ["(none)"], fork_lines or ["(none)"])
+ """ + + commits = tables.get("commits", pd.DataFrame()) + issues = tables.get("issues", pd.DataFrame()) + prs = tables.get("pull_requests", pd.DataFrame()) + rels = tables.get("releases", pd.DataFrame()) + + c_sub = _sorted_desc( + ( + commits[(commits.get("owner") == owner) & (commits.get("repo") == repo)] + if isinstance(commits, pd.DataFrame) and not commits.empty + else pd.DataFrame() + ), + "committed_at", + ) + i_sub = _sorted_desc( + ( + issues[(issues.get("owner") == owner) & (issues.get("repo") == repo)] + if isinstance(issues, pd.DataFrame) and not issues.empty + else pd.DataFrame() + ), + "created_at", + ) + pr_sub = _sorted_desc( + ( + prs[(prs.get("owner") == owner) & (prs.get("repo") == repo)] + if isinstance(prs, pd.DataFrame) and not prs.empty + else pd.DataFrame() + ), + "created_at", + ) + r_sub = _sorted_desc( + ( + rels[(rels.get("owner") == owner) & (rels.get("repo") == repo)] + if isinstance(rels, pd.DataFrame) and not rels.empty + else pd.DataFrame() + ), + "published_at", + ) + + def _has_activity(c_sub, pr_sub, i_sub, r_sub) -> bool: + return any( + [ + isinstance(c_sub, pd.DataFrame) and not c_sub.empty, + isinstance(pr_sub, pd.DataFrame) and not pr_sub.empty, + isinstance(i_sub, pd.DataFrame) and not i_sub.empty, + isinstance(r_sub, pd.DataFrame) and not r_sub.empty, + ] + ) + + activity_present = _has_activity(c_sub, pr_sub, i_sub, r_sub) + + goal_text = (ctx.get("readme") or ctx.get("description") or "Not stated.").strip() + + # Identity signals for grounded beneficiaries + star_lines, fork_lines = _identity_signals(tables, owner, repo) + + # Minimal evidence buffers (for inline linking; NOT to be printed as lists) + commit_urls = _urls_only(c_sub, "commit_url", max_n=8) + pr_urls = _urls_only(pr_sub, "pr_url", max_n=8) + issue_urls = _urls_only(i_sub, "issue_url", max_n=8) + release_urls = _urls_only(r_sub, "release_url", max_n=8) + + return textwrap.dedent( + f""" +You are writing an **executive summary** for ONE repository. 
+ +STRICT RULES +- Write succinct language and do not repeat yourself. +- Keep total under ~250 words. +- Use ONLY the facts below (GOAL SOURCE, EVIDENCE, IDENTITY SIGNALS). No outside knowledge. +- Only inline link a commit/change/issue when it's very representative of the point you are trying to make or when it adds to the narrative. +- If ACTIVITY_PRESENT=no, or there is no recent activity, under “Recent Developments” write EXACTLY: **No changes in {window_label}**. Do not write ACTIVITY_PRESENT=yes or no. +- Support claims with inline Markdown links **only** within the sentence/statement within the narrative and prose. The paragraphs must flow. +- No dated bullet lists or lists; synthesize into concise paragraphs. +- Do not write bracketed anchors like “[commit …]”, “[PR …]”, or “[issue …]” under any circumstance. +- Do not name any pull request, commit or issue by name (i.e., pull request 1, commit 70bcd7e6, etc.) under any circumstance. +EXAMPLE 1: +GOOD: A [consolidating pull request]((https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1)) further structured these changes. +BAD: A consolidating pull request further structured these changes, as seen in [pull request #1](https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +ALSO BAD: A consolidating pull request further structured these changes (https://github.com/ivichadriana/deconvolution_sc_sn_comparison/pull/1). +EXAMPLE 2: +GOOD: A [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268) advanced end-to-end workflows. +BAD: A consolidating PR advanced end-to-end workflows [consolidating PR](https://github.com/dashnowlab/STRchive/pull/268). +EXAMPLE 3 (Do not reference EVIDENCE by name (commit names, issue names, etc.). 
Instead use inline links within the narrative): +GOOD: ...and [entrypoint logic to span development](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37), test, and production contexts, with [cloud-function deployment made more generic](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) +BAD: ...and entrypoint logic to span development, test, and production contexts, with cloud-function deployment made more generic [2e6cde0bf73e288d4beeb9a46cec3fc5bb491503](https://github.com/JRaviLab/molevolvr2.0/commit/2e6cde0bf73e288d4beeb9a46cec3fc5bb491503) and [6e123e18aa8cb3a26c1432ee945ea1f9575b8e37](https://github.com/JRaviLab/molevolvr2.0/commit/6e123e18aa8cb3a26c1432ee945ea1f9575b8e37). +- Keep the whole report under ~250 words. Use these 2 exact sections: + +## Summary and Goal +Write 2–8 crisp sentences describing the repo’s summary and purpose from GOAL SOURCE only. You do not need to state which repository you are describing. +Evaluate what the codebase aims to do as a whole, big picture, not just within the code. +Like what are the researchers aiming to do through this code? What are they researching? How are they researching it, and what is the goal? +Also identify the scientific communities or users who benefit from the research if there is explicit evidence in GOAL CORPUS or STAR + FORKS (identity signals); +otherwise do not include this sentence at all. Keep this brief (1–2 sentences). + +## Recent Developments ({window_label}) +Write 2–10 crisp sentences that explains **what changed**, not when: summarize the scope and the substance of changes (features/fixes/docs/refactor/tests/infra/deps), +what parts of the codebase were affected (infer from file names or titles if apparent), and any issues/release outcomes, what work has been done. +Think big picture: are multiple issues or commits working towards the same goal? 
# ---------- CLI entrypoint ----------
def main():
    """
    CLI entrypoint: write one Markdown executive summary per repository.

    For every (project, owner, repo) found in the clean tables or the seed:
      - gather README/description/homepage context,
      - try to derive the goal from a shallow clone of the code (on failure,
        keep whatever context was found),
      - build the prompt, call the LLM, and write
        <out-dir>/<pid>__<owner>__<repo>.md (a stub is written if the LLM fails),
      - delete the clone for that repo.
    Finally the clone cache directory is emptied.

    Raises:
        SystemExit: if no repositories are found.
    """
    parser = argparse.ArgumentParser(
        description="Generate repository-level summaries grouped by project."
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="OpenAI model (default from env or gpt-5-nano)",
    )
    # The tiered models default to None so they can follow --model, as their
    # help text promises, instead of silently sticking to DEFAULT_MODEL.
    parser.add_argument(
        "--model-high",
        default=None,
        help="OpenAI model for higher analysis (default: value of --model/DEFAULT_MODEL)",
    )
    parser.add_argument(
        "--model-low",
        default=None,
        help="OpenAI model for lower analysis (default: value of --model/DEFAULT_MODEL)",
    )
    parser.add_argument(
        "--model-medium",
        default=None,
        help="OpenAI model for medium analysis (default: value of --model/DEFAULT_MODEL)",
    )
    parser.add_argument(
        "--window-label",
        default="the last 90 days",
        help='Label for the time window (e.g. "May–July 2025")',
    )
    parser.add_argument(
        "--out-dir", default=REPORTS_DIR, help="Directory to write repo-level reports"
    )
    args = parser.parse_args()

    model_high = args.model_high or args.model
    model_low = args.model_low or args.model
    model_medium = args.model_medium or args.model

    # A custom --out-dir may not exist yet.
    os.makedirs(args.out_dir, exist_ok=True)

    tables = load_repo_frames()
    seed_df = load_seed(SEED_CSV)
    clone_root = pathlib.Path("data/clones_goals")

    repos = group_by_repo(tables, seed_df)
    if not repos:
        raise SystemExit(
            "No repositories found in clean tables. Run fetch/normalize first."
        )

    # Brief, consistent system instruction shared by every repo.
    system_msg = (
        "You are a careful, evidence-bound summarizer that follows directions exactly. "
        "You take information from a repository (code and activity) and summarize it into a cohesive and succinct excecutive summary, "
        "highlighting key themes in issues, pull requests, users, etc. "
        "You are very observant and are able to take multiple issues, pull requests, etc. "
        "and identify general trends of 'what work has been done' and 'what key issues or work pop up consistenly'. "
        "Use ONLY the information in the user message; no external knowledge. "
        "Output exactly two Markdown sections with these headings and nothing else: "
        f"'## Summary and Goal' and '## Recent Developments ({args.window_label})'. "
        "No bullets. No owners. No generic KPIs. No fluff. "
    )

    # Emit one Markdown per repo
    for pid, pname, owner, repo in repos:
        # Extract README/description/homepage context for this repo
        ctx = extract_repo_context(tables, owner, repo)

        # Reset per iteration so cleanup never touches a previous repo's path
        # (or an unbound name) when the clone itself fails.
        repo_path = None

        # ALWAYS derive the Goal from the full repo code via shallow clone + map-reduce
        try:
            repo_path = shallow_clone(owner, repo, clone_root)
            code_goal = synthesize_repo_goal_from_code(
                repo_path,
                model_high=model_high,
                model_low=model_low,
                model_medium=model_medium,
                call_llm_fn=call_llm,
            )
            # Inject into ctx so build_repo_prompt uses it as GOAL SOURCE
            # (build_repo_prompt prefers ctx["readme"] over description).
            ctx = dict(ctx)
            ctx["readme"] = code_goal
        except Exception as e:
            # Fallback: keep whatever extract_repo_context found (README/description/raw)
            print(f"[warn] Code-derived goal failed for {owner}/{repo}: {e}")

        # Build the LLM prompt for this repo
        prompt = build_repo_prompt(
            pid, pname, owner, repo, ctx, tables, args.window_label
        )

        try:
            summary = call_llm(
                [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt},
                ],
                model=args.model,
            )
        except Exception as e:
            # On failure, write a minimal stub so the pipeline still produces files
            summary = f"_LLM call failed: {e}_"

        # Format the Markdown with a clear title
        title = f"# Executive Summary: {owner}/{repo} — {pname} ({pid}) — {args.window_label}"
        md = title + "\n\n" + summary + _footer() + "\n"

        # <out-dir>/<pid>__<owner>__<repo>.md
        out_path = os.path.join(args.out_dir, f"{pid}__{owner}__{repo}.md")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(md)
        print(f"Wrote {out_path}")

        # Clean up this repo's clone to avoid disk growth
        if repo_path is not None:
            try:
                delete_clone_path(repo_path)
            except Exception as e:
                print(f"[warn] cleanup failed for {owner}/{repo}: {e}")

    try:
        if clone_root.exists():
            shutil.rmtree(clone_root)
        # leave an empty folder for next run
        clone_root.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        print(f"[warn] failed to clear clone cache at {clone_root}: {e}")


if __name__ == "__main__":
    main()