diff --git a/.gitignore b/.gitignore index 53d847c..a88fce0 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ dist/ *.pyc *.sublime-workspace st_package_reviewer/_version.py +.thecrawl/ diff --git a/README.md b/README.md index 7f5d603..9537e94 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,11 @@ reported by the tool, [refer to the wiki][wiki]. +## Usage as a GitHub Action + +See gh_action/README.md for how to run this as a composite action that runs on channel/registry PRs. + + ## Installation Requires **Python 3.13**. diff --git a/gh_action/README.md b/gh_action/README.md new file mode 100644 index 0000000..b483c15 --- /dev/null +++ b/gh_action/README.md @@ -0,0 +1,55 @@ +# PR Channel Action + +This composite action diffs a Package Control channel registry between a PR’s base and head commits, crawls only the changed and added packages using your thecrawl, downloads each release archive, and runs `st_package_reviewer` on the extracted contents. The job fails if any crawl, download, unzip, or review step fails. + +## Inputs + +- `pr` (required): Full PR URL, e.g. `https://github.com/wbond/package_control_channel/pull/9236`. +- `file` (optional): Path to the channel or repository file inside the repo. Default: `repository.json`. +- `thecrawl` (optional): Path to a local `thecrawl` repo, or a git URL to clone a fork/branch/commit. Default: `https://github.com/packagecontrol/thecrawl` + +You can pin a ref with `@ref` for HTTPS URLs, e.g.: + - `https://github.com/packagecontrol/thecrawl.git@feature-branch` + - `https://github.com/packagecontrol/thecrawl.git@v1.2.3` + - `https://github.com/packagecontrol/thecrawl.git@abc1234` + +## Example Usage + +```yaml +name: Channel Diff and Review +on: + pull_request: + paths: + - 'repository.json' + +jobs: + diff-and-review: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Diff and review changed/added packages + uses: ./gh_action + with: + pr: ${{ github.event.pull_request.html_url }} + file: repository.json + # thecrawl: ../thecrawl # optional path + # thecrawl: https://github.com/packagecontrol/thecrawl@my-branch # optional URL with ref +``` + +## Notes + +- The action ensures `uv` is available via `astral-sh/setup-uv`. GitHub’s hosted runners include `gh` (GitHub CLI) by default. +- If `thecrawl` is not provided, the action clones `https://github.com/packagecontrol/thecrawl`. +- Network access is required to fetch raw files, zipballs, and the GitHub API. For GitHub zipball downloads, the action falls back to `gh api` if `curl` fails. + + +## What It Does + +- Resolves base/head repos and SHAs via `gh pr view`. +- Builds a registry JSON at both SHAs using your local or cloned `thecrawl` (`uv run -m scripts.generate_registry`). +- Diffs registries by package name; prints Removed/Changed/Added to stderr and emits changed+added names to stdout. +- For each changed/added package: + - Runs `uv run -m scripts.crawl --registry --workspace --name `. + - Reads the workspace JSON and downloads each release zip. + - Unpacks the zip and runs `uv run st_package_reviewer `. + - Aggregates failures and fails the job if any occurred. diff --git a/gh_action/action.sh b/gh_action/action.sh new file mode 100755 index 0000000..86606bd --- /dev/null +++ b/gh_action/action.sh @@ -0,0 +1,324 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat >&2 < [--file ] [--thecrawl ] + +Arguments: + --pr GitHub Pull Request URL (e.g. 
https://github.com/wbond/package_control_channel/pull/9236) + --file Path within the repo to the channel JSON (default: repository.json) + --thecrawl Path to local thecrawl repo or URL to clone (supports @ref to pin, default: https://github.com/packagecontrol/thecrawl) + +Requires: gh, uv +EOF +} + +PR_URL="" +REL_PATH="repository.json" +THECRAWL="https://github.com/packagecontrol/thecrawl" + +while [[ $# -gt 0 ]]; do + case "$1" in + --pr) + PR_URL="$2"; shift 2;; + --file) + REL_PATH="$2"; shift 2;; + --thecrawl) + THECRAWL="$2"; shift 2;; + -h|--help) + usage; exit 0;; + *) + echo "Unknown argument: $1" >&2; usage; exit 2;; + esac +done + +if [[ -z "$PR_URL" ]]; then + echo "Error: --pr is required" >&2; usage; exit 2 +fi + +# Normalize relative path (strip leading ./) +REL_PATH="${REL_PATH#./}" + +if ! command -v gh >/dev/null 2>&1; then + echo "Error: gh (GitHub CLI) is required" >&2; exit 2 +fi +if ! command -v uv >/dev/null 2>&1; then + echo "Error: uv is required" >&2; exit 2 +fi + +# When this action is downloaded by GitHub, it does not include the .git folder. +# Our project uses hatch-vcs/setuptools-scm for dynamic versioning, which needs VCS metadata. +# Provide a fallback version so the build backend can proceed when running under Actions. +# Project-specific env var recommended by setuptools-scm +export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_ST_PACKAGE_REVIEWER=${SETUPTOOLS_SCM_PRETEND_VERSION_FOR_ST_PACKAGE_REVIEWER:-0.0.0} +# Generic fallback for environments that ignore the project-specific variant +export SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION:-$SETUPTOOLS_SCM_PRETEND_VERSION_FOR_ST_PACKAGE_REVIEWER} + + +setup_thecrawl() { + local src="$1"; shift || true + [[ -z "$src" ]] && src="https://github.com/packagecontrol/thecrawl" + local target="${GITHUB_WORKSPACE:-$PWD}/.thecrawl" + while [[ $# -gt 0 ]]; do + case "$1" in + --target) target="$2"; shift 2;; + *) echo "Unknown setup_thecrawl arg: $1" >&2; return 2;; + esac + done + + if [[ "$src" =~ ^https?:// || "$src" =~ ^git@ ]]; then + local url_base="$src" ref="" + if [[ "$url_base" =~ ^https?://.+@.+$ ]]; then + ref="${url_base##*@}" + url_base="${url_base%*@$ref}" + fi + if [[ -d "$target/.git" ]]; then + git -C "$target" remote set-url origin "$url_base" >/dev/null 2>&1 || true + if [[ -n "$ref" ]]; then + echo "Checking out thecrawl ref '$ref' in $target" >&2 + git -C "$target" fetch --depth 1 origin "$ref" >&2 + git -C "$target" checkout -q FETCH_HEAD >&2 + fi + echo "$target"; return 0 + fi + if [[ -n "$ref" ]]; then + echo "Cloning thecrawl $url_base at ref '$ref' into $target" >&2 + git init -q "$target" >&2 + git -C "$target" remote add origin "$url_base" >&2 + git -C "$target" fetch --depth 1 origin "$ref" >&2 + git -C "$target" checkout -q FETCH_HEAD >&2 + else + echo "Cloning thecrawl from $url_base into $target" >&2 + git clone --depth 1 "$url_base" "$target" >&2 + fi + echo "$target"; return 0 + fi + echo "$src"; return 0 +} + + +fetch_pr_metadata() { + local pr_url="$1" + BASE_NWO=$(echo "$pr_url" | awk -F/ '{print $4"/"$5}') + IFS=: read -r HEAD_NWO BASE_SHA HEAD_SHA < <( + gh pr view "$pr_url" \ + --json headRepository,baseRefOid,headRefOid \ + -q '[.headRepository.nameWithOwner // "", .baseRefOid, .headRefOid] | join(":")' + ) + if [[ -z "$BASE_NWO" || -z "$BASE_SHA" || -z "$HEAD_SHA" ]]; then + echo "Error: failed to resolve PR details via gh" >&2 + echo " PR: $pr_url" >&2 + echo " base nwo: ${BASE_NWO:-}" >&2 + echo " base sha: ${BASE_SHA:-}" >&2 + echo " head nwo: ${HEAD_NWO:-} (may 
match base)" >&2 + echo " head sha: ${HEAD_SHA:-}" >&2 + echo "Hint:" >&2 + echo " - Commands used: 'gh pr view --json baseRefOid,headRefOid,headRepository'" >&2 + return 2 + fi + if [[ -z "$HEAD_NWO" ]]; then + HEAD_NWO="$BASE_NWO" + fi + BASE_URL="https://raw.githubusercontent.com/${BASE_NWO}/${BASE_SHA}/${REL_PATH}" + HEAD_URL="https://raw.githubusercontent.com/${HEAD_NWO}/${HEAD_SHA}/${REL_PATH}" + echo "Base URL: $BASE_URL" >&2 + echo "Target URL: $HEAD_URL" >&2 +} + + +# Robust ZIP downloader with fallback to gh for GitHub zipball URLs +download_zip() { + local url="$1" dest="$2" + mkdir -p "$(dirname "$dest")" + rm -f "$dest.part" "$dest" + # First try curl with retries + if curl -fSL --retry 3 --retry-all-errors --connect-timeout 15 --max-time 600 \ + -o "$dest.part" "$url"; then + mv "$dest.part" "$dest" + return 0 + fi + rm -f "$dest.part" + # Fallback for codeload.github.com///zip/ + if [[ "$url" =~ ^https://codeload\.github\.com/([^/]+)/([^/]+)/zip/(.+)$ ]]; then + local owner="${BASH_REMATCH[1]}" repo="${BASH_REMATCH[2]}" ref="${BASH_REMATCH[3]}" + echo " curl failed; using gh api zipball for $owner/$repo@$ref" >&2 + if gh api -H "Accept: application/octet-stream" \ + "repos/${owner}/${repo}/zipball/${ref}" > "$dest.part"; then + mv "$dest.part" "$dest" + return 0 + fi + rm -f "$dest.part" + fi + return 1 +} + + +echo "::group::Fetching PR metadata" +echo "Resolving PR metadata via gh: $PR_URL" >&2 +if ! fetch_pr_metadata "$PR_URL"; then + echo "::error ::Error: failed to resolve PR details via gh" >&2 + exit 2 +fi +echo "::endgroup::" + + +echo "::group::Getting thecrawl" +CRAWLER_REPO=$(setup_thecrawl "$THECRAWL" --target "${GITHUB_WORKSPACE:-$PWD}/.thecrawl") +if [[ ! -d "$CRAWLER_REPO" ]]; then + echo "::error ::Error: could not find or clone thecrawl" >&2 + exit 2 +fi +echo "Using thecrawl at: $CRAWLER_REPO" >&2 +echo "::endgroup::" + +TMPDIR=$(mktemp -d) +trap 'rm -rf "$TMPDIR"' EXIT + +BASE_REG="$TMPDIR/base_registry.json" +HEAD_REG="$TMPDIR/head_registry.json" + +echo "::group::Generating base registry…" >&2 +(cd "$CRAWLER_REPO" && uv run -m scripts.generate_registry -c "$BASE_URL" -o "$BASE_REG") +echo "::endgroup::" + +echo "::group::Generating target registry…" >&2 +(cd "$CRAWLER_REPO" && uv run -m scripts.generate_registry -c "$HEAD_URL" -o "$HEAD_REG") +echo "::endgroup::" + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# Invoke Python diff to print results and collect changed+added package names +PKGS=() +while IFS= read -r -d '' __pkg; do + PKGS+=("$__pkg") +done < <(python3 "$SCRIPT_DIR/diff_repository.py" --base-file "$BASE_REG" --target-file "$HEAD_REG" -z) + +if [[ ${#PKGS[@]} -eq 0 ]]; then + echo "::notice ::No changed or added packages to crawl." >&2 + exit 0 +fi + +echo "Crawling ${#PKGS[@]} package(s) from target registry…" >&2 +failures=0 +for pkg in "${PKGS[@]}"; do + echo "::group::Crawling: $pkg" >&2 + # Use workspace file output for robust parsing + wsdir="$TMPDIR/workspaces" + mkdir -p "$wsdir" + wsfile="$wsdir/${pkg}.json" + echo "Workspace file is $wsfile" >&2 + set +e + (cd "$CRAWLER_REPO" && uv run -m scripts.crawl --registry "$HEAD_REG" --workspace "$wsfile" --name "$pkg" 2> >(cat >&2)) + STATUS=$? + set -e + if [[ $STATUS -ne 0 || ! -s "$wsfile" ]]; then + echo "::error ::! 
Crawl failed for $pkg" >&2 + failures=$((failures+1)) + continue + fi + + # Extract release URLs (and versions) from workspace + RELS=() + while IFS= read -r -d '' __rec; do + RELS+=("$__rec") + done < <(python3 "$SCRIPT_DIR/parse_workspace.py" "$wsfile" "$pkg" -z) + if [[ ${#RELS[@]} -eq 0 ]]; then + echo "::error ::! No releases found for $pkg" >&2 + failures=$((failures+1)) + continue + fi + echo "::endgroup::" + + i=0 + for rec in "${RELS[@]}"; do + i=$((i+1)) + url="${rec%%$'\t'*}" + ver="${rec#*$'\t'}" + + if [[ -z "$url" ]]; then + echo "::error ::! Missing release URL for $pkg release #$i" >&2 + failures=$((failures+1)) + continue + fi + + # if no tab present, ver==url; fix that + if [[ "$ver" == "$url" ]]; then ver=""; fi + if [[ -z "$ver" ]]; then + echo "::warning ::Could not extract a version for $pkg release #$i (url: $url); using r$i" >&2 + ver="r$i" + fi + # sanitize for filesystem path + safe_ver=$(printf '%s' "$ver" | sed 's/[^A-Za-z0-9._-]/_/g') + + workdir="$TMPDIR/review/$pkg/$safe_ver" + mkdir -p "$workdir" + + zipfile="$workdir/pkg.zip" + echo "::group::Downloading $pkg-$ver" >&2 + echo " Downloading release $ver: $url" >&2 + if ! download_zip "$url" "$zipfile"; then + echo "::error ::! Download failed for $pkg@$ver" >&2 + failures=$((failures+1)) + continue + fi + + echo " Unpacking…" >&2 + # Prefer unzip; fallback to Python zipfile + if command -v unzip >/dev/null 2>&1; then + if ! unzip -q -o "$zipfile" -d "$workdir"; then + echo "::error ::! Unzip failed for $pkg@$ver" >&2 + failures=$((failures+1)) + continue + fi + else + echo "::notice ::unzip not available; falling back to use Python." + python3 - "$zipfile" "$workdir" <<'PY' +import sys, zipfile, os +zf = zipfile.ZipFile(sys.argv[1]) +zf.extractall(sys.argv[2]) +PY + if [[ $? -ne 0 ]]; then + echo "::error ::! Unzip failed for $pkg@$ver (Python)" >&2 + failures=$((failures+1)) + continue + fi + fi + + # Determine the top-level extracted directory + topdir=$(find "$workdir" -mindepth 1 -maxdepth 1 -type d | head -n1) + if [[ -z "$topdir" ]]; then + echo "::error ::! Could not locate extracted folder for $pkg@$ver" >&2 + failures=$((failures+1)) + continue + fi + echo "::endgroup::" + + echo "::group::Reviewing $pkg-$safe_ver" >&2 + echo " Reviewing with st_package_reviewer: $topdir" >&2 + ROOT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" + if ! (cd "$ROOT_DIR" && uv run st_package_reviewer "$topdir") | awk ' + /^Reporting [0-9]+ failures:/ { mode = "error"; next } + /^Reporting [0-9]+ warnings:/ { mode = "warning"; next } + /^- / && mode { + sub(/^- /, ""); + print "::" mode " title=CHECK ::" $0; + next; + } + { mode = ""; print } + '; + then + echo " ! Review failed for $pkg@$ver" >&2 + failures=$((failures+1)) + continue + fi + echo "::endgroup::" + done +done + +if [[ $failures -gt 0 ]]; then + echo "::error ::Completed with $failures failure(s)." >&2 + exit 1 +else + echo "::notice title=PASS ::Completed successfully." >&2 + exit 0 +fi diff --git a/gh_action/action.yml b/gh_action/action.yml new file mode 100644 index 0000000..3305040 --- /dev/null +++ b/gh_action/action.yml @@ -0,0 +1,25 @@ +name: Package Reviewer PR action +description: Review a repository or channel PR +inputs: + pr: + description: Pull Request URL (e.g. 
https://github.com/wbond/package_control_channel/pull/123) + required: true + file: + description: Path to channel JSON within the repo + default: repository.json + required: false + thecrawl: + description: "Optional path to a local thecrawl repo, or a URL to clone (supports @ref to pin)" + default: https://github.com/packagecontrol/thecrawl + required: false +runs: + using: composite + steps: + - name: Ensure uv is available + uses: astral-sh/setup-uv@v3 + + - name: Run Package Reviewer + shell: bash + run: | + set -euo pipefail + "${{ github.action_path }}/action.sh" --pr "${{ inputs.pr }}" --file "${{ inputs.file }}" --thecrawl "${{ inputs.thecrawl }}" diff --git a/gh_action/diff_repository.py b/gh_action/diff_repository.py new file mode 100644 index 0000000..06de584 --- /dev/null +++ b/gh_action/diff_repository.py @@ -0,0 +1,145 @@ +import argparse +import json +import sys + + +""" +Tooling: diff two registry files and print added/changed/removed names. + +""" + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +def main(argv=None) -> int: + parser = argparse.ArgumentParser( + description="Diff two registry files and report added/changed/removed package names.", + ) + parser.add_argument('--base-file', required=True, help='Path to base registry JSON to diff') + parser.add_argument('--target-file', required=True, help='Path to target registry JSON to diff') + parser.add_argument( + '-z', + action='store_true', + help='Separate entries with NUL (\\0) instead of newlines') + + args = parser.parse_args(argv) + + with open(args.base_file, 'r', encoding='utf-8') as f: + base_data = json.load(f) + with open(args.target_file, 'r', encoding='utf-8') as f: + target_data = json.load(f) + + base_map = extract_registry_map(base_data) + target_map = extract_registry_map(target_data) + + base_names = set(base_map.keys()) + target_names = set(target_map.keys()) + + removed = sorted(base_names - target_names) + added = sorted(target_names - base_names) + + common = base_names & target_names + changed = [] + for name in common: + b = base_map.get(name) + t = target_map.get(name) + if _normalize_package(b) != _normalize_package(t): + changed.append(name) + changed.sort() + + # Print summary to stderr + eprint( + "::notice title=CHANGES ::" + f"Removed {_format_oxford_list(removed)}, " + f"changed {_format_oxford_list(changed)}, " + f"added {_format_oxford_list(added)}." + ) + + if args.z: + for name in changed + added: + print(name, end="\0") + + return 0 + + +def extract_registry_map(registry_json) -> dict: + """Return mapping name->package for the expected registry schema. + + Expected structure: + { + "repositories": [ ... ], + "packages": [ {"name": ...}, ... ] + } + """ + if not isinstance(registry_json, dict): + eprint("::error ::registry JSON must be an object with 'packages' list") + sys.exit(2) + + pkgs = registry_json.get('packages') + if not isinstance(pkgs, list): + eprint("::error ::registry JSON must contain 'packages' as a list") + sys.exit(2) + + result = {} + for item in pkgs: + if isinstance(item, dict) and 'name' in item: + result[item['name']] = item + return result + + +def _format_oxford_list(items): + n = len(items) + if n == 0: + return "(none)" + if n == 1: + return items[0] + if n == 2: + return f"{items[0]} and {items[1]}" + return ", ".join(items[:-1]) + f", and {items[-1]}" + + +def _normalize_package(pkg: dict) -> dict: + """Return a normalized copy of a package for meaningful comparisons. + + - Drop ephemeral fields that differ across refs (e.g. 
'source'). + - Sort list fields where ordering is not semantically meaningful. + - Sort release entries for stable ordering. + """ + if not isinstance(pkg, dict): + return pkg + + # Shallow copy, "source" is a URL that can include a commit sha + norm = {k: v for k, v in pkg.items() if k != 'source'} + + # Normalize simple list fields + for key in ('labels', 'previous_names'): + if key in norm and isinstance(norm[key], list): + try: + norm[key] = sorted(norm[key]) + except TypeError: + # Mixed types; fall back to stringified sort + norm[key] = sorted(norm[key], key=lambda x: str(x)) + + # Normalize releases: sort entries by stable key + if 'releases' in norm and isinstance(norm['releases'], list): + rels = norm['releases'] + + # Brutally convert a complete dict to a key + def rel_key(d): + if isinstance(d, dict): + # JSON string as a stable key + try: + return json.dumps(d, sort_keys=True) + except Exception: + return str(d) + return str(d) + + norm['releases'] = sorted(rels, key=rel_key) + + return norm + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/gh_action/parse_workspace.py b/gh_action/parse_workspace.py new file mode 100644 index 0000000..aabdc13 --- /dev/null +++ b/gh_action/parse_workspace.py @@ -0,0 +1,54 @@ +import argparse +import json + + +""" +Tooling: given a workspace file and a package name extract all url/version pairs from the releases. + +""" + + +def main(argv=None) -> int: + p = argparse.ArgumentParser( + description=( + "Print release (url, version) pairs from a workspace file" + " for a specific package" + ) + ) + p.add_argument("workspace", help="Path to workspace JSON") + p.add_argument("name", help="Package name to extract") + p.add_argument( + "-z", + action="store_true", + help="Separate entries with NUL (\\0) instead of newlines", + ) + args = p.parse_args(argv) + + try: + with open(args.workspace, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception: + return 1 + + pkgs = data.get("packages", {}) + pkg = pkgs.get(args.name) + if not isinstance(pkg, dict): + return 2 + + releases = pkg.get("releases", []) + for rel in releases: + if not isinstance(rel, dict): + continue + url = rel.get("url") + if not url: + continue + ver = rel.get("version", "") + if args.z: + print(f"{url}\t{ver}", end="\0") + else: + print(f"{url}\t{ver}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())
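
For context, the glue between `scripts.crawl`, `parse_workspace.py`, and `action.sh` is the per-package workspace JSON. Below is a minimal sketch of that contract, assuming a repo-root working directory; the package name and release URL are hypothetical, and the fields shown are simply the ones `parse_workspace.py` reads (`packages.<name>.releases[].url` / `version`), so real workspace files produced by `scripts.crawl` may carry additional keys.

```python
import json
import subprocess
import sys
import tempfile

# Hypothetical workspace content; only the fields parse_workspace.py reads are shown:
# data["packages"][<name>]["releases"] -> list of {"url": ..., "version": ...}.
workspace = {
    "packages": {
        "ExamplePackage": {
            "releases": [
                {
                    "url": "https://codeload.github.com/owner/ExamplePackage/zip/v1.2.3",
                    "version": "1.2.3",
                },
            ],
        },
    },
}

# Write the sample workspace to a temporary file.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(workspace, f)
    path = f.name

# Prints one "url<TAB>version" record per release for the named package.
subprocess.run(
    [sys.executable, "gh_action/parse_workspace.py", path, "ExamplePackage"],
    check=True,
)
```

When `action.sh` invokes the same script it adds `-z`, so each `url<TAB>version` record is NUL-terminated and can be read safely with `while IFS= read -r -d ''`, which is why the release loop tolerates arbitrary characters in URLs and version strings.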