fix: Fix custom pre-commit hook

jpmckinney · jpmckinney · commit 8f601d7983cd · 2024-10-10T17:29:49.000-04:00
This command was being run by pre-commit as, for example, `pre-commit run nbqa --all-files` or `pre-commit run nbqa --files ...`.

Given the configuration in .pre-commit-config.yaml, this runs: `nbqa manage FILE ...`

nbqa creates a subprocess, replacing the original files with temporary files. (It then rewrites stdout and stderr to
replace the temporary filenames with the original filenames, making this behavior hard to observe.)

The temporary files are transformations of the original files, in which magic cells are removed. This makes sense for
other linters, which operate on Python code.

Since we want to operate on SQL magics, nbqa is an inappropriate entry point for this pre-commit hook. Run `./manage.py` directly as a local hook instead.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -26,7 +26,7 @@ jobs:
       - uses: pre-commit/action@v3.0.1
         continue-on-error: true
         with:
-          extra_args: nbqa --files ${{ steps.changed-files.outputs.all_changed_files }}
+          extra_args: local --files ${{ steps.changed-files.outputs.all_changed_files }}
       - if: ${{ env.PAT }}
         uses: stefanzweifel/git-auto-commit-action@v5
         with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,8 +24,10 @@ repos:
     rev: 1.8.7
     hooks:
       - id: nbqa-black
-      - id: nbqa
-        entry: nbqa manage
-        name: nbqa-manage
-        alias: nbqa-manage
+  - repo: local
+    hooks:
+      - id: local
+        name: local
+        language: python
+        entry: ./manage.py
         additional_dependencies: [click, jsonschema, nbmerge, nbformat, sqlfluff]
diff --git a/manage.py b/manage.py
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
 import json
-import os
 from pathlib import Path
 
 import click
 import jsonschema
+import nbformat
 import sqlfluff
-from nbformat import write as write_notebook
 from nbmerge import merge_notebooks
 from sqlfluff.core import FluffConfig
 
@@ -109,80 +108,72 @@ def __init__(self, filename):
         super().__init__(f"{filename} is invalid")
 
 
-def yield_notebooks():
-    for entry in os.scandir(BASEDIR):
-        if not entry.name.endswith(".ipynb"):
-            continue
+def json_dump(path, notebook):
+    with path.open("w") as f:
+        # Use indent=2 like Google Colab for small diffs.
+        json.dump(notebook, f, ensure_ascii=False, indent=2)
+        f.write("\n")
 
-        path = Path(entry.path)
-        with path.open() as f:
-            try:
-                notebook = json.load(f)
-            except json.decoder.JSONDecodeError as e:
-                raise InvalidNotebookError(path) from e
 
-        yield entry.name, path, notebook
+def json_load(path):
+    with path.open() as f:
+        try:
+            return json.load(f)
+        except json.decoder.JSONDecodeError as e:
+            raise InvalidNotebookError(path) from e
 
 
-def yield_cells(notebook):
-    for cell in notebook["cells"]:
-        if cell["cell_type"] != "code":
-            continue
+@click.command()
+@click.argument("filename", nargs=-1, type=click.Path(exists=True, dir_okay=False, path_type=Path))
+def pre_commit(filename):
+    """Format SQL cells in Jupyter Notebooks and merge components to build notebooks."""
+    nonzero = False
 
-        source = cell["source"]
-        if "%%sql" not in source[0]:
-            continue
+    filenames = [path for path in filename if path.name.startswith("component_")]
 
-        sql = "".join(source[1:])
+    for path in filenames:
+        notebook = json_load(path)
 
-        fix = sqlfluff.fix(sql, config=FLUFF_CONFIG)
-        for warning in sqlfluff.lint(fix, config=FLUFF_CONFIG):
-            click.secho(f"{warning['code']}:{warning['name']} {warning['description']}", fg="yellow")
-            click.echo(fix[:warning['start_file_pos']], nl=False)
-            click.secho(fix[warning['start_file_pos']:warning['end_file_pos']], fg="red", nl=False)
-            click.echo(fix[warning['end_file_pos']:])
+        for cell in notebook["cells"]:
+            if cell["cell_type"] != "code":
+                continue
 
-        yield source, cell, sql, fix
+            source = cell["source"]
+            if "%%sql" not in source[0]:
+                continue
 
+            fix = sqlfluff.fix("".join(source[1:]), config=FLUFF_CONFIG)
+            cell["source"] = [source[0], "\n", *fix.splitlines(keepends=True)]
 
-def build_notebook(slug):
-    try:
-        notebook = merge_notebooks(BASEDIR, [f"{c}.ipynb" for c in NOTEBOOKS[slug]], verbose=False)
-        notebook["metadata"]["colab"]["name"] = slug
-    except jsonschema.exceptions.ValidationError as e:
-        raise InvalidNotebookError(f"{slug}.ipynb") from e
-    else:
-        return notebook
+            warnings = sqlfluff.lint(fix, config=FLUFF_CONFIG)
+            nonzero |= bool(warnings)
 
+            for warning in warnings:
+                click.secho(f"{warning['code']}:{warning['name']} {warning['description']}", fg="yellow")
+                click.echo(fix[:warning['start_file_pos']], nl=False)
+                click.secho(fix[warning['start_file_pos']:warning['end_file_pos']], fg="red", nl=False)
+                click.echo(fix[warning['end_file_pos']:])
 
-def json_dump(path, notebook):
-    with path.open("w") as f:
-        # Use indent=2 like Google Colab for small diffs.
-        json.dump(notebook, f, ensure_ascii=False, indent=2)
-        f.write("\n")
+        json_dump(path, notebook)
 
+    for slug, components in NOTEBOOKS.items():
+        if any(path.stem in components for path in filenames):
+            template_path = Path(f"{slug}.ipynb")
+            with template_path.open("w", encoding="utf8") as f:
+                try:
+                    notebook = merge_notebooks(BASEDIR, [f"{c}.ipynb" for c in NOTEBOOKS[slug]], verbose=False)
+                    notebook["metadata"]["colab"]["name"] = slug
+                except jsonschema.exceptions.ValidationError as e:
+                    raise InvalidNotebookError(f"{slug}.ipynb") from e
+                else:
+                    nbformat.write(notebook, f)
 
-@click.command()
-@click.argument("filename", nargs=-1, type=click.Path(exists=True, dir_okay=False, path_type=Path))
-def pre_commit(filename):
-    """Format SQL cells in Jupyter Notebooks and merge components to build notebooks."""
-    resolved = [path.resolve() for path in filename]
-
-    for _, filepath, notebook in yield_notebooks():
-        if not resolved or filepath.resolve() in resolved:
-            for source, cell, _, sql_formatted in yield_cells(notebook):
-                cell["source"] = [source[0], "\n", *sql_formatted.splitlines(keepends=True)]
-
-        json_dump(filepath, notebook)
-
-    for slug in NOTEBOOKS:
-        filepath = Path(f"{slug}.ipynb")
-        with filepath.open("w", encoding="utf8") as f:
-            write_notebook(build_notebook(slug), f)
-        # nbformat uses indent=1.
-        with filepath.open() as f:
-            notebook = json.load(f)
-        json_dump(filepath, notebook)
+            # nbformat.write() uses indent=1. Rewrite with indent=2 like Google Colab.
+            # https://github.com/jupyter/nbformat/blob/ba2c6f5/nbformat/v4/nbjson.py#L51
+            json_dump(template_path, json_load(template_path))
+
+    if nonzero:
+        raise click.Abort("error")
 
 
 if __name__ == "__main__":