andrewfulton9 · andrewfulton9 · May 7, 2025 · May 12, 2025 · May 13, 2025 · Jun 12, 2025
diff --git a/.bazelrc b/.bazelrc
@@ -1,11 +1,14 @@
 # Needed to work with ZetaSQL dependency.
+# ZetaSQL has since been removed from the build,
+# so this flag is a candidate for removal.
 build --cxxopt="-std=c++17"
 
 # Needed to avoid zetasql proto error.
+# ZetaSQL has since been removed from the build,
+# so this flag is a candidate for removal.
 build --protocopt=--experimental_allow_proto3_optional
 
 # icu@: In create_linking_context: in call to create_linking_context(),
 # parameter 'user_link_flags' is deprecated and will be removed soon.
 # It may be temporarily re-enabled by setting --incompatible_require_linker_input_cc_api=false
 build --incompatible_require_linker_input_cc_api=false
-
diff --git a/.bazelversion b/.bazelversion
@@ -1 +1 @@
-6.5.0
+6.5.0
diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml
@@ -0,0 +1,21 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+     branches: [master]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4.1.7
+      with:
+        # Ensure the full history is fetched
+        # This is required to run pre-commit on a specific set of commits
+        # TODO: Remove this when all the pre-commit issues are fixed
+        fetch-depth: 0
+    - uses: actions/setup-python@v5.1.1
+      with:
+        python-version: 3.13
+    - uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -29,7 +29,9 @@ jobs:
     - name: Install built wheel
       shell: bash
       run: |
-        pip install dist/*.whl['test']
+        PYTHON_VERSION_TAG="cp$(echo ${{ matrix.python-version }} | sed 's/\.//')"
+        WHEEL_FILE=$(ls dist/*${PYTHON_VERSION_TAG}*.whl)
+        pip install "${WHEEL_FILE}[test]"
 
     - name: Run Test
       run: |

diff --git a/.gitignore b/.gitignore
@@ -126,4 +126,4 @@ dmypy.json
 .pyre/
 
 # pb2.py files
-*_pb2.py
+*_pb2.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+# pre-commit is a tool to perform a predefined set of tasks manually and/or
+# automatically before git commits are made.
+#
+# Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
+#
+# Common tasks
+#
+# - Register git hooks: pre-commit install --install-hooks
+# - Run on all files:   pre-commit run --all-files
+#
+# These pre-commit hooks are run as CI.
+#
+# NOTE: if it can be avoided, add configs/args in pyproject.toml or below instead of creating a new `.config.file`.
+# https://pre-commit.ci/#configuration
+ci:
+  autoupdate_schedule: monthly
+  autofix_commit_msg: |
+    [pre-commit.ci] Apply automatic pre-commit fixes
+
+repos:
+  # general
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: end-of-file-fixer
+        exclude: '\.svg$|\.patch$'
+      - id: trailing-whitespace
+        exclude: '\.svg$|\.patch$'
+      - id: check-json
+      - id: check-yaml
+        args: [--allow-multiple-documents, --unsafe]
+      - id: check-toml
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.5.6
+    hooks:
+      - id: ruff
+        args: ["--fix"]
+      - id: ruff-format
diff --git a/README.md b/README.md
@@ -238,4 +238,3 @@ tag.
   * [TensorFlow Data Validation PyPI](https://pypi.org/project/tensorflow-data-validation/)
   * [TensorFlow Data Validation Paper](https://mlsys.org/Conferences/2019/doc/2019/167.pdf)
   * [TensorFlow Data Validation Slides](https://conf.slac.stanford.edu/xldb2018/sites/xldb2018.conf.slac.stanford.edu/files/Tues_09.45_NeoklisPolyzotis_Data%20Analysis%20and%20Validation%20(1).pdf)
-
diff --git a/WORKSPACE b/WORKSPACE
@@ -71,18 +71,6 @@ http_archive(
     ],
 )
 
-# Needed by abseil-py by zetasql.
-http_archive(
-    name = "six_archive",
-    build_file = "//third_party:six.BUILD",
-    sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
-    strip_prefix = "six-1.10.0",
-    urls = [
-        "http://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
-        "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
-    ],
-)
-
 load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
 
 protobuf_deps()
@@ -112,6 +100,16 @@ http_archive(
     url = "https://github.com/abseil/abseil-cpp/archive/%s.tar.gz" % COM_GOOGLE_ABSL_COMMIT,
 )
 
+
+# re2 required for google tests
+http_archive(
+    name = "com_googlesource_code_re2",
+    #    build_file = "//third_party:re2.BUILD",
+    sha256 = "b90430b2a9240df4459108b3e291be80ae92c68a47bc06ef2dc419c5724de061",
+    strip_prefix = "re2-a276a8c738735a0fe45a6ee590fe2df69bcf4502",
+    urls = ["https://github.com/google/re2/archive/a276a8c738735a0fe45a6ee590fe2df69bcf4502.tar.gz"],
+)
+
 # Will be loaded by workspace.bzl from head
 # TFMD_COMMIT = "404805761e614561cceedc429e67c357c62be26d"  # 1.17.1
 
@@ -218,46 +216,6 @@ load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies")  #, "go_repository")
 
 gazelle_dependencies()
 
-################################################################################
-# ZetaSQL                                                                      #
-################################################################################
-
-ZETASQL_COMMIT = "a516c6b26d183efc4f56293256bba92e243b7a61"  # 11/01/2024
-
-http_archive(
-    name = "com_google_zetasql",
-    patch_args = ["-p1"],
-    patches = ["//third_party:zetasql.patch"],
-    sha256 = "1afc2210d4aad371eff0a6bfdd8417ba99e02183a35dff167af2fa6097643f26",
-    strip_prefix = "zetasql-%s" % ZETASQL_COMMIT,
-    urls = ["https://github.com/google/zetasql/archive/%s.tar.gz" % ZETASQL_COMMIT],
-)
-
-load("@com_google_zetasql//bazel:zetasql_deps_step_1.bzl", "zetasql_deps_step_1")
-
-zetasql_deps_step_1()
-
-load("@com_google_zetasql//bazel:zetasql_deps_step_2.bzl", "zetasql_deps_step_2")
-
-zetasql_deps_step_2(
-    analyzer_deps = True,
-    evaluator_deps = True,
-    java_deps = False,
-    testing_deps = False,
-    tools_deps = False,
-)
-
-# No need to run zetasql_deps_step_3 and zetasql_deps_step_4 since all necessary dependencies are
-# already installed.
-
-# load("@com_google_zetasql//bazel:zetasql_deps_step_3.bzl", "zetasql_deps_step_3")
-
-# zetasql_deps_step_3()
-
-# load("@com_google_zetasql//bazel:zetasql_deps_step_4.bzl", "zetasql_deps_step_4")
-
-# zetasql_deps_step_4()
-
 _PLATFORMS_VERSION = "0.0.6"
 
 http_archive(

diff --git a/g3doc/custom_data_validation.md b/g3doc/custom_data_validation.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,3 +19,130 @@ requires = [
   # Required for using org_tensorflow bazel repository.
   "numpy~=1.22.0",
 ]
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    "W",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # pep8 naming
+    "N",
+    # pydocstyle
+    "D",
+    # annotations
+    "ANN",
+    # debugger
+    "T10",
+    # flake8-pytest
+    "PT",
+    # flake8-return
+    "RET",
+    # flake8-unused-arguments
+    "ARG",
+    # flake8-fixme
+    "FIX",
+    # flake8-eradicate
+    "ERA",
+    # pandas-vet
+    "PD",
+    # numpy-specific rules
+    "NPY",
+]
+
+ignore = [
+    "D104",   # Missing docstring in public package
+    "D100",   # Missing docstring in public module
+    "D211",   # No blank line before class
+    "PD901",  # Avoid using 'df' for pandas dataframes. Perfectly fine in functions with limited scope
+    "ANN201", # Missing return type annotation for public function (makes no sense for NoneType return types...)
+    "ANN101", # Missing type annotation for `self`
+    "ANN204", # Missing return type annotation for special method
+    "ANN002", # Missing type annotation for `*args`
+    "ANN003", # Missing type annotation for `**kwargs`
+    "D105",   # Missing docstring in magic method
+    "D203",   # 1 blank line required before class docstring
+    "D204",   # 1 blank line required after class docstring
+    "D413",   # Missing blank line after last section
+    "SIM108", # Simplify if/else to one line; not always clearer
+    "D206",   # Docstrings should be indented with spaces; unnecessary when running ruff-format
+    "E501",   # Line length too long; unnecessary when running ruff-format
+    "W191",   # Indentation contains tabs; unnecessary when running ruff-format
+
+    # REMOVE AFTER FIXING
+    # ANN rules (flake8-annotations)
+    "ANN001", # Missing type annotation for function argument `args`
+    "ANN102", # Missing type annotation for `cls` in classmethod
+    "ANN202", # Missing return type annotation for private function
+    "ANN205", # Missing return type annotation for staticmethod
+    "ANN206", # Missing return type annotation for classmethod
+    "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in `domain`
+    # ARG rules (flake8-unused-arguments)
+    "ARG001", # Unused function argument
+    "ARG002", # Unused method argument
+    # B rules (flake8-bugbear)
+    "B005",   # Using `.strip()` with multi-character strings is misleading
+    "B007",   # Loop control variable not used within loop body
+    "B008",   # Do not perform function call in argument defaults; instead, perform the call within the function, or read the default from a module-level singleton variable
+    "B904",   # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
+    # D rules (pydocstyle)
+    "D101",   # Missing docstring in public class
+    "D102",   # Missing docstring in public method
+    "D103",   # Missing docstring in public function
+    "D107",   # Missing docstring in `__init__`
+    "D401",   # First line of docstring should be in imperative mood: "Loads the vocabulary from the specified path."
+    "D404",   # First word of the docstring should not be "This"
+    "D417",   # Missing argument descriptions in the docstring
+    # E rules (pycodestyle)
+    "E731",   # Do not assign a `lambda` expression, use a `def`
+    "E741",   # Ambiguous variable name
+    # ERA rules (flake8-eradicate)
+    "ERA001", # Found commented-out code
+    # F rules (Pyflakes)
+    "F821",   # Undefined name
+    # FIX rules (flake8-fixme)
+    "FIX002", # Line contains TODO, consider resolving the issue
+    # N rules (pep8-naming)
+    "N802",   # Function name should be lowercase
+    # NPY rules (numpy-specific rules)
+    "NPY002", # Replace legacy `np.random.{method_name}` call with `np.random.Generator`
+    # PD rules (pandas-vet)
+    "PD002",  # `inplace=True` should be avoided; it has inconsistent behavior
+    "PD003",  # `.isna` is preferred to `.isnull`; functionality is equivalent
+    "PD011",  # Use `.to_numpy()` instead of `.values`
+    "PD015",  # Use `.merge` method instead of `pd.merge` function
+    # PT rules (flake8-pytest-style)
+    "PT009",  # Use a regular `assert` instead of unittest-style `assertEqual`
+    "PT018",  # Assertion should be broken down into multiple parts
+    "PT027",  # Use `pytest.raises` instead of unittest-style `assertRaisesRegex`
+    # RET rules (flake8-return)
+    "RET504", # Unnecessary assignment to variable before `return` statement
+    "RET505", # Unnecessary `elif` after `return` statement
+    # SIM rules (flake8-simplify)
+    "SIM101", # Multiple `isinstance` calls for `maybe_collection`, merge into a single call
+    "SIM102", # Use a single `if` statement instead of nested `if` statements
+    "SIM103", # Return the condition directly
+    "SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
+    "SIM117", # Use a single `with` statement with multiple contexts instead of nested `with` statements
+    "SIM211", # Use `not ...` instead of `False if ... else True`
+    # UP rules (pyupgrade)
+    "UP008",  # Use `super()` instead of `super(__class__, self)`
+    "UP028",  # Replace `yield` over `for` loop with `yield from`
+    "UP031",  # Use format specifiers instead of percent format
+]
+
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -238,4 +238,3 @@ tag.
		* [TensorFlow Data Validation PyPI](https://pypi.org/project/tensorflow-data-validation/)
		* [TensorFlow Data Validation Paper](https://mlsys.org/Conferences/2019/doc/2019/167.pdf)
		* [TensorFlow Data Validation Slides](https://conf.slac.stanford.edu/xldb2018/sites/xldb2018.conf.slac.stanford.edu/files/Tues_09.45_NeoklisPolyzotis_Data%20Analysis%20and%20Validation%20(1).pdf)