Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add pre-commit config for airbyte-python-cdk #270

Merged
merged 12 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
exclude: |
(?x)(
# Python/system files
^.*/__init__\.py$|
^.*?/\.venv/.*$|
^.*?/node_modules/.*$|
^.*?/\.ruff_cache/.*$|

# Package management
^.*?/poetry\.lock$|
^.*?/package-lock\.json$|
^.*?/pnpm-lock\.yaml$|

# Build and test artifacts
^.*?/build/.*$|
^.*?/dist/.*$|
^.*?/\.pytest_cache/.*$|
^.*?/\.coverage$|
^.*?/coverage\.xml$|
^.*?/\.mypy_cache/.*$
)

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-toml

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.3
hooks:
# Run the linter with repo-defined settings
- id: ruff
args: [--fix]

# Run the formatter with repo-defined settings
- id: ruff-format

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.3
hooks:
- id: prettier
types_or: [json, yaml]
additional_dependencies:
- prettier@3.0.3

- repo: local
hooks:
- id: addlicense
name: Add license headers
entry: addlicense -c "Airbyte, Inc." -l apache -v -f LICENSE_SHORT
language: golang
additional_dependencies: [github.com/google/addlicense@v1.1.1]
files: \.py$
1 change: 1 addition & 0 deletions LICENSE_SHORT
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Copyright (c) 2025 Airbyte, Inc., all rights reserved.
272 changes: 126 additions & 146 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,202 +1,182 @@
[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
build-backend = "poetry_dynamic_versioning.backend"
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]

[tool.airbyte_ci]
mount_docker_socket = true
optional_poetry_groups = ["dev"]
poe_tasks = ["check-ci"]
poetry_extras = ["file-based", "vector-db-based"]
python_versions = ["3.10", "3.11"]

[tool.check-wheel-contents]
# Quality control for Python wheel generation. Docs here:
# - https://github.com/jwodder/check-wheel-contents
ignore = [
"W002" # Duplicate files. (TODO: Fix the few duplicate files, mostly `__init__.py` files that have only copyright text.)
]

[tool.isort]
skip = ["__init__.py"] # TODO: Remove after this is fixed: https://github.com/airbytehq/airbyte-python-cdk/issues/12

[tool.poe.tasks]
_format-check-prettier = {cmd = "npx prettier . --check", help = "Check formatting with prettier."}
# Format check tasks
_format-check-ruff = {cmd = "ruff format --check .", help = "Check formatting with Ruff."}
_format-fix-prettier = {cmd = "npx prettier . --write", help = "Format with prettier."}
# Format fix tasks
_format-fix-ruff = {cmd = "ruff format .", help = "Format with Ruff."}
# Linting/Typing check tasks
_lint-ruff = {cmd = "poetry run ruff check .", help = "Lint with Ruff."}
# Build tasks
assemble = {cmd = "bin/generate-component-manifest-dagger.sh", help = "Generate component manifest files."}
build = {help = "Run all tasks to build the package.", sequence = ["assemble", "build-package"]}
build-package = {cmd = "poetry build", help = "Build the python package: source and wheels archives."}
check-all = {help = "Lint, format, and type-check modified files.", ignore_fail = "return_non_zero", sequence = ["check-lockfile", "format-check", "lint", "type-check"]}
check-ci = {help = "Build the package, lint and run unit tests. Does not include type-checking.", sequence = ["build", "check-lockfile", "lint", "unit-test-with-cov"]}
# TODO: find a version of the modified mypy check that works both locally and in CI.
check-local = {help = "Lint all code, type-check modified files, and run unit tests.", sequence = ["check-lockfile", "lint", "type-check", "unit-test-with-cov"]}
# Lockfile check task
check-lockfile = {cmd = "poetry check", help = "Check the poetry lock file."}
# API Docs with PDoc
docs-generate = {cmd = "python -m docs.generate run", env = {PDOC_ALLOW_EXEC = "1"}, help = "Generate API documentation with PDoc."}
docs-preview = {help = "Generate API documentation with PDoc and then open the docs in the default web browser.", shell = "poe docs-generate && open docs/generated/index.html"}
fix-all = {help = "Lint-fix and format-fix modified files, ignoring unsafe fixes.", ignore_fail = "return_non_zero", sequence = ["format-fix", "lint-fix"]}
# Order matters here: a poe sequence runs its subtasks left to right, so the
# fixes must be applied before the re-check. Do not sort this array.
fix-and-check = {help = "Lint-fix and format-fix, then re-check to see if any issues remain.", ignore_fail = "return_non_zero", sequence = ["fix-all", "check-all"]}
format-check = {help = "Check formatting for all file types.", ignore_fail = "return_non_zero", sequence = ["_format-check-prettier", "_format-check-ruff"]}
format-fix = {help = "Format all file types.", ignore_fail = "return_non_zero", sequence = ["_format-fix-prettier", "_format-fix-ruff"]}
# Installation
install = {shell = "poetry install --all-extras"}
lint = {help = "Lint all code. Includes type checking.", ignore_fail = "return_non_zero", sequence = ["_lint-ruff", "type-check"]}
# Linting/Typing fix tasks
lint-fix = {cmd = "poetry run ruff check --fix .", help = "Auto-fix any lint issues that Ruff can automatically resolve (excluding 'unsafe' fixes)."}
lint-fix-unsafe = {cmd = "poetry run ruff check --fix --unsafe-fixes .", help = "Lint-fix modified files, including 'unsafe' fixes. It is recommended to first commit any pending changes and then always manually review any unsafe changes applied."}
# Build and check
pre-push = {help = "Run all build and check tasks.", sequence = ["build", "check-local"]}
pytest = {cmd = "poetry run coverage run -m pytest --durations=10", help = "Run all pytest tests."}
pytest-fast = {cmd = "poetry run coverage run -m pytest --durations=5 --exitfirst -m 'not flaky and not slow and not requires_creds'", help = "Run pytest tests, failing fast and excluding slow tests."}
type-check = {cmd = "poetry run mypy airbyte_cdk", help = "Type check modified files with mypy."}
unit-test-with-cov = {cmd = "pytest -s unit_tests --cov=airbyte_cdk --cov-report=term --cov-config ./pyproject.toml", help = "Run unit tests and create a coverage report."}

[tool.poetry]
name = "airbyte-cdk"
description = "A framework for writing Airbyte Connectors."
authors = ["Airbyte <contact@airbyte.io>"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering",
"Topic :: Software Development :: Libraries :: Python Modules"
]
description = "A framework for writing Airbyte Connectors."
documentation = "https://docs.airbyte.io/"
homepage = "https://airbyte.com"
keywords = ["airbyte", "cdk", "connector-development-kit"]
license = "MIT"
name = "airbyte-cdk"
readme = "README.md"
homepage = "https://airbyte.com"
repository = "https://github.com/airbytehq/airbyte-python-cdk"
documentation = "https://docs.airbyte.io/"
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Topic :: Scientific/Engineering",
"Topic :: Software Development :: Libraries :: Python Modules",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.10",
]
keywords = ["airbyte", "connector-development-kit", "cdk"]

# Python CDK uses dynamic versioning: https://github.com/mtkennerly/poetry-dynamic-versioning
version = "0.0.0" # Version will be calculated dynamically.

[tool.poetry-dynamic-versioning]
enable = true
version = "0.0.0" # Version will be calculated dynamically.

[tool.poetry.dependencies]
python = "^3.10,<3.13"
Jinja2 = "~3.1.2"
PyYAML = "^6.0.1"
Unidecode = "^1.3"
airbyte-protocol-models-dataclasses = "^0.14"
# Extras dependencies
avro = {optional = true, version = "~1.11.2"}
backoff = "*"
cachetools = "*"
cohere = {optional = true, version = "4.21"}
cryptography = ">=42.0.5,<44.0.0"
dpath = "^2.1.6"
dunamai = "^1.22.0"
fastavro = {optional = true, version = "~1.8.0"}
genson = "1.3.0"
isodate = "~0.6.1"
Jinja2 = "~3.1.2"
jsonref = "~0.2"
jsonschema = "~4.17.3" # 4.18 has some significant breaking changes: https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0
jsonschema = "~4.17.3" # 4.18 has some significant breaking changes: https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0
langchain = {optional = true, version = "0.1.16"}
langchain_core = {optional = true, version = "0.1.42"}
markdown = {optional = true, version = "*"}
nltk = {optional = true, version = "3.9.1"}
# This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen.
# airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict.
numpy = "<2"
openai = {extras = ["embeddings"], optional = true, version = "0.27.9"}
orjson = "^3.10.7"
pandas = "2.2.2"
pdf2image = {optional = true, version = "1.16.3"}
"pdfminer.six" = {optional = true, version = "20221105"}
pendulum = "<3.0.0"
psutil = "6.1.0"
pyarrow = {optional = true, version = "~15.0.0"}
pydantic = "^2.7"
pyjwt = "^2.8.0"
pyrate-limiter = "~3.1.0"
pytesseract = {optional = true, version = "0.3.10"}
python = "^3.10,<3.13"
python-calamine = {optional = true, version = "0.2.3"}
python-dateutil = "*"
python-snappy = {optional = true, version = "0.7.3"}
python-ulid = "^3.0.0"
PyYAML = "^6.0.1"
pytz = "2024.2"
rapidfuzz = "^3.10.1"
requests = "*"
requests_cache = "*"
wcmatch = "10.0"
# Extras dependencies
avro = { version = "~1.11.2", optional = true }
cohere = { version = "4.21", optional = true }
fastavro = { version = "~1.8.0", optional = true }
langchain = { version = "0.1.16", optional = true }
langchain_core = { version = "0.1.42", optional = true }
markdown = { version = "*", optional = true }
openai = { version = "0.27.9", extras = ["embeddings"], optional = true }
pdf2image = { version = "1.16.3", optional = true }
"pdfminer.six" = { version = "20221105", optional = true }
pyarrow = { version = "~15.0.0", optional = true }
pytesseract = { version = "0.3.10", optional = true }
python-calamine = { version = "0.2.3", optional = true }
python-snappy = { version = "0.7.3", optional = true }
tiktoken = { version = "0.8.0", optional = true }
nltk = { version = "3.9.1", optional = true }
# This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen.
# airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict.
numpy = "<2"
unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true }
"unstructured.pytesseract" = { version = ">=0.3.12", optional = true }
pyjwt = "^2.8.0"
cryptography = ">=42.0.5,<44.0.0"
pytz = "2024.2"
orjson = "^3.10.7"
serpyco-rs = "^1.10.2"
sqlalchemy = {version = "^2.0,!=2.0.36", optional = true }
sqlalchemy = {optional = true, version = "^2.0,!=2.0.36"}
tiktoken = {optional = true, version = "0.8.0"}
unstructured = {extras = ["docx", "pptx"], optional = true, version = "0.10.27"}
"unstructured.pytesseract" = {optional = true, version = ">=0.3.12"}
wcmatch = "10.0"
xmltodict = ">=0.13,<0.15"
Unidecode = "^1.3"

[tool.poetry.extras]
file-based = ["avro", "fastavro", "markdown", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"]
sql = ["sqlalchemy"]
vector-db-based = ["cohere", "langchain", "openai", "tiktoken"]

[tool.poetry.group.dev.dependencies]
asyncio = "3.4.3"
freezegun = "*"
mypy = "*"
asyncio = "3.4.3"
ruff = "^0.7.2"
pdoc = "^15.0.0"
poethepoet = "^0.24.2"
pympler = "*"
pyproject-flake8 = "^6.1.0"
pytest = "^7"
pytest-memray = "^1.6.0"
pympler = "*"
pytest-cov = "*"
pytest-httpserver = "*"
pytest-memray = "^1.6.0"
pytest-mock = "*"
requests-mock = "*"
# Stubs packages for mypy typing
types-requests = "^2.32.0.20241016"
ruff = "^0.7.2"
types-cachetools = "^5.5.0.20240820"
types-python-dateutil = "^2.9.0.20241003"
types-pyyaml = "^6.0.12.20240917"
types-cachetools = "^5.5.0.20240820"

[tool.poetry.extras]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"]
vector-db-based = ["langchain", "openai", "cohere", "tiktoken"]
sql = ["sqlalchemy"]
# Stubs packages for mypy typing
types-requests = "^2.32.0.20241016"

[tool.poetry.scripts]

source-declarative-manifest = "airbyte_cdk.cli.source_declarative_manifest:run"

[tool.isort]
skip = ["__init__.py"] # TODO: Remove after this is fixed: https://github.com/airbytehq/airbyte-python-cdk/issues/12
[tool.poetry-dynamic-versioning]
enable = true

[tool.pytest.ini_options]
filterwarnings = [
"ignore::airbyte_cdk.sources.source.ExperimentalClassWarning"
]
log_cli = true
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_level = "INFO"

[tool.ruff]
target-version = "py310"
line-length = 100
target-version = "py310"

[tool.ruff.lint]
select = ["I"]

[tool.poe.tasks]
# Installation
install = { shell = "poetry install --all-extras" }

# Build tasks
assemble = {cmd = "bin/generate-component-manifest-dagger.sh", help = "Generate component manifest files."}
build-package = {cmd = "poetry build", help = "Build the python package: source and wheels archives."}
build = {sequence = ["assemble", "build-package"], help = "Run all tasks to build the package."}

# Format check tasks
_format-check-ruff = {cmd = "ruff format --check .", help = "Check formatting with Ruff."}
_format-check-prettier = {cmd = "npx prettier . --check", help = "Check formatting with prettier."}
format-check = {sequence = ["_format-check-ruff", "_format-check-prettier"], help = "Check formatting for all file types.", ignore_fail = "return_non_zero"}

# Format fix tasks
_format-fix-ruff = {cmd = "ruff format .", help = "Format with Ruff."}
_format-fix-prettier = {cmd = "npx prettier . --write", help = "Format with prettier."}
format-fix = {sequence = ["_format-fix-ruff", "_format-fix-prettier"], help = "Format all file types.", ignore_fail = "return_non_zero"}

# Linting/Typing check tasks
_lint-ruff = {cmd = "poetry run ruff check .", help = "Lint with Ruff."}
type-check = {cmd = "poetry run mypy airbyte_cdk", help = "Type check modified files with mypy."}
lint = {sequence = ["_lint-ruff", "type-check"], help = "Lint all code. Includes type checking.", ignore_fail = "return_non_zero"}

# Lockfile check task
check-lockfile = {cmd = "poetry check", help = "Check the poetry lock file."}

# Linting/Typing fix tasks
lint-fix = { cmd = "poetry run ruff check --fix .", help = "Auto-fix any lint issues that Ruff can automatically resolve (excluding 'unsafe' fixes)." }
lint-fix-unsafe = { cmd = "poetry run ruff check --fix --unsafe-fixes .", help = "Lint-fix modified files, including 'unsafe' fixes. It is recommended to first commit any pending changes and then always manually review any unsafe changes applied." }

# Combined Check and Fix tasks

check-all = {sequence = ["lint", "format-check", "type-check", "check-lockfile"], help = "Lint, format, and type-check modified files.", ignore_fail = "return_non_zero"}
fix-all = {sequence = ["format-fix", "lint-fix"], help = "Lint-fix and format-fix modified files, ignoring unsafe fixes.", ignore_fail = "return_non_zero"}
fix-and-check = {sequence = ["fix-all", "check-all"], help = "Lint-fix and format-fix, then re-check to see if any issues remain.", ignore_fail = "return_non_zero"}

# PyTest tasks

pytest = {cmd = "poetry run coverage run -m pytest --durations=10", help = "Run all pytest tests."}
pytest-fast = {cmd = "poetry run coverage run -m pytest --durations=5 --exitfirst -m 'not flaky and not slow and not requires_creds'", help = "Run pytest tests, failing fast and excluding slow tests."}
unit-test-with-cov = {cmd = "pytest -s unit_tests --cov=airbyte_cdk --cov-report=term --cov-config ./pyproject.toml", help = "Run unit tests and create a coverage report."}

# Combined check tasks (other)

# TODO: find a version of the modified mypy check that works both locally and in CI.
check-local = {sequence = ["lint", "type-check", "check-lockfile", "unit-test-with-cov"], help = "Lint all code, type-check modified files, and run unit tests."}
check-ci = {sequence = ["check-lockfile", "build", "lint", "unit-test-with-cov"], help = "Build the package, lint and run unit tests. Does not include type-checking."}

# Build and check
pre-push = {sequence = ["build", "check-local"], help = "Run all build and check tasks."}

# API Docs with PDoc
docs-generate = {env = {PDOC_ALLOW_EXEC = "1"}, cmd = "python -m docs.generate run", help="Generate API documentation with PDoc."}
docs-preview = {shell = "poe docs-generate && open docs/generated/index.html", help="Generate API documentation with PDoc and then open the docs in the default web browser."}

[tool.check-wheel-contents]
# Quality control for Python wheel generation. Docs here:
# - https://github.com/jwodder/check-wheel-contents
ignore = [
"W002" # Duplicate files. (TODO: Fix the few duplicate files, mostly `__init__.py` files that have only copyright text.)
]

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
filterwarnings = [
"ignore::airbyte_cdk.sources.source.ExperimentalClassWarning"
]

[tool.airbyte_ci]
python_versions = ["3.10", "3.11"]
optional_poetry_groups = ["dev"]
poetry_extras = ["file-based", "vector-db-based"]
poe_tasks = ["check-ci"]
mount_docker_socket = true