diff --git a/.gitignore b/.gitignore index 31223eb62..ea2d3b6d9 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,9 @@ docs/_build/ docs/build docs/*.tar.gz +# Sphinx mddocs +mddocs/_build/ + # PyBuilder target/ diff --git a/mkdocs/config/en/mkdocs.yml b/mkdocs/config/en/mkdocs.yml new file mode 100644 index 000000000..c883ec6b1 --- /dev/null +++ b/mkdocs/config/en/mkdocs.yml @@ -0,0 +1,142 @@ +site_name: onETL Docs +docs_dir: '../../docs/en' # Where to find the English markdown files +site_dir: '../../generated/en' # Where to put the English HTML files + +theme: + name: material + custom_dir: '../../overrides/' # This is where the customization of the theme lives + logo: assets/images/logo.svg # The logo is shared by all languages + favicon: assets/images/icon.svg # The favicon is shared by all languages + language: en + features: + - navigation.indexes + - content.tabs.link + - content.code.copy + - content.code.select + palette: + - scheme: default + toggle: + icon: material/weather-night + name: Switch to dark mode + - scheme: slate + toggle: + icon: material/weather-sunny + name: Switch to light mode + locale: en + highlightjs: true + hljs_languages: + - yaml + +extra_css: + - assets/stylesheets/autodoc_pydantic.css # CSS is shared by all languages + +extra: # Language Selection + onetl_logo_wide: "[![onETL logo](../en/assets/images/logo_wide.svg)](https://github.com/MobileTeleSystems/onetl)" + repo_status_badge: "[![Repo status - Active](https://www.repostatus.org/badges/latest/active.svg)](https://github.com/MobileTeleSystems/onetl)" + pypi_release_bage: "[![PyPI - Latest Release](https://img.shields.io/pypi/v/onetl)](https://pypi.org/project/onetl/)" + pypi_license_bage: "[![PyPI - License](https://img.shields.io/pypi/l/onetl.svg)](https://github.com/MobileTeleSystems/onetl/blob/develop/LICENSE.txt)" + pypi_pyversion_bage: "[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/onetl.svg)](https://pypi.org/project/onetl/)" + 
pypi_downloads_bage: "[![PyPI - Downloads](https://img.shields.io/pypi/dm/onetl)](https://pypi.org/project/onetl/)" + docs_status_badge: "[![Documentation - ReadTheDocs](https://readthedocs.org/projects/onetl/badge/?version=stable)](https://onetl.readthedocs.io/)" + ci_status_badge: "[![Github Actions - latest CI build status](https://github.com/MobileTeleSystems/onetl/workflows/Tests/badge.svg)](https://github.com/MobileTeleSystems/onetl/actions)" + precommit_badge: "[![pre-commit.ci Status](https://results.pre-commit.ci/badge/github/MobileTeleSystems/onetl/develop.svg)](https://results.pre-commit.ci/latest/github/MobileTeleSystems/onetl/develop)" + test_cov_badge: "[![Test coverage - percent](https://codecov.io/gh/MobileTeleSystems/onetl/branch/develop/graph/badge.svg?token=RIO8URKNZJ)](https://codecov.io/gh/MobileTeleSystems/onetl)" + alternate: + + # Switch to English + - name: English + link: /en/ + lang: en + + # Switch to Russian + # - name: Русский + # link: /ru/ + # lang: ru + +plugins: + - autorefs + - mkdocstrings: + default_handler: python + handlers: + python: + options: + show_source: false + show_root_heading: false + show_root_toc_entry: false + - macros + - plantuml: + puml_url: https://www.plantuml.com/plantuml/ + puml_keyword: plantuml + # - i18n: + # docs_structure: folder + # languages: + # - locale: en + # default: true + # name: English + # build: true + # - locale: ru + # name: Русский + # build: false + +markdown_extensions: + - attr_list + - md_in_html + - admonition + - pymdownx.details + - pymdownx.critic + - pymdownx.snippets: + base_path: ["."] + check_paths: true + - toc: + permalink: true + - pymdownx.tabbed: + alternate_style: true + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + +# nav: +# - 
"Concepts": concepts +# - "Quickstart": quickstart +# - "Logging": logging +# - "Security": security +# - "Contributing Guide": contributing +# - "Plugins": plugins +# - changelog: +# - changelog/index.md +# - "0.13.4": changelog/0.13.4 +# - "0.13.3": changelog/0.13.3 +# - "0.13.1": changelog/0.13.1 +# - "0.13.0": changelog/0.13.0 +# - "0.12.5": changelog/0.12.5 +# - "0.12.4": changelog/0.12.4 +# - "0.12.3": changelog/0.12.3 +# - "0.12.2": changelog/0.12.2 +# - "0.12.1": changelog/0.12.1 +# - "0.12.0": changelog/0.12.0 +# - "0.11.2": changelog/0.11.2 +# - "0.11.1": changelog/0.11.1 +# - "0.11.0": changelog/0.11.0 +# - "0.10.2": changelog/0.10.2 +# - "0.10.1": changelog/0.10.1 +# - "0.10.0": changelog/0.10.0 +# - "0.9.5": changelog/0.9.5 +# - "0.9.4": changelog/0.9.4 +# - "0.9.3": changelog/0.9.3 +# - "0.9.2": changelog/0.9.2 +# - "0.9.1": changelog/0.9.1 +# - "0.9.0": changelog/0.9.0 +# - "0.8.1": changelog/0.8.1 +# - "0.8.0": changelog/0.8.0 +# - "0.7.2": changelog/0.7.2 +# - "0.7.1": changelog/0.7.1 +# - "0.7.0": changelog/0.7.0 +# - "DB": +# - db_/index.md +# - "DBReader": db_/reader +# - "DBWriter": db_/writer diff --git a/mkdocs/docs/en/changelog/0.10.0.md b/mkdocs/docs/en/changelog/0.10.0.md new file mode 100644 index 000000000..addf02ac5 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.10.0.md @@ -0,0 +1,535 @@ +# 0.10.0 (2023-12-18) + +## Breaking Changes + +- Upgrade `etl-entities` from v1 to v2 ([#172](https://github.com/MobileTeleSystems/onetl/pull/172)). + + This implies that `HWM` classes are now have different internal structure than they used to. 
+ + Before: + + ```python + from etl_entities.old_hwm import IntHWM as OldIntHWM + from etl_entities.source import Column, Table + from etl_entities.process import Process + + hwm = OldIntHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=Table(name="schema.table", instance="postgres://host:5432/db"), + column=Column(name="col1"), + value=123, + ) + ``` + + After: + + ```python + from etl_entities.hwm import ColumnIntHWM + + hwm = ColumnIntHWM( + name="some_unique_name", + description="any value you want", + source="schema.table", + expression="col1", + value=123, + ) + ``` + + **Breaking change:** If you used HWM classes from `etl_entities` module, you should rewrite your code to make it compatible with new version. + +??? "More details" + + - `HWM` classes used by previous onETL versions were moved from `etl_entities` to `etl_entities.old_hwm` submodule. They are here for compatibility reasons, but are planned to be removed in `etl-entities` v3 release. + - New `HWM` classes have flat structure instead of nested. + - New `HWM` classes have mandatory `name` attribute (it was known as `qualified_name` before). + - Type aliases used while serializing and deserializing `HWM` objects to `dict` representation were changed too: `int` → `column_int`. + + + To make migration simpler, you can use new method: + + ```python + old_hwm = OldIntHWM(...) + new_hwm = old_hwm.as_new_hwm() + ``` + + Which automatically converts all fields from old structure to new one, including `qualified_name` → `name`. + +- **Breaking changes:** + + - Methods `BaseHWMStore.get()` and `BaseHWMStore.save()` were renamed to `get_hwm()` and `set_hwm()`. + - They now can be used only with new HWM classes from `etl_entities.hwm`, **old HWM classes are not supported**. + + If you used them in your code, please update it accordingly. + +- YAMLHWMStore **CANNOT read files created by older onETL versions** (0.9.x or older). + +??? 
"Update procedure" + + ```python + # pip install onetl==0.9.5 + + # Get qualified_name for HWM + + + # Option 1. HWM is built manually + from etl_entities import IntHWM, FileListHWM + from etl_entities.source import Column, Table, RemoteFolder + from etl_entities.process import Process + + # for column HWM + old_column_hwm = IntHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=Table(name="schema.table", instance="postgres://host:5432/db"), + column=Column(name="col1"), + ) + qualified_name = old_column_hwm.qualified_name + # "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + # for file HWM + old_file_hwm = FileListHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=RemoteFolder(name="/absolute/path", instance="ftp://ftp.server:21"), + ) + qualified_name = old_file_hwm.qualified_name + # "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost" + + + # Option 2. HWM is generated automatically (by DBReader/FileDownloader) + # See onETL logs and search for string like qualified_name = '...' 
+ + qualified_name = "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + + # Get .yml file path by qualified_name + + import os + from pathlib import PurePosixPath + from onetl.hwm.store import YAMLHWMStore + + # here you should pass the same arguments as used on production, if any + yaml_hwm_store = YAMLHWMStore() + hwm_path = yaml_hwm_store.get_file_path(qualified_name) + print(hwm_path) + + # for column HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/col1__schema.table__postgres_host_5432_db__cde.abc.myprocess__myhost.yml') + + # for file HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/file_list__absolute_path__ftp_ftp.server_21__cde.abc.myprocess__myhost.yml') + + + # Read raw .yml file content + + from yaml import safe_load, dump + + raw_old_hwm_items = safe_load(hwm_path.read_text()) + print(raw_old_hwm_items) + + # for column HWM + # [ + # { + # "column": { "name": "col1", "partition": {} }, + # "modified_time": "2023-12-18T10: 39: 47.377378", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "postgres: //host:5432/db", "name": "schema.table" }, + # "type": "int", + # "value": "123", + # }, + # ] + + # for file HWM + # [ + # { + # "modified_time": "2023-12-18T11:15:36.478462", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "ftp://ftp.server:21", "name": "/absolute/path" }, + # "type": "file_list", + # "value": ["file1.txt", "file2.txt"], + # }, + # ] + + + # Convert file content to new structure, compatible with onETL 0.10.x + raw_new_hwm_items = [] + for old_hwm in raw_old_hwm_items: + new_hwm = {"name": qualified_name, "modified_time": old_hwm["modified_time"]} + + if "column" in old_hwm: + new_hwm["expression"] = old_hwm["column"]["name"] + new_hwm["entity"] = old_hwm["source"]["name"] + old_hwm.pop("process", None) + + if old_hwm["type"] == "int": + 
new_hwm["type"] = "column_int" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "date": + new_hwm["type"] = "column_date" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "datetime": + new_hwm["type"] = "column_datetime" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "file_list": + new_hwm["type"] = "file_list" + new_hwm["value"] = [ + os.fspath(PurePosixPath(old_hwm["source"]["name"]).joinpath(path)) + for path in old_hwm["value"] + ] + + else: + raise ValueError("WAT?") + + raw_new_hwm_items.append(new_hwm) + + + print(raw_new_hwm_items) + # for column HWM + # [ + # { + # "name": "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost", + # "modified_time": "2023-12-18T10:39:47.377378", + # "expression": "col1", + # "source": "schema.table", + # "type": "column_int", + # "value": 123, + # }, + # ] + + # for file HWM + # [ + # { + # "name": "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost", + # "modified_time": "2023-12-18T11:15:36.478462", + # "entity": "/absolute/path", + # "type": "file_list", + # "value": ["/absolute/path/file1.txt", "/absolute/path/file2.txt"], + # }, + # ] + + + # Save file with new content + with open(hwm_path, "w") as file: + dump(raw_new_hwm_items, file) + + + # Stop Python interpreter and update onETL + # pip install onetl==0.10.0 + # Check that new .yml file can be read + + from onetl.hwm.store import YAMLHWMStore + + qualified_name = ... 
+ + # here you should pass the same arguments as used on production, if any + yaml_hwm_store = YAMLHWMStore() + yaml_hwm_store.get_hwm(qualified_name) + + # for column HWM + # ColumnIntHWM( + # name='col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost', + # description='', + # entity='schema.table', + # value=123, + # expression='col1', + # modified_time=datetime.datetime(2023, 12, 18, 10, 39, 47, 377378), + # ) + + # for file HWM + # FileListHWM( + # name='file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost', + # description='', + # entity=AbsolutePath('/absolute/path'), + # value=frozenset({AbsolutePath('/absolute/path/file1.txt'), AbsolutePath('/absolute/path/file2.txt')}), + # expression=None, + # modified_time=datetime.datetime(2023, 12, 18, 11, 15, 36, 478462) + # ) + + + # That's all! + ``` + +But most users use other HWM store implementations which do not have such issues. + +- Several classes and functions were moved from `onetl` to `etl_entities`: + +=== "onETL `0.9.x` and older" + + ```python + from onetl.hwm.store import ( + detect_hwm_store, + BaseHWMStore, + HWMStoreClassRegistry, + register_hwm_store_class, + HWMStoreManager, + MemoryHWMStore, + ) + ``` + +=== "onETL `0.10.x` and newer" + + ```python + from etl_entities.hwm_store import ( + detect_hwm_store, + BaseHWMStore, + HWMStoreClassRegistry, + register_hwm_store_class, + HWMStoreManager, + MemoryHWMStore, + ) + ``` + + They still can be imported from the old module, but this is deprecated and will be removed in v1.0.0 release. 
+ +- Change the way of passing `HWM` to `DBReader` and `FileDownloader` classes: + +=== "onETL `0.9.x` and older" + + ```python linenums="1" hl_lines="12-21" + # Simple + reader = DBReader( + connection=..., + source=..., + hwm_column="col1", + ) + + + # Complex + reader = DBReader( + connection=..., + source=..., + hwm_column=( + "col1", + "cast(col1 as date)", + ), + ) + + + # Files + downloader = FileDownloader( + connection=..., + source_path=..., + target_path=..., + hwm_type="file_list", + ) + ``` + +=== "onETL `0.10.x` and newer" + + ```python linenums="1" hl_lines="12-21" + # Simple + reader = DBReader( + connection=..., + source=..., + hwm=DBReader.AutoDetectHWM( + # name is mandatory now! + name="my_unique_hwm_name", + expression="col1", + ), + ) + + # Complex + reader = DBReader( + connection=..., + source=..., + hwm=DBReader.AutoDetectHWM( + # name is mandatory now! + name="my_unique_hwm_name", + expression="cast(col1 as date)", + ), + ) + + # Files + downloader = FileDownloader( + connection=..., + source_path=..., + target_path=..., + hwm=FileListHWM( + # name is mandatory now! + name="another_unique_hwm_name", + ), + ) + ``` + + + New HWM classes have **mandatory** `name` attribute which should be passed explicitly, + instead of generating it automatically under the hood. + + Automatic `name` generation using the old `DBReader.hwm_column` / `FileDownloader.hwm_type` + syntax is still supported, but will be removed in v1.0.0 release. ([#179](https://github.com/MobileTeleSystems/onetl/pull/179)) + +- Performance of read Incremental and Batch strategies has been drastically improved. ([#182](https://github.com/MobileTeleSystems/onetl/pull/182)). + +??? 
"Before and after in details" + + `DBReader.run()` + incremental/batch strategy behavior in versions 0.9.x and older: + + - Get table schema by making query `SELECT * FROM table WHERE 1=0` (if `DBReader.columns` has `*`) + - Expand `*` to real column names from table, add `hwm_column` here, remove duplicates (as some RDBMS does not allow that). + - Create dataframe from query like `SELECT hwm_expression AS hwm_column, ...other table columns... FROM table WHERE hwm_expression > prev_hwm.value`. + - Determine HWM class using dataframe schema: `df.schema[hwm_column].dataType`. + - Determine max HWM column value using Spark: `df.select(max(hwm_column)).collect()`. + - Use `max(hwm_column)` as next HWM value, and save it to HWM Store. + - Return dataframe to user. + + This was far from ideal: + + - Dataframe content (all rows or just changed ones) was loaded from the source to Spark only to get min/max values of specific column. + + - Step of fetching table schema and then substituting column names in the next query caused some unexpected errors. + + For example, source contains columns with mixed name case, like `"CamelColumn"` or `"spaced column"`. + + Column names were *not* escaped during query generation, leading to queries that cannot be executed by database. + + So users have to *explicitly* pass column names to `DBReader`, wrapping columns with mixed naming with `"`: + + ```python + reader = DBReader( + connection=..., + source=..., + columns=[ # passing '*' here leads to wrong SQL query generation + "normal_column", + '"CamelColumn"', + '"spaced column"', + ..., + ], + ) + ``` + - Using `DBReader` with `IncrementalStrategy` could lead to reading rows already read before. + + Dataframe was created from query with WHERE clause like `hwm.expression > prev_hwm.value`, + not `hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`. 
+ + So if new rows appeared in the source **after** HWM value is determined, + they can be read by accessing dataframe content (because Spark dataframes are lazy), + leading to inconsistencies between HWM value and dataframe content. + + This may lead to issues when `DBReader.run()` reads some data, updated HWM value, and next call of `DBReader.run()` + will read rows that were already read in previous run. + + `DBReader.run()` + incremental/batch strategy behavior in versions 0.10.x and newer: + + - Detect type of HWM expression: `SELECT hwm.expression FROM table WHERE 1=0`. + - Determine corresponding Spark type `df.schema[0]` and then determine matching HWM class (if `DBReader.AutoDetectHWM` is used). + - Get min/max values by querying the source: `SELECT MAX(hwm.expression) FROM table WHERE hwm.expression >= prev_hwm.value`. + - Use `max(hwm.expression)` as next HWM value, and save it to HWM Store. + - Create dataframe from query `SELECT ... table columns ... FROM table WHERE hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`, baking new HWM value into the query. + - Return dataframe to user. + + Improvements: + + - Allow source to calculate min/max instead of loading everything to Spark. This should be **faster** on large amounts of data (**up to x2**), because we do not transfer all the data from the source to Spark. This can be even faster if the source has indexes for HWM column. + - Columns list is passed to source as-is, without any resolving on `DBReader` side. So you can pass `DBReader(columns=["*"])` to read tables with mixed columns naming. + - Restrict dataframe content to always match HWM values, which leads to never reading the same row twice. + + **Breaking change**: HWM column is not being implicitly added to dataframe. It was a part of `SELECT` clause, but now it is mentioned only in `WHERE` clause. 
+ + So if you had code like this, you have to rewrite it: + +=== "onETL `0.9.x` and older" + + ```python linenums="1" hl_lines="1-16" + reader = DBReader( + connection=..., + source=..., + columns=[ + "col1", + "col2", + ], + hwm_column="hwm_col", + ) + + df = reader.run() + # hwm_column value is in the dataframe + assert df.columns == ["col1", "col2", "hwm_col"] + + + reader = DBReader( + connection=..., + source=..., + columns=[ + "col1", + "col2", + ], + hwm_column=( + "hwm_col", + "cast(hwm_col as int)", + ), + ) + + df = reader.run() + # hwm_expression value is in the dataframe + assert df.columns == ["col1", "col2", "hwm_col"] + ``` + +=== "onETL `0.10.x` and newer" + + ```python linenums="1" hl_lines="1-16" + reader = DBReader( + connection=..., + source=..., + columns=[ + "col1", + "col2", + # add hwm_column explicitly + "hwm_col", + ], + hwm_column="hwm_col", + ) + + df = reader.run() + # if columns list is not updated, + # this will fail + assert df.columns == ["col1", "col2", "hwm_col"] + + reader = DBReader( + connection=..., + source=..., + columns=[ + "col1", + "col2", + # add hwm_expression explicitly + "cast(hwm_col as int) as hwm_col", + ], + hwm_column=( + "hwm_col", + "cast(hwm_col as int)", + ), + ) + df = reader.run() + # if columns list is not updated, + # this will fail + assert df.columns == ["col1", "col2", "hwm_col"] + ``` + + But most users just use `columns=["*"]` anyway, they won't see any changes. + +- `FileDownloader.run()` now updates HWM in HWM Store not after each file is being successfully downloaded, + but after all files were handled. + + This is because: + + - FileDownloader can be used with `DownloadOptions(workers=N)`, which could lead to race condition - one thread can save to HWM store one HWM value when another thread will save different value. + - FileDownloader can download hundreds and thousands of files, and issuing a request to HWM Store for each file could potentially DDoS HWM Store. 
([#189](https://github.com/MobileTeleSystems/onetl/pull/189)) + + There is an exception handler which tries to save HWM to HWM store if download process was interrupted. But if it was interrupted by force, like sending `SIGKILL` event, + HWM will not be saved to HWM store, so some already downloaded files may be downloaded again next time. + + But unexpected process kill may produce other negative impact, like some file will be downloaded partially, so this is an expected behavior. + +## Features + +- Add Python 3.12 compatibility. ([#167](https://github.com/MobileTeleSystems/onetl/pull/167)) +- `Excel` file format now can be used with Spark 3.5.0. ([#187](https://github.com/MobileTeleSystems/onetl/pull/187)) +- `SnapshotBatchStrategy` and `IncrementalBatchStrategy` do not raise exceptions if source does not contain any data. + Instead they stop at first iteration and return empty dataframe. ([#188](https://github.com/MobileTeleSystems/onetl/pull/188)) +- Cache result of `connection.check()` in high-level classes like `DBReader`, `FileDownloader` and so on. This makes logs less verbose. ([#190](https://github.com/MobileTeleSystems/onetl/pull/190)) + +## Bug Fixes + +- Fix `@slot` and `@hook` decorators returning methods with missing arguments in signature (Pylance, VS Code). ([#183](https://github.com/MobileTeleSystems/onetl/pull/183)) +- Kafka connector documentation said that it does support reading topic data incrementally by passing `group.id` or `groupIdPrefix`. + Actually, this is not true, because Spark does not send information to Kafka which messages were consumed. + So currently users can only read the whole topic, no incremental reads are supported. 
diff --git a/mkdocs/docs/en/changelog/0.10.1.md b/mkdocs/docs/en/changelog/0.10.1.md new file mode 100644 index 000000000..6280dc26b --- /dev/null +++ b/mkdocs/docs/en/changelog/0.10.1.md @@ -0,0 +1,29 @@ +# 0.10.1 (2024-02-05) + +## Features + +- Add support of `Incremental Strategies` for `Kafka` connection: + + ```python + reader = DBReader( + connection=Kafka(...), + source="topic_name", + hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + + This lets you resume reading data from a Kafka topic starting at the last committed offset from your previous run. ([#202](https://github.com/MobileTeleSystems/onetl/pull/202)) + +- Add `has_data`, `raise_if_no_data` methods to `DBReader` class. ([#203](https://github.com/MobileTeleSystems/onetl/pull/203)) + +- Update VMware Greenplum connector from `2.1.4` to `2.3.0`. This implies: + - Greenplum 7.x support + - [Kubernetes support](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg) + - New read option [gpdb.matchDistributionPolicy](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#distpolmotion) + which allows to match each Spark executor with specific Greenplum segment, avoiding redundant data transfer between Greenplum segments + - Allows overriding [Greenplum optimizer parameters](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#greenplum-gucs) in read/write operations ([#208](https://github.com/MobileTeleSystems/onetl/pull/208)) + +- `Greenplum.get_packages()` method now accepts optional arg `package_version` which allows to override version of Greenplum connector package. 
([#208](https://github.com/MobileTeleSystems/onetl/pull/208)) diff --git a/mkdocs/docs/en/changelog/0.10.2.md b/mkdocs/docs/en/changelog/0.10.2.md new file mode 100644 index 000000000..3ae7ad822 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.10.2.md @@ -0,0 +1,39 @@ +# 0.10.2 (2024-03-21) + +## Features + +- Add support of Pydantic v2. ([#230](https://github.com/MobileTeleSystems/onetl/pull/230)) + +## Improvements + +- Improve database connections documentation: + - Add "Types" section describing mapping between Clickhouse and Spark types + - Add "Prerequisites" section describing different aspects of connecting to Clickhouse + - Separate documentation of `DBReader` and `.sql()` / `.pipeline(...)` + - Add examples for `.fetch()` and `.execute()` ([#211](https://github.com/MobileTeleSystems/onetl/pull/211), [#228](https://github.com/MobileTeleSystems/onetl/pull/228), [#229](https://github.com/MobileTeleSystems/onetl/pull/229), [#233](https://github.com/MobileTeleSystems/onetl/pull/233), [#234](https://github.com/MobileTeleSystems/onetl/pull/234), [#235](https://github.com/MobileTeleSystems/onetl/pull/235), [#236](https://github.com/MobileTeleSystems/onetl/pull/236), [#240](https://github.com/MobileTeleSystems/onetl/pull/240)) + +- Add notes to Greenplum documentation about issues with IP resolution and building `gpfdist` URL ([#228](https://github.com/MobileTeleSystems/onetl/pull/228)) + +- Allow calling `MongoDB.pipeline(...)` with passing just collection name, without explicit aggregation pipeline. ([#237](https://github.com/MobileTeleSystems/onetl/pull/237)) + +- Update default `Postgres(extra={...})` to include `{"stringtype": "unspecified"}` option. + This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. + + For example, now it is possible to read column of type `money` as Spark's `StringType()`, and write it back to the same column, + without using intermediate columns or tables. 
([#229](https://github.com/MobileTeleSystems/onetl/pull/229)) + +## Bug Fixes + +- Return back handling of `DBReader(columns="string")`. This was a valid syntax up to v0.10 release, but it was removed because + most of users never used it. It looks like we were wrong, returning this behavior back, but with deprecation warning. ([#238](https://github.com/MobileTeleSystems/onetl/pull/238)) + +- Downgrade Greenplum package version from `2.3.0` to `2.2.0`. ([#239](https://github.com/MobileTeleSystems/onetl/pull/239)) + + This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x. + Connector can open transaction with `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks. + + For using this connector with Greenplum 7.x, please pass package version explicitly: + + ```python + maven_packages = Greenplum.get_packages(package_version="2.3.0", ...) + ``` diff --git a/mkdocs/docs/en/changelog/0.11.0.md b/mkdocs/docs/en/changelog/0.11.0.md new file mode 100644 index 000000000..64dee376f --- /dev/null +++ b/mkdocs/docs/en/changelog/0.11.0.md @@ -0,0 +1,212 @@ +# 0.11.0 (2024-05-27) + +## Breaking Changes + +There can be some changes in connection behavior, related to version upgrades. So we mark these changes as **breaking** although +most of users will not see any differences. + +- Update Clickhouse JDBC driver to latest version ([#249](https://github.com/MobileTeleSystems/onetl/pull/249)): + - Package was renamed `ru.yandex.clickhouse:clickhouse-jdbc` → `com.clickhouse:clickhouse-jdbc`. + - Package version changed `0.3.2` → `0.6.0-patch5`. + - Driver name changed `ru.yandex.clickhouse.ClickHouseDriver` → `com.clickhouse.jdbc.ClickHouseDriver`. + + This brings up several fixes for Spark <-> Clickhouse type compatibility, and also Clickhouse clusters support. + +- Update other JDBC drivers to latest versions: + - MSSQL `12.2.0` → `12.6.2` ([#254](https://github.com/MobileTeleSystems/onetl/pull/254)). 
- MySQL `8.0.33` → `8.4.0` ([#253](https://github.com/MobileTeleSystems/onetl/pull/253), [#285](https://github.com/MobileTeleSystems/onetl/pull/285)). + - Oracle `23.2.0.0` → `23.4.0.24.05` ([#252](https://github.com/MobileTeleSystems/onetl/pull/252), [#284](https://github.com/MobileTeleSystems/onetl/pull/284)). + - Postgres `42.6.0` → `42.7.3` ([#251](https://github.com/MobileTeleSystems/onetl/pull/251)). + +- Update MongoDB connector to latest version: `10.1.1` → `10.3.0` ([#255](https://github.com/MobileTeleSystems/onetl/pull/255), [#283](https://github.com/MobileTeleSystems/onetl/pull/283)). + + This brings up Spark 3.5 support. + +- Update `XML` package to latest version: `0.17.0` → `0.18.0` ([#259](https://github.com/MobileTeleSystems/onetl/pull/259)). + + This brings a few bugfixes with datetime format handling. + +- For JDBC connections add new `SQLOptions` class for `DB.sql(query, options=...)` method ([#272](https://github.com/MobileTeleSystems/onetl/pull/272)). + + Firstly, to keep naming more consistent. + + Secondly, some of the options are not supported by `DB.sql(...)` method, but supported by `DBReader`. + For example, `SQLOptions` do not support `partitioning_mode` and require explicit definition of `lower_bound` and `upper_bound` when `num_partitions` is greater than 1. + `ReadOptions` does support `partitioning_mode` and allows skipping `lower_bound` and `upper_bound` values. + + This requires some code changes. Before: + + ```python + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.ReadOptions( + partitioning_mode="range", + partition_column="id", + num_partitions=10, + ), + ) + ``` + + After: + + ```python + from onetl.connection import Postgres + + postgres = Postgres(...) 
+ df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.SQLOptions( + # partitioning_mode is not supported! + partition_column="id", + num_partitions=10, + lower_bound=0, # <-- set explicitly + upper_bound=1000, # <-- set explicitly + ), + ) + ``` + + For now, `DB.sql(query, options=...)` can accept `ReadOptions` to keep backward compatibility, but emits deprecation warning. + The support will be removed in `v1.0.0`. + +- Split up `JDBCOptions` class into `FetchOptions` and `ExecuteOptions` ([#274](https://github.com/MobileTeleSystems/onetl/pull/274)). + + New classes are used by `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` methods respectively. + This is mostly to keep naming more consistent. + + This require some code changes. Before: + + ```python + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.JDBCOptions( + fetchsize=1000, + query_timeout=30, + ), + ) + + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.JDBCOptions(query_timeout=30), + ) + ``` + + After: + + ```python + from onetl.connection import Postgres + + # Using FetchOptions for fetching data + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.FetchOptions( # <-- change class name + fetchsize=1000, + query_timeout=30, + ), + ) + + # Using ExecuteOptions for executing statements + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.ExecuteOptions(query_timeout=30), # <-- change class name + ) + ``` + + For now, `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` can accept `JDBCOptions`, to keep backward compatibility, + but emit a deprecation warning. The old class will be removed in `v1.0.0`. 
+ +- Serialize `ColumnDatetimeHWM` to Clickhouse's `DateTime64(6)` (precision up to microseconds) instead of `DateTime` (precision up to seconds) ([#267](https://github.com/MobileTeleSystems/onetl/pull/267)). + + In previous onETL versions, `ColumnDatetimeHWM` value was rounded to the second, and thus reading some rows that were read in previous runs, + producing duplicates. + + For Clickhouse versions below 21.1 comparing column of type `DateTime` with a value of type `DateTime64` is not supported, returning an empty dataframe. + To avoid this, replace: + + ```python + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="hwm_column", # <-- + ), + ) + ``` + + with: + + ```python + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="CAST(hwm_column AS DateTime64)", # <-- add explicit CAST + ), + ) + ``` + +- Pass JDBC connection extra params as `properties` dict instead of URL with query part ([#268](https://github.com/MobileTeleSystems/onetl/pull/268)). + + This allows passing custom connection parameters like `Clickhouse(extra={"custom_http_options": "option1=value1,option2=value2"})` + without need to apply urlencode to parameter value, like `option1%3Dvalue1%2Coption2%3Dvalue2`. + +## Features + +Improve user experience with Kafka messages and Database tables with serialized columns, like JSON/XML. + +- Allow passing custom package version as argument for `DB.get_packages(...)` method of several DB connectors: + - `Clickhouse.get_packages(package_version=..., apache_http_client_version=...)` ([#249](https://github.com/MobileTeleSystems/onetl/pull/249)). + - `MongoDB.get_packages(scala_version=..., spark_version=..., package_version=...)` ([#255](https://github.com/MobileTeleSystems/onetl/pull/255)). + - `MySQL.get_packages(package_version=...)` ([#253](https://github.com/MobileTeleSystems/onetl/pull/253)). 
+ - `MSSQL.get_packages(java_version=..., package_version=...)` ([#254](https://github.com/MobileTeleSystems/onetl/pull/254)). + - `Oracle.get_packages(java_version=..., package_version=...)` ([#252](https://github.com/MobileTeleSystems/onetl/pull/252)). + - `Postgres.get_packages(package_version=...)` ([#251](https://github.com/MobileTeleSystems/onetl/pull/251)). + - `Teradata.get_packages(package_version=...)` ([#256](https://github.com/MobileTeleSystems/onetl/pull/256)). + Now users can downgrade or upgrade connection without waiting for next onETL release. Previously only `Kafka` and `Greenplum` supported this feature. +- Add `FileFormat.parse_column(...)` method to several classes: + - `Avro.parse_column(col)` ([#265](https://github.com/MobileTeleSystems/onetl/pull/265)). + - `JSON.parse_column(col, schema=...)` ([#257](https://github.com/MobileTeleSystems/onetl/pull/257)). + - `CSV.parse_column(col, schema=...)` ([#258](https://github.com/MobileTeleSystems/onetl/pull/258)). + - `XML.parse_column(col, schema=...)` ([#269](https://github.com/MobileTeleSystems/onetl/pull/269)). + This allows parsing data in `value` field of Kafka message or string/binary column of some table as a nested Spark structure. +- Add `FileFormat.serialize_column(...)` method to several classes: + - `Avro.serialize_column(col)` ([#265](https://github.com/MobileTeleSystems/onetl/pull/265)). + - `JSON.serialize_column(col)` ([#257](https://github.com/MobileTeleSystems/onetl/pull/257)). + - `CSV.serialize_column(col)` ([#258](https://github.com/MobileTeleSystems/onetl/pull/258)). + This allows saving Spark nested structures or arrays to `value` field of Kafka message or string/binary column of some table. + +## Improvements + +Few documentation improvements. + +- Replace all `assert` in documentation with doctest syntax. This should make documentation more readable ([#273](https://github.com/MobileTeleSystems/onetl/pull/273)). 
+- Add generic `Troubleshooting` guide ([#275](https://github.com/MobileTeleSystems/onetl/pull/275)).
+- Improve Kafka documentation:
+  - Add "Prerequisites" page describing different aspects of connecting to Kafka.
+  - Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes.
+  - Add "Troubleshooting" page ([#276](https://github.com/MobileTeleSystems/onetl/pull/276)).
+- Improve Hive documentation:
+  - Add "Prerequisites" page describing different aspects of connecting to Hive.
+  - Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations.
+  - Improve "Executing statements in Hive" page of Hive documentation. ([#278](https://github.com/MobileTeleSystems/onetl/pull/278)).
+- Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. ([#279](https://github.com/MobileTeleSystems/onetl/pull/279)).
+- Add note about connecting to Clickhouse cluster. ([#280](https://github.com/MobileTeleSystems/onetl/pull/280)).
+- Add notes about versions when specific class/method/attribute/argument was added, renamed or changed behavior ([#282](https://github.com/MobileTeleSystems/onetl/pull/282)).
+
+## Bug Fixes
+
+- Fix missing `pysmb` package after installing `pip install onetl[files]`.
diff --git a/mkdocs/docs/en/changelog/0.11.1.md b/mkdocs/docs/en/changelog/0.11.1.md
new file mode 100644
index 000000000..823afe3be
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.11.1.md
@@ -0,0 +1,9 @@
+# 0.11.1 (2024-05-29)
+
+## Features
+
+- Change `MSSQL.port` default from `1433` to `None`, allowing use of `instanceName` to detect port number. ([#287](https://github.com/MobileTeleSystems/onetl/pull/287))
+
+## Bug Fixes
+
+- Remove `fetchsize` from `JDBC.WriteOptions`.
([#288](https://github.com/MobileTeleSystems/onetl/pull/288))
diff --git a/mkdocs/docs/en/changelog/0.11.2.md b/mkdocs/docs/en/changelog/0.11.2.md
new file mode 100644
index 000000000..9278d22f8
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.11.2.md
@@ -0,0 +1,5 @@
+# 0.11.2 (2024-09-02)
+
+## Bug Fixes
+
+- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. ([#308](https://github.com/MobileTeleSystems/onetl/pull/308))
diff --git a/mkdocs/docs/en/changelog/0.12.0.md b/mkdocs/docs/en/changelog/0.12.0.md
new file mode 100644
index 000000000..aeaca1d23
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.12.0.md
@@ -0,0 +1,54 @@
+# 0.12.0 (2024-09-03)
+
+## Breaking Changes
+
+- Change connection URL used for generating HWM names of S3 and Samba sources:
+  - `smb://host:port` -> `smb://host:port/share`
+  - `s3://host:port` -> `s3://host:port/bucket` ([#304](https://github.com/MobileTeleSystems/onetl/pull/304))
+- Update DB connectors/drivers to latest versions:
+  - Clickhouse `0.6.0-patch5` → `0.6.5`
+  - MongoDB `10.3.0` → `10.4.0`
+  - MSSQL `12.6.2` → `12.8.1`
+  - MySQL `8.4.0` → `9.0.0`
+  - Oracle `23.4.0.24.05` → `23.5.0.24.07`
+  - Postgres `42.7.3` → `42.7.4`
+- Update `Excel` package from `0.20.3` to `0.20.4`, to include Spark 3.5.1 support. ([#306](https://github.com/MobileTeleSystems/onetl/pull/306))
+
+## Features
+
+- Add support for specifying file formats (`ORC`, `Parquet`, `CSV`, etc.) in `HiveWriteOptions.format` ([#292](https://github.com/MobileTeleSystems/onetl/pull/292)):
+
+  ```python
+  Hive.WriteOptions(format=ORC(compression="snappy"))
+  ```
+
+- Collect Spark execution metrics in the following methods, and log them in DEBUG mode:
+  - `DBWriter.run()`
+  - `FileDFWriter.run()`
+  - `Hive.sql()`
+  - `Hive.execute()`
+
+  This is implemented using custom `SparkListener` which wraps the entire method call, and
+  then reports collected metrics.
But these metrics sometimes may be missing due to Spark architecture,
+  so they are not a reliable source of information. That's why logs are printed only in DEBUG mode, and
+  are not returned as method call result. ([#303](https://github.com/MobileTeleSystems/onetl/pull/303))
+
+- Generate default `jobDescription` based on currently executed method. Examples:
+  - `DBWriter.run(schema.table) -> Postgres[host:5432/database]`
+  - `MongoDB[localhost:27017/admin] -> DBReader.has_data(mycollection)`
+  - `Hive[cluster].execute()`
+
+  If user already set custom `jobDescription`, it will be left intact. ([#304](https://github.com/MobileTeleSystems/onetl/pull/304))
+
+- Add log.info about JDBC dialect usage ([#305](https://github.com/MobileTeleSystems/onetl/pull/305)):
+
+  ```text
+  |MySQL| Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect'
+  ```
+
+- Log estimated size of in-memory dataframe created by `JDBC.fetch` and `JDBC.execute` methods. ([#303](https://github.com/MobileTeleSystems/onetl/pull/303))
+
+## Bug Fixes
+
+- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. ([#308](https://github.com/MobileTeleSystems/onetl/pull/308))
+- Do not raise exception if yield-based hook has something past (and only one) `yield`.
diff --git a/mkdocs/docs/en/changelog/0.12.1.md b/mkdocs/docs/en/changelog/0.12.1.md
new file mode 100644
index 000000000..d32401517
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.12.1.md
@@ -0,0 +1,17 @@
+# 0.12.1 (2024-10-28)
+
+## Features
+
+- Log detected JDBC dialect while using `DBWriter`.
+
+## Bug Fixes
+
+- Fix `SparkMetricsRecorder` failing when receiving `SparkListenerTaskEnd` without `taskMetrics` (e.g. executor was killed by OOM). ([#313](https://github.com/MobileTeleSystems/onetl/pull/313))
+- Call `kinit` before checking for HDFS active namenode.
+- Wrap `kinit` with `threading.Lock` to avoid multithreading issues.
+- Immediately show `kinit` errors to user, instead of hiding them.
+- Use `AttributeError` instead of `ImportError` in module's `__getattr__` method, to make code compliant with Python spec. + +## Doc only Changes + +- Add note about [spark-dialect-extension](https://github.com/MobileTeleSystems/spark-dialect-extension) package to Clickhouse connector documentation. ([#310](https://github.com/MobileTeleSystems/onetl/pull/310)) diff --git a/mkdocs/docs/en/changelog/0.12.2.md b/mkdocs/docs/en/changelog/0.12.2.md new file mode 100644 index 000000000..23a8d383d --- /dev/null +++ b/mkdocs/docs/en/changelog/0.12.2.md @@ -0,0 +1,18 @@ +# 0.12.2 (2024-11-12) + +## Improvements + +- Change Spark `jobDescription` for DBReader & FileDFReader from `DBReader.run() -> Connection` to `Connection -> DBReader.run()`. + +## Bug Fixes + +- Fix `log_hwm` result for `KeyValueIntHWM` (used by Kafka). ([#316](https://github.com/MobileTeleSystems/onetl/pull/316)) +- Fix `log_collection` hiding values of `Kafka.addresses` in logs with `INFO` level. ([#316](https://github.com/MobileTeleSystems/onetl/pull/316)) + +## Dependencies + +- Allow using [etl-entities==2.4.0](https://github.com/MobileTeleSystems/etl-entities/releases/tag/2.4.0). + +## Doc only Changes + +- Fix links to MSSQL date & time type documentation. diff --git a/mkdocs/docs/en/changelog/0.12.3.md b/mkdocs/docs/en/changelog/0.12.3.md new file mode 100644 index 000000000..741c74e1f --- /dev/null +++ b/mkdocs/docs/en/changelog/0.12.3.md @@ -0,0 +1,5 @@ +# 0.12.3 (2024-11-22) + +## Bug Fixes + +- Allow passing table names in format `schema."table.with.dots"` to `DBReader(source=...)` and `DBWriter(target=...)`. diff --git a/mkdocs/docs/en/changelog/0.12.4.md b/mkdocs/docs/en/changelog/0.12.4.md new file mode 100644 index 000000000..3ebc57a87 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.12.4.md @@ -0,0 +1,5 @@ +# 0.12.4 (2024-11-27) + +## Bug Fixes + +- Fix `DBReader(conn=oracle, options={"partitioning_mode": "hash"})` lead to data skew in last partition due to wrong `ora_hash` usage. 
([#319](https://github.com/MobileTeleSystems/onetl/pull/319))
diff --git a/mkdocs/docs/en/changelog/0.12.5.md b/mkdocs/docs/en/changelog/0.12.5.md
new file mode 100644
index 000000000..c542a50fd
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.12.5.md
@@ -0,0 +1,13 @@
+# 0.12.5 (2024-12-03)
+
+## Improvements
+
+- Use `sipHash64` instead of `md5` in Clickhouse for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster.
+- Use `hashtext` instead of `md5` in Postgres for reading data with `{"partitioning_mode": "hash"}`, as it is 3-5 times faster.
+- Use `BINARY_CHECKSUM` instead of `HASHBYTES` in MSSQL for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster.
+
+## Bug Fixes
+
+- In JDBC sources wrap `MOD(partitionColumn, numPartitions)` with `ABS(...)` to make all returned values positive. This prevents data skew.
+- Fix reading table data from MSSQL using `{"partitioning_mode": "hash"}` with `partitionColumn` of integer type.
+- Fix reading table data from Postgres using `{"partitioning_mode": "hash"}` leading to data skew (all the data was read into one Spark partition).
diff --git a/mkdocs/docs/en/changelog/0.13.0.md b/mkdocs/docs/en/changelog/0.13.0.md
new file mode 100644
index 000000000..0a318a68b
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.13.0.md
@@ -0,0 +1,222 @@
+# 0.13.0 (2025-02-24)
+
+🎉 3 years since first release 0.1.0 🎉
+
+## Breaking Changes
+
+- Add Python 3.13 support. ([#298](https://github.com/MobileTeleSystems/onetl/pull/298))
+
+- Change the logic of `FileConnection.walk` and `FileConnection.list_dir`. ([#327](https://github.com/MobileTeleSystems/onetl/pull/327))
+
+  Previously `limits.stops_at(path) == True` was considered as "return current file and stop", and could lead to exceeding some limit.
+  Now it means "stop immediately".
+
+- Change default value for `FileDFWriter.Options(if_exists=...)` from `error` to `append`,
+  to make it consistent with other `.Options()` classes within onETL.
([#343](https://github.com/MobileTeleSystems/onetl/pull/343))
+
+## Features
+
+- Add support for `FileModifiedTimeHWM` HWM class (see [etl-entities 2.5.0](https://github.com/MobileTeleSystems/etl-entities/releases/tag/2.5.0)):
+
+  ```python
+  from etl_entities.hwm import FileModifiedTimeHWM
+  from onetl.file import FileDownloader
+  from onetl.strategy import IncrementalStrategy
+
+  downloader = FileDownloader(
+      ...,
+      hwm=FileModifiedTimeHWM(name="somename"),
+  )
+
+  with IncrementalStrategy():
+      downloader.run()
+  ```
+
+- Introduce `FileSizeRange(min=..., max=...)` filter class. ([#325](https://github.com/MobileTeleSystems/onetl/pull/325))
+
+  Now users can set `FileDownloader` / `FileMover` to download/move only files with specific file size range:
+
+  ```python
+  from onetl.file import FileDownloader
+  from onetl.file.filter import FileSizeRange
+
+  downloader = FileDownloader(
+      ...,
+      filters=[FileSizeRange(min="10KiB", max="1GiB")],
+  )
+  ```
+
+- Introduce `TotalFilesSize(...)` limit class. ([#326](https://github.com/MobileTeleSystems/onetl/pull/326))
+
+  Now users can set `FileDownloader` / `FileMover` to stop downloading/moving files after reaching a certain amount of data:
+
+  ```python
+  from datetime import datetime, timedelta
+  from onetl.file import FileDownloader
+  from onetl.file.limit import TotalFilesSize
+
+  downloader = FileDownloader(
+      ...,
+      limits=[TotalFilesSize("1GiB")],
+  )
+  ```
+
+- Implement `FileModifiedTime(since=..., until=...)` file filter.
([#330](https://github.com/MobileTeleSystems/onetl/pull/330))
+
+  Now users can set `FileDownloader` / `FileMover` to download/move only files with specific file modification time:
+
+  ```python
+  from datetime import datetime, timedelta
+  from onetl.file import FileDownloader
+  from onetl.file.filter import FileModifiedTime
+
+  downloader = FileDownloader(
+      ...,
+      filters=[FileModifiedTime(until=datetime.now() - timedelta(hours=1))],
+  )
+  ```
+
+- Add `SparkS3.get_exclude_packages()` and `Kafka.get_exclude_packages()` methods. ([#341](https://github.com/MobileTeleSystems/onetl/pull/341))
+
+  Using them allows skipping downloading dependencies not required by this specific connector, or which are already a part of Spark/PySpark:
+
+  ```python
+  from onetl.connection import SparkS3, Kafka
+
+  maven_packages = [
+      *SparkS3.get_packages(spark_version="3.5.4"),
+      *Kafka.get_packages(spark_version="3.5.4"),
+  ]
+  exclude_packages = SparkS3.get_exclude_packages() + Kafka.get_exclude_packages()
+  spark = (
+      SparkSession.builder.appName("spark_app_onetl_demo")
+      .config("spark.jars.packages", ",".join(maven_packages))
+      .config("spark.jars.excludes", ",".join(exclude_packages))
+      .getOrCreate()
+  )
+  ```
+
+## Improvements
+
+- All DB connections opened by `JDBC.fetch(...)`, `JDBC.execute(...)` or `JDBC.check()`
+  are immediately closed after the statement is executed. ([#334](https://github.com/MobileTeleSystems/onetl/pull/334))
+
+  Previously Spark session with `master=local[3]` actually opened up to 5 connections to target DB - one for `JDBC.check()`,
+  another for Spark driver interaction with DB to create tables, and one for each Spark executor. Now only max 4 connections are opened,
+  as `JDBC.check()` does not hold opened connection.
+
+  This is important for RDBMS like Postgres or Greenplum where number of connections is strictly limited and limit is usually quite low.
+
+- Set up `ApplicationName` (client info) for Clickhouse, MongoDB, MSSQL, MySQL and Oracle.
([#339](https://github.com/MobileTeleSystems/onetl/pull/339), [#248](https://github.com/MobileTeleSystems/onetl/pull/248)) + + Also update `ApplicationName` format for Greenplum, Postgres, Kafka and SparkS3. + Now all connectors have the same `ApplicationName` format: `${spark.applicationId} ${spark.appName} onETL/${onetl.version} Spark/${spark.version}` + + The only connections not sending `ApplicationName` are Teradata and FileConnection implementations. + +- Now `DB.check()` will test connection availability not only on Spark driver, but also from some Spark executor. ([#346](https://github.com/MobileTeleSystems/onetl/pull/346)) + + This allows to fail immediately if Spark driver host has network access to target DB, but Spark executors have not. + +!!! note + + Now `Greenplum.check()` requires the same user grants as `DBReader(connection=greenplum)`: + + ```sql + -- yes, "writable" for reading data from GP, it's not a mistake + ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- for both reading and writing to GP + -- ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + ``` + + Please ask your Greenplum administrators to provide these grants. + + +## Bug Fixes + +- Avoid suppressing Hive Metastore errors while using `DBWriter`. ([#329](https://github.com/MobileTeleSystems/onetl/pull/329)) + + Previously this was implemented as: + + ```python + try: + spark.sql(f"SELECT * FROM {table}") + table_exists = True + except Exception: + table_exists = False + ``` + + If Hive Metastore was overloaded and responded with an exception, it was considered as non-existing table, resulting + to full table override instead of append or override only partitions subset. + +- Fix using onETL to write data to PostgreSQL or Greenplum instances behind *pgbouncer* with `pool_mode=transaction`. 
([#336](https://github.com/MobileTeleSystems/onetl/pull/336))
+
+  Previously `Postgres.check()` opened a read-only transaction, pgbouncer changed the entire connection type from read-write to read-only,
+  and when `DBWriter.run(df)` executed in read-only connection, producing errors like:
+
+  ```
+  org.postgresql.util.PSQLException: ERROR: cannot execute INSERT in a read-only transaction
+  org.postgresql.util.PSQLException: ERROR: cannot execute TRUNCATE TABLE in a read-only transaction
+  ```
+
+  Added a workaround by passing `readOnly=True` to JDBC params for read-only connections, so pgbouncer may distinguish read-only and read-write connections properly.
+
+  After upgrading onETL 0.13.x or higher the same error still may appear if pgbouncer still holds read-only connections and returns them for DBWriter.
+  To fix this, user can manually convert read-only connection to read-write:
+
+  ```python
+  postgres.execute("BEGIN READ WRITE;")  # <-- add this line
+  DBWriter(...).run()
+  ```
+
+  After all connections in pgbouncer pool were converted from read-only to read-write, and the error is fixed, this additional line can be removed.
+
+  See [Postgres JDBC driver documentation](https://jdbc.postgresql.org/documentation/use/).
+
+- Fix `MSSQL.fetch(...)` and `MySQL.fetch(...)` opened a read-write connection instead of read-only. ([#337](https://github.com/MobileTeleSystems/onetl/pull/337))
+
+  Now this is fixed:
+  - `MSSQL.fetch(...)` establishes connection with `ApplicationIntent=ReadOnly`.
+  - `MySQL.fetch(...)` calls `SET SESSION TRANSACTION READ ONLY` statement.
+
+- Fixed passing multiple filters to `FileDownloader` and `FileMover`. ([#338](https://github.com/MobileTeleSystems/onetl/pull/338))
+  It was caused by sorting filters list in internal logging method, but `FileFilter` subclasses are not sortable.
+
+- Fix a false warning about a lot of parallel connections to Greenplum.
([#342](https://github.com/MobileTeleSystems/onetl/pull/342))
+
+  Creating Spark session with `.master("local[5]")` may open up to 6 connections to Greenplum (=number of Spark executors + 1 for driver),
+  but onETL instead used number of *CPU cores* on the host as a number of parallel connections.
+
+  This led to showing a false warning that number of Greenplum connections is too high,
+  which actually should be the case only if number of executors is higher than 30.
+
+- Fix MongoDB trying to use current database name as `authSource`. ([#347](https://github.com/MobileTeleSystems/onetl/pull/347))
+
+  Use default connector value which is `admin` database. Previous onETL versions could be fixed by:
+
+  ```python
+  from onetl.connection import MongoDB
+
+  mongodb = MongoDB(
+      ...,
+      database="mydb",
+      extra={
+          "authSource": "admin",
+      },
+  )
+  ```
+
+## Dependencies
+
+- Minimal `etl-entities` version is now [2.5.0](https://github.com/MobileTeleSystems/etl-entities/releases/tag/2.5.0). ([#331](https://github.com/MobileTeleSystems/onetl/pull/331))
+- Update DB connectors/drivers to latest versions: ([#345](https://github.com/MobileTeleSystems/onetl/pull/345))
+  - Clickhouse `0.6.5` → `0.7.2`
+  - MongoDB `10.4.0` → `10.4.1`
+  - MySQL `9.0.0` → `9.2.0`
+  - Oracle `23.5.0.24.07` → `23.7.0.25.01`
+  - Postgres `42.7.4` → `42.7.5`
+
+## Doc only Changes
+
+- Split large code examples to tabs. ([#344](https://github.com/MobileTeleSystems/onetl/pull/344))
diff --git a/mkdocs/docs/en/changelog/0.13.1.md b/mkdocs/docs/en/changelog/0.13.1.md
new file mode 100644
index 000000000..4045c7cbf
--- /dev/null
+++ b/mkdocs/docs/en/changelog/0.13.1.md
@@ -0,0 +1,9 @@
+# 0.13.1 (2025-03-06)
+
+## Bug Fixes
+
+In 0.13.0, using `DBWriter(connection=hive, target="SOMEDB.SOMETABLE")` led to executing `df.write.saveAsTable()`
+instead of `df.write.insertInto()` if target table `somedb.sometable` already exists.
+ +This is caused by table name normalization (Hive uses lower-case names), which wasn't properly handled by method used for checking table existence. +([#350](https://github.com/MobileTeleSystems/onetl/pull/350)) diff --git a/mkdocs/docs/en/changelog/0.13.3.md b/mkdocs/docs/en/changelog/0.13.3.md new file mode 100644 index 000000000..1aa289b49 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.13.3.md @@ -0,0 +1,5 @@ +# 0.13.3 (2025-03-11) + +## Dependencies + +Allow using [etl-entities 2.6.0](https://github.com/MobileTeleSystems/etl-entities/releases/tag/2.6.0). diff --git a/mkdocs/docs/en/changelog/0.13.4.md b/mkdocs/docs/en/changelog/0.13.4.md new file mode 100644 index 000000000..10f695e0c --- /dev/null +++ b/mkdocs/docs/en/changelog/0.13.4.md @@ -0,0 +1,10 @@ +# 0.13.4 (2025-03-20) + +## Doc only Changes + +- Prefer `ReadOptions(partitionColumn=..., numPartitions=..., queryTimeout=...)` + instead of `ReadOptions(partition_column=..., num_partitions=..., query_timeout=...)`, + to match Spark documentation. ([#352](https://github.com/MobileTeleSystems/onetl/pull/352)) +- Prefer `WriteOptions(if_exists=...)` instead of `WriteOptions(mode=...)` for IDE suggestions. ([#354](https://github.com/MobileTeleSystems/onetl/pull/354)) +- Document all options of supported file formats. 
+ ([#355](https://github.com/MobileTeleSystems/onetl/pull/355), [#356](https://github.com/MobileTeleSystems/onetl/pull/356), [#357](https://github.com/MobileTeleSystems/onetl/pull/357), [#358](https://github.com/MobileTeleSystems/onetl/pull/358), [#359](https://github.com/MobileTeleSystems/onetl/pull/359), [#360](https://github.com/MobileTeleSystems/onetl/pull/360), [#361](https://github.com/MobileTeleSystems/onetl/pull/361), [#362](https://github.com/MobileTeleSystems/onetl/pull/362)) diff --git a/mkdocs/docs/en/changelog/0.7.0.md b/mkdocs/docs/en/changelog/0.7.0.md new file mode 100644 index 000000000..2140962fa --- /dev/null +++ b/mkdocs/docs/en/changelog/0.7.0.md @@ -0,0 +1,239 @@ +# 0.7.0 (2023-05-15) + +## 🎉 onETL is now open source 🎉 + +That was long road, but we finally did it! + +## Breaking Changes + +- Changed installation method. + + **TL;DR What should I change to restore previous behavior** + + Simple way: + + | onETL < 0.7.0 | onETL >= 0.7.0 | + | ----------------- | --------------------------------- | + | pip install onetl | pip install onetl[files,kerberos] | + + Right way - enumerate connectors should be installed: + + ```bash + pip install onetl[hdfs,ftp,kerberos] # except DB connections + ``` + + **Details** + + In onetl<0.7 the package installation looks like: + + ```bash title="before" + + pip install onetl + ``` + + But this includes all dependencies for all connectors, even if user does not use them. + This caused some issues, for example user had to install Kerberos libraries to be able to install onETL, even if user uses only S3 (without Kerberos support). 
+ + Since 0.7.0 installation process was changed: + + ``` bash title="after" + + pip install onetl # minimal installation, only onETL core + # there is no extras for DB connections because they are using Java packages which are installed in runtime + + pip install onetl[ftp,ftps,hdfs,sftp,s3,webdav] # install dependencies for specified file connections + pip install onetl[files] # install dependencies for all file connections + + pip install onetl[kerberos] # Kerberos auth support + pip install onetl[spark] # install PySpark to use DB connections + + pip install onetl[spark,kerberos,files] # all file connections + Kerberos + PySpark + pip install onetl[all] # alias for previous case + ``` + + There are corresponding documentation items for each extras. + + Also onETL checks that some requirements are missing, and raises exception with recommendation how to install them: + + ``` text title="exception while import Clickhouse connection" + + Cannot import module "pyspark". + + Since onETL v0.7.0 you should install package as follows: + pip install onetl[spark] + + or inject PySpark to sys.path in some other way BEFORE creating MongoDB instance. + ``` + + ``` text title="exception while import FTP connection" + + Cannot import module "ftputil". + + Since onETL v0.7.0 you should install package as follows: + pip install onetl[ftp] + + or + pip install onetl[files] + ``` + +- Added new `cluster` argument to `Hive` and `HDFS` connections. + + `Hive` qualified name (used in HWM) contains cluster name. But in onETL<0.7.0 cluster name had hard coded value `rnd-dwh` which was not OK for some users. + + `HDFS` connection qualified name contains host (active namenode of Hadoop cluster), but its value can change over time, leading to creating of new HWM. + + Since onETL 0.7.0 both `Hive` and `HDFS` connections have `cluster` attribute which can be set to a specific cluster name. + For `Hive` it is mandatory, for `HDFS` it can be omitted (using host as a fallback). 
+ + But passing cluster name every time could lead to errors. + + Now `Hive` and `HDFS` have nested class named `slots` with methods: + + - `normalize_cluster_name` + - `get_known_clusters` + - `get_current_cluster` + - `normalize_namenode_host` (only `HDFS`) + - `get_cluster_namenodes` (only `HDFS`) + - `get_webhdfs_port` (only `HDFS`) + - `is_namenode_active` (only `HDFS`) + + And new method `HDFS.get_current` / `Hive.get_current`. + + Developers can implement hooks validating user input or substituting values for automatic cluster detection. + This should improve user experience while using these connectors. + + See slots documentation. + +- Update JDBC connection drivers. + + - Greenplum `2.1.3` → `2.1.4`. + - MSSQL `10.2.1.jre8` → `12.2.0.jre8`. Minimal supported version of MSSQL is now 2014 instead 2021. + - MySQL `8.0.30` → `8.0.33`: + - Package was renamed `mysql:mysql-connector-java` → `com.mysql:mysql-connector-j`. + - Driver class was renamed `com.mysql.jdbc.Driver` → `com.mysql.cj.jdbc.Driver`. + - Oracle `21.6.0.0.1` → `23.2.0.0`. + - Postgres `42.4.0` → `42.6.0`. + - Teradata `17.20.00.08` → `17.20.00.15`: + - Package was renamed `com.teradata.jdbc:terajdbc4` → `com.teradata.jdbc:terajdbc`. + - Teradata driver is now published to Maven. + + See [#31](https://github.com/MobileTeleSystems/onetl/pull/31). + +## Features + +- Added MongoDB connection. + + Using official [MongoDB connector for Spark v10](https://www.mongodb.com/docs/spark-connector/current/). Only Spark 3.2+ is supported. + + There are some differences between MongoDB and other database sources: + + - Instead of `mongodb.sql` method there is `mongodb.pipeline`. + - No methods `mongodb.fetch` and `mongodb.execute`. 
+ - `DBReader.hint` and `DBReader.where` have different types than in SQL databases: + + ```python + where = { + "col1": { + "$eq": 10, + }, + } + + hint = { + "col1": 1, + } + ``` + + - Because MongoDB does not have schemas of collections, but Spark cannot create dataframe with dynamic schema, new option `DBReader.df_schema` was introduced. + It is mandatory for MongoDB, but optional for other sources. + - Currently DBReader cannot be used with MongoDB and hwm expression, e.g. `hwm_column=("mycolumn", {"$cast": {"col1": "date"}})` + + Because there are no tables in MongoDB, some options were renamed in core classes: + + - `DBReader(table=...)` → `DBReader(source=...)` + - `DBWriter(table=...)` → `DBWriter(target=...)` + + Old names can be used too, they are not deprecated ([#30](https://github.com/MobileTeleSystems/onetl/pull/30)). + +- Added option for disabling some plugins during import. + + Previously if some plugin were failing during the import, the only way to import onETL would be to disable all plugins + using environment variable. + + Now there are several variables with different behavior: + + - `ONETL_PLUGINS_ENABLED=false` - disable all plugins autoimport. Previously it was named `ONETL_ENABLE_PLUGINS`. + - `ONETL_PLUGINS_BLACKLIST=plugin-name,another-plugin` - set list of plugins which should NOT be imported automatically. + - `ONETL_PLUGINS_WHITELIST=plugin-name,another-plugin` - set list of plugins which should ONLY be imported automatically. + + Also we improved exception message with recommendation how to disable a failing plugin: + + ``` text title="exception message example" + + Error while importing plugin 'mtspark' from package 'mtspark' v4.0.0. + + Statement: + import mtspark.onetl + + Check if plugin is compatible with current onETL version 0.7.0. 
+ + You can disable loading this plugin by setting environment variable: + ONETL_PLUGINS_BLACKLIST='mtspark,failing-plugin' + + You can also define a whitelist of packages which can be loaded by onETL: + ONETL_PLUGINS_WHITELIST='not-failing-plugin1,not-failing-plugin2' + + Please take into account that plugin name may differ from package or module name. + See package metadata for more details + ``` + +## Improvements + +- Added compatibility with Python 3.11 and PySpark 3.4.0. + + File connections were OK, but `jdbc.fetch` and `jdbc.execute` were failing. Fixed in [#28](https://github.com/MobileTeleSystems/onetl/pull/28). + +- Added check for missing Java packages. + + Previously if DB connection tried to use some Java class which were not loaded into Spark version, it raised an exception + with long Java stacktrace. Most users failed to interpret this trace. + + Now onETL shows the following error message: + + ``` text title="exception message example" + + |Spark| Cannot import Java class 'com.mongodb.spark.sql.connector.MongoTableProvider'. + + It looks like you've created Spark session without this option: + SparkSession.builder.config("spark.jars.packages", MongoDB.package_spark_3_2) + + Please call `spark.stop()`, restart the interpreter, + and then create new SparkSession with proper options. + ``` + +- Documentation improvements. + + - Changed documentation site theme - using [furo](https://github.com/pradyunsg/furo) + instead of default [ReadTheDocs](https://github.com/readthedocs/sphinx_rtd_theme). + + New theme supports wide screens and dark mode. + See [#10](https://github.com/MobileTeleSystems/onetl/pull/10). + + - Now each connection class have compatibility table for Spark + Java + Python. + + - Added global compatibility table for Spark + Java + Python + Scala. + +## Bug Fixes + +- Fixed several SFTP issues. 
+ + - If SSH config file `~/.ssh/config` contains some options not recognized by Paramiko (unknown syntax, unknown option name), + previous versions were raising an exception until this file was fixed or removed. Since 0.7.0 the exception is replaced with a warning. + + - If user passed `host_key_check=False` but server changed SSH keys, previous versions raised an exception until the new key was accepted. + Since 0.7.0 the exception is replaced with a warning if option value is `False`. + + Fixed in [#19](https://github.com/MobileTeleSystems/onetl/pull/19). + +- Fixed several S3 issues. + + There was a bug in S3 connection which prevented handling files in the root of a bucket - they were invisible to the connector. Fixed in [#29](https://github.com/MobileTeleSystems/onetl/pull/29). diff --git a/mkdocs/docs/en/changelog/0.7.1.md b/mkdocs/docs/en/changelog/0.7.1.md new file mode 100644 index 000000000..9eac47a1a --- /dev/null +++ b/mkdocs/docs/en/changelog/0.7.1.md @@ -0,0 +1,40 @@ +# 0.7.1 (2023-05-23) + +## Bug Fixes + +- Fixed `setup_logging` function. + + In onETL==0.7.0 calling `onetl.log.setup_logging()` broke the logging: + + ``` text title="exception message" + + Traceback (most recent call last): + File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 434, in format + return self._format(record) + File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 430, in _format + return self._fmt % record.__dict__ + KeyError: 'levelname:8s' + ``` + +- Fixed installation examples. + + In onETL==0.7.0 there were examples of installing onETL with extras: + + ``` bash title="before" + + pip install onetl[files, kerberos, spark] + ``` + + But pip fails to install such a package: + + ``` text title="exception" + + ERROR: Invalid requirement: 'onetl[files,' + ``` + + This is because of spaces in the extras clause. 
Fixed: + + ``` bash title="after" + + pip install onetl[files,kerberos,spark] + ``` diff --git a/mkdocs/docs/en/changelog/0.7.2.md b/mkdocs/docs/en/changelog/0.7.2.md new file mode 100644 index 000000000..505b03725 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.7.2.md @@ -0,0 +1,37 @@ +# 0.7.2 (2023-05-24) + +## Dependencies + +- Limited `typing-extensions` version. + + `typing-extensions==4.6.0` release contains some breaking changes causing errors like: + + ``` text title="typing-extensions 4.6.0" + + Traceback (most recent call last): + File "/Users/project/lib/python3.9/typing.py", line 852, in __subclasscheck__ + return issubclass(cls, self.__origin__) + TypeError: issubclass() arg 1 must be a class + ``` + + `typing-extensions==4.6.1` was causing another error: + + ``` text title="typing-extensions 4.6.1" + + Traceback (most recent call last): + File "/home/maxim/Repo/typing_extensions/1.py", line 33, in + isinstance(file, ContainsException) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 599, in __instancecheck__ + if super().__instancecheck__(instance): + File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 139, in __instancecheck__ + return _abc_instancecheck(cls, instance) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 583, in __subclasscheck__ + return super().__subclasscheck__(other) + File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 143, in __subclasscheck__ + return _abc_subclasscheck(cls, subclass) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 661, in _proto_hook + and other._is_protocol + AttributeError: type object 'PathWithFailure' has no attribute '_is_protocol' + ``` + + We updated requirements with `typing-extensions<4.6` until fixing compatibility issues. 
diff --git a/mkdocs/docs/en/changelog/0.8.0.md b/mkdocs/docs/en/changelog/0.8.0.md new file mode 100644 index 000000000..4fa395932 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.8.0.md @@ -0,0 +1,162 @@ +# 0.8.0 (2023-05-31) + +## Breaking Changes + +- Rename methods of `FileConnection` classes: + + - `get_directory` → `resolve_dir` + - `get_file` → `resolve_file` + - `listdir` → `list_dir` + - `mkdir` → `create_dir` + - `rmdir` → `remove_dir` + + New naming should be more consistent. + + They were undocumented in previous versions, but someone could use these methods, so this is a breaking change. ([#36](https://github.com/MobileTeleSystems/onetl/pull/36)) + +- Deprecate `onetl.core.FileFilter` class, replace it with new classes: + + - `onetl.file.filter.Glob` + - `onetl.file.filter.Regexp` + - `onetl.file.filter.ExcludeDir` + + Old class will be removed in v1.0.0. ([#43](https://github.com/MobileTeleSystems/onetl/pull/43)) + +- Deprecate `onetl.core.FileLimit` class, replace it with new class `onetl.file.limit.MaxFilesCount`. + + Old class will be removed in v1.0.0. ([#44](https://github.com/MobileTeleSystems/onetl/pull/44)) + +- Change behavior of `BaseFileLimit.reset` method. + + This method should now return `self` instead of `None`. + Return value could be the same limit object or a copy, this is an implementation detail. ([#44](https://github.com/MobileTeleSystems/onetl/pull/44)) + +- Replaced `FileDownloader.filter` and `.limit` with new options `.filters` and `.limits`: + + ``` python title="onETL < 0.8.0" + FileDownloader( + ..., + filter=FileFilter(glob="*.txt", exclude_dir="/path"), + limit=FileLimit(count_limit=10), + ) + ``` + + ``` python title="onETL >= 0.8.0" + FileDownloader( + ..., + filters=[Glob("*.txt"), ExcludeDir("/path")], + limits=[MaxFilesCount(10)], + ) + ``` + + This allows to developers to implement their own filter and limit classes, and combine them with existing ones. 
+ + Old behavior still supported, but it will be removed in v1.0.0. ([#45](https://github.com/MobileTeleSystems/onetl/pull/45)) + +- Removed default value for `FileDownloader.limits`, user should pass limits list explicitly. ([#45](https://github.com/MobileTeleSystems/onetl/pull/45)) + +- Move classes from module `onetl.core`: + + ``` python title="before" + from onetl.core import DBReader + from onetl.core import DBWriter + from onetl.core import FileDownloader + from onetl.core import FileUploader + ``` + + with new modules `onetl.db` and `onetl.file`: + + ``` python title="after" + from onetl.db import DBReader + from onetl.db import DBWriter + + from onetl.file import FileDownloader + from onetl.file import FileUploader + ``` + + Imports from old module `onetl.core` still can be used, but marked as deprecated. Module will be removed in v1.0.0. ([#46](https://github.com/MobileTeleSystems/onetl/pull/46)) + +## Features + +- Add `rename_dir` method. + + Method was added to following connections: + + - `FTP` + - `FTPS` + - `HDFS` + - `SFTP` + - `WebDAV` + + It allows to rename/move directory to new path with all its content. + + `S3` does not have directories, so there is no such method in that class. ([#40](https://github.com/MobileTeleSystems/onetl/pull/40)) + +- Add `onetl.file.FileMover` class. + + It allows to move files between directories of remote file system. + Signature is almost the same as in `FileDownloader`, but without HWM support. 
([#42](https://github.com/MobileTeleSystems/onetl/pull/42)) + +## Improvements + +- Document all public methods in `FileConnection` classes: + + - `download_file` + - `resolve_dir` + - `resolve_file` + - `get_stat` + - `is_dir` + - `is_file` + - `list_dir` + - `create_dir` + - `path_exists` + - `remove_file` + - `rename_file` + - `remove_dir` + - `upload_file` + - `walk` ([#39](https://github.com/MobileTeleSystems/onetl/pull/39)) + +- Update documentation of `check` method of all connections - add usage example and document result type. ([#39](https://github.com/MobileTeleSystems/onetl/pull/39)) + +- Add new exception type `FileSizeMismatchError`. + + Methods `connection.download_file` and `connection.upload_file` now raise new exception type instead of `RuntimeError`, + if target file after download/upload has different size than source. ([#39](https://github.com/MobileTeleSystems/onetl/pull/39)) + +- Add new exception type `DirectoryExistsError` - it is raised if target directory already exists. ([#40](https://github.com/MobileTeleSystems/onetl/pull/40)) + +- Improved `FileDownloader` / `FileUploader` exception logging. + + If `DEBUG` logging is enabled, print exception with stacktrace instead of + printing only exception message. ([#42](https://github.com/MobileTeleSystems/onetl/pull/42)) + +- Updated documentation of `FileUploader`. + + - Class does not support read strategies, added note to documentation. + - Added examples of using `run` method with explicit files list passing, both absolute and relative paths. + - Fix outdated imports and class names in examples. ([#42](https://github.com/MobileTeleSystems/onetl/pull/42)) + +- Updated documentation of `DownloadResult` class - fix outdated imports and class names. ([#42](https://github.com/MobileTeleSystems/onetl/pull/42)) + +- Improved file filters documentation section. + + Document interface class `onetl.base.BaseFileFilter` and function `match_all_filters`. 
([#43](https://github.com/MobileTeleSystems/onetl/pull/43)) + +- Improved file limits documentation section. + + Document interface class `onetl.base.BaseFileLimit` and functions `limits_stop_at` / `limits_reached` / `reset_limits`. ([#44](https://github.com/MobileTeleSystems/onetl/pull/44)) + +- Added changelog. + + Changelog is generated from separated news files using [towncrier](https://pypi.org/project/towncrier/). ([#47](https://github.com/MobileTeleSystems/onetl/pull/47)) + +## Misc + +- Improved CI workflow for tests. + + - If a developer hasn't changed the source code of a specific connector or its dependencies, + run tests only against maximum supported versions of Spark, Python, Java and db/file server. + - If a developer made some changes in a specific connector, or in core classes, or in dependencies, + run tests for both minimal and maximum versions. + - Once a week run all tests against minimal and latest versions to detect breaking changes in dependencies + - Minimal tested Spark version is 2.3.1 instead of 2.4.8. ([#32](https://github.com/MobileTeleSystems/onetl/pull/32)) diff --git a/mkdocs/docs/en/changelog/0.8.1.md b/mkdocs/docs/en/changelog/0.8.1.md new file mode 100644 index 000000000..aaf777091 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.8.1.md @@ -0,0 +1,42 @@ +# 0.8.1 (2023-07-10) + +## Features + +- Add `@slot` decorator to public methods of: + + - `DBConnection` + - `FileConnection` + - `DBReader` + - `DBWriter` + - `FileDownloader` + - `FileUploader` + - `FileMover` ([#49](https://github.com/MobileTeleSystems/onetl/pull/49)) + +- Add `workers` field to `FileDownloader` / `FileUploader` / `FileMover` `Options` classes. + + This allows speeding up all file operations using parallel threads. ([#57](https://github.com/MobileTeleSystems/onetl/pull/57)) + +## Improvements + +- Add documentation for HWM store `.get` and `.save` methods. 
([#49](https://github.com/MobileTeleSystems/onetl/pull/49)) + +- Improve Readme: + + - Move `Quick start` section from documentation + - Add `Non-goals` section + - Fix code blocks indentation ([#50](https://github.com/MobileTeleSystems/onetl/pull/50)) + +- Improve Contributing guide: + + - Move `Develop` section from Readme + - Move `docs/changelog/README.rst` content + - Add `Limitations` section + - Add instruction of creating a fork and building documentation ([#50](https://github.com/MobileTeleSystems/onetl/pull/50)) + +- Remove duplicated checks for source file existence in `FileDownloader` / `FileMover`. ([#57](https://github.com/MobileTeleSystems/onetl/pull/57)) + +- Update default logging format to include thread name. ([#57](https://github.com/MobileTeleSystems/onetl/pull/57)) + +## Bug Fixes + +- Fix `S3.list_dir('/')` returns empty list on latest Minio version. ([#58](https://github.com/MobileTeleSystems/onetl/pull/58)) diff --git a/mkdocs/docs/en/changelog/0.9.0.md b/mkdocs/docs/en/changelog/0.9.0.md new file mode 100644 index 000000000..8aee9a3eb --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.0.md @@ -0,0 +1,122 @@ +# 0.9.0 (2023-08-17) + +## Breaking Changes + +- Rename methods: + + - `DBConnection.read_df` → `DBConnection.read_source_as_df` + - `DBConnection.write_df` → `DBConnection.write_df_to_target` ([#66](https://github.com/MobileTeleSystems/onetl/pull/66)) + +- Rename classes: + + - `HDFS.slots` → `HDFS.Slots` + - `Hive.slots` → `Hive.Slots` + + Old names are left intact, but will be removed in v1.0.0 ([#103](https://github.com/MobileTeleSystems/onetl/pull/103)) + +- Rename options to make them self-explanatory: + + - `Hive.WriteOptions(mode="append")` → `Hive.WriteOptions(if_exists="append")` + - `Hive.WriteOptions(mode="overwrite_table")` → `Hive.WriteOptions(if_exists="replace_entire_table")` + - `Hive.WriteOptions(mode="overwrite_partitions")` → `Hive.WriteOptions(if_exists="replace_overlapping_partitions")` + - 
`JDBC.WriteOptions(mode="append")` → `JDBC.WriteOptions(if_exists="append")` + - `JDBC.WriteOptions(mode="overwrite")` → `JDBC.WriteOptions(if_exists="replace_entire_table")` + - `Greenplum.WriteOptions(mode="append")` → `Greenplum.WriteOptions(if_exists="append")` + - `Greenplum.WriteOptions(mode="overwrite")` → `Greenplum.WriteOptions(if_exists="replace_entire_table")` + - `MongoDB.WriteOptions(mode="append")` → `MongoDB.WriteOptions(if_exists="append")` + - `MongoDB.WriteOptions(mode="overwrite")` → `MongoDB.WriteOptions(if_exists="replace_entire_collection")` + - `FileDownloader.Options(mode="error")` → `FileDownloader.Options(if_exists="error")` + - `FileDownloader.Options(mode="ignore")` → `FileDownloader.Options(if_exists="ignore")` + - `FileDownloader.Options(mode="overwrite")` → `FileDownloader.Options(if_exists="replace_file")` + - `FileDownloader.Options(mode="delete_all")` → `FileDownloader.Options(if_exists="replace_entire_directory")` + - `FileUploader.Options(mode="error")` → `FileUploader.Options(if_exists="error")` + - `FileUploader.Options(mode="ignore")` → `FileUploader.Options(if_exists="ignore")` + - `FileUploader.Options(mode="overwrite")` → `FileUploader.Options(if_exists="replace_file")` + - `FileUploader.Options(mode="delete_all")` → `FileUploader.Options(if_exists="replace_entire_directory")` + - `FileMover.Options(mode="error")` → `FileMover.Options(if_exists="error")` + - `FileMover.Options(mode="ignore")` → `FileMover.Options(if_exists="ignore")` + - `FileMover.Options(mode="overwrite")` → `FileMover.Options(if_exists="replace_file")` + - `FileMover.Options(mode="delete_all")` → `FileMover.Options(if_exists="replace_entire_directory")` + + Old names are left intact, but will be removed in v1.0.0 ([#108](https://github.com/MobileTeleSystems/onetl/pull/108)) + +- Rename `onetl.log.disable_clients_logging()` to `onetl.log.setup_clients_logging()`. 
([#120](https://github.com/MobileTeleSystems/onetl/pull/120)) + +## Features + +- Add new methods returning Maven packages for specific connection class: + + - `Clickhouse.get_packages()` + - `MySQL.get_packages()` + - `Postgres.get_packages()` + - `Teradata.get_packages()` + - `MSSQL.get_packages(java_version="8")` + - `Oracle.get_packages(java_version="8")` + - `Greenplum.get_packages(scala_version="2.12")` + - `MongoDB.get_packages(scala_version="2.12")` + - `Kafka.get_packages(spark_version="3.4.1", scala_version="2.12")` + + Deprecate old syntax: + + - `Clickhouse.package` + - `MySQL.package` + - `Postgres.package` + - `Teradata.package` + - `MSSQL.package` + - `Oracle.package` + - `Greenplum.package_spark_2_3` + - `Greenplum.package_spark_2_4` + - `Greenplum.package_spark_3_2` + - `MongoDB.package_spark_3_2` + - `MongoDB.package_spark_3_3` + - `MongoDB.package_spark_3_4` ([#87](https://github.com/MobileTeleSystems/onetl/pull/87)) + +- Allow to set client modules log level in `onetl.log.setup_clients_logging()`. + + Allow to enable underlying client modules logging in `onetl.log.setup_logging()` by providing additional argument `enable_clients=True`. + This is useful for debug. ([#120](https://github.com/MobileTeleSystems/onetl/pull/120)) + +- Added support for reading and writing data to Kafka topics. + + For these operations, new classes were added. 
+ + - `Kafka` ([#54](https://github.com/MobileTeleSystems/onetl/pull/54), [#60](https://github.com/MobileTeleSystems/onetl/pull/60), [#72](https://github.com/MobileTeleSystems/onetl/pull/72), [#84](https://github.com/MobileTeleSystems/onetl/pull/84), [#87](https://github.com/MobileTeleSystems/onetl/pull/87), [#89](https://github.com/MobileTeleSystems/onetl/pull/89), [#93](https://github.com/MobileTeleSystems/onetl/pull/93), [#96](https://github.com/MobileTeleSystems/onetl/pull/96), [#102](https://github.com/MobileTeleSystems/onetl/pull/102), [#104](https://github.com/MobileTeleSystems/onetl/pull/104)) + - `Kafka.PlaintextProtocol` ([#79](https://github.com/MobileTeleSystems/onetl/pull/79)) + - `Kafka.SSLProtocol` ([#118](https://github.com/MobileTeleSystems/onetl/pull/118)) + - `Kafka.BasicAuth` ([#63](https://github.com/MobileTeleSystems/onetl/pull/63), [#77](https://github.com/MobileTeleSystems/onetl/pull/77)) + - `Kafka.KerberosAuth` ([#63](https://github.com/MobileTeleSystems/onetl/pull/63), [#77](https://github.com/MobileTeleSystems/onetl/pull/77), [#110](https://github.com/MobileTeleSystems/onetl/pull/110)) + - `Kafka.ScramAuth` ([#115](https://github.com/MobileTeleSystems/onetl/pull/115)) + - `Kafka.Slots` ([#109](https://github.com/MobileTeleSystems/onetl/pull/109)) + - `Kafka.ReadOptions` ([#68](https://github.com/MobileTeleSystems/onetl/pull/68)) + - `Kafka.WriteOptions` ([#68](https://github.com/MobileTeleSystems/onetl/pull/68)) + + Currently, Kafka does not support incremental read strategies, this will be implemented in future releases. + +- Added support for reading files as Spark DataFrame and saving DataFrame as Files. + + For these operations, new classes were added. 
+ + FileDFConnections: + + - `SparkHDFS` ([#98](https://github.com/MobileTeleSystems/onetl/pull/98)) + - `SparkS3` ([#94](https://github.com/MobileTeleSystems/onetl/pull/94), [#100](https://github.com/MobileTeleSystems/onetl/pull/100), [#124](https://github.com/MobileTeleSystems/onetl/pull/124)) + - `SparkLocalFS` ([#67](https://github.com/MobileTeleSystems/onetl/pull/67)) + + High-level classes: + + - `FileDFReader` ([#73](https://github.com/MobileTeleSystems/onetl/pull/73)) + - `FileDFWriter` ([#81](https://github.com/MobileTeleSystems/onetl/pull/81)) + + File formats: + + - `Avro` ([#69](https://github.com/MobileTeleSystems/onetl/pull/69)) + - `CSV` ([#92](https://github.com/MobileTeleSystems/onetl/pull/92)) + - `JSON` ([#83](https://github.com/MobileTeleSystems/onetl/pull/83)) + - `JSONLine` ([#83](https://github.com/MobileTeleSystems/onetl/pull/83)) + - `ORC` ([#86](https://github.com/MobileTeleSystems/onetl/pull/86)) + - `Parquet` ([#88](https://github.com/MobileTeleSystems/onetl/pull/88)) + +## Improvements + +- Remove redundant checks for driver availability in Greenplum and MongoDB connections. ([#67](https://github.com/MobileTeleSystems/onetl/pull/67)) +- Check of Java class availability moved from `.check()` method to connection constructor. ([#97](https://github.com/MobileTeleSystems/onetl/pull/97)) diff --git a/mkdocs/docs/en/changelog/0.9.1.md b/mkdocs/docs/en/changelog/0.9.1.md new file mode 100644 index 000000000..1779274b1 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.1.md @@ -0,0 +1,7 @@ +# 0.9.1 (2023-08-17) + +## Bug Fixes + +- Fixed bug when number of threads created by `FileDownloader` / `FileUploader` / `FileMover` was + not `min(workers, len(files))`, but `max(workers, len(files))`, leading to creating too many workers + on large file lists. 
diff --git a/mkdocs/docs/en/changelog/0.9.2.md b/mkdocs/docs/en/changelog/0.9.2.md new file mode 100644 index 000000000..9c34d16a0 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.2.md @@ -0,0 +1,23 @@ +# 0.9.2 (2023-09-06) + +## Features + +- Add `if_exists="ignore"` and `error` to `Greenplum.WriteOptions` ([#142](https://github.com/MobileTeleSystems/onetl/pull/142)) + +## Improvements + +- Improve validation messages while writing dataframe to Kafka. ([#131](https://github.com/MobileTeleSystems/onetl/pull/131)) + +- Improve documentation: + + - Add notes about reading and writing to database connections documentation + - Add notes about executing statements in JDBC and Greenplum connections + +## Bug Fixes + +- Fixed validation of `headers` column is written to Kafka with default `Kafka.WriteOptions()` - default value was `False`, + but instead of raising an exception, column value was just ignored. ([#131](https://github.com/MobileTeleSystems/onetl/pull/131)) +- Fix reading data from Oracle with `partitioningMode="range"` without explicitly set `lowerBound` / `upperBound`. ([#133](https://github.com/MobileTeleSystems/onetl/pull/133)) +- Update Kafka documentation with SSLProtocol usage. ([#136](https://github.com/MobileTeleSystems/onetl/pull/136)) +- Raise exception if someone tries to read data from Kafka topic which does not exist. ([#138](https://github.com/MobileTeleSystems/onetl/pull/138)) +- Allow to pass Kafka topics with name like `some.topic.name` to DBReader. Same for MongoDB collections. 
([#139](https://github.com/MobileTeleSystems/onetl/pull/139)) diff --git a/mkdocs/docs/en/changelog/0.9.3.md b/mkdocs/docs/en/changelog/0.9.3.md new file mode 100644 index 000000000..1a8c25d4d --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.3.md @@ -0,0 +1,5 @@ +# 0.9.3 (2023-09-06) + +## Bug Fixes + +- Fix documentation build diff --git a/mkdocs/docs/en/changelog/0.9.4.md b/mkdocs/docs/en/changelog/0.9.4.md new file mode 100644 index 000000000..a330de366 --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.4.md @@ -0,0 +1,24 @@ +# 0.9.4 (2023-09-26) + +## Features + +- Add `Excel` file format support. ([#148](https://github.com/MobileTeleSystems/onetl/pull/148)) +- Add `Samba` file connection. + It is now possible to download and upload files to Samba shared folders using `FileDownloader`/`FileUploader`. ([#150](https://github.com/MobileTeleSystems/onetl/pull/150)) +- Add `if_exists="ignore"` and `error` to `Hive.WriteOptions` ([#143](https://github.com/MobileTeleSystems/onetl/pull/143)) +- Add `if_exists="ignore"` and `error` to `JDBC.WriteOptions` ([#144](https://github.com/MobileTeleSystems/onetl/pull/144)) +- Add `if_exists="ignore"` and `error` to `MongoDB.WriteOptions` ([#145](https://github.com/MobileTeleSystems/onetl/pull/145)) + +## Improvements + +- Add documentation about different ways of passing packages to Spark session. ([#151](https://github.com/MobileTeleSystems/onetl/pull/151)) +- Drastically improve `Greenplum` documentation: + : - Added information about network ports, grants, `pg_hba.conf` and so on. + - Added interaction schemas for reading, writing and executing statements in Greenplum. + - Added recommendations about reading data from views and `JOIN` results from Greenplum. ([#154](https://github.com/MobileTeleSystems/onetl/pull/154)) +- Make `.fetch` and `.execute` methods of DB connections thread-safe. Each thread works with its own connection. 
([#156](https://github.com/MobileTeleSystems/onetl/pull/156)) +- Call `.close()` on `FileConnection` then it is removed by garbage collector. ([#156](https://github.com/MobileTeleSystems/onetl/pull/156)) + +## Bug Fixes + +- Fix issue when stopping Python interpreter calls `JDBCMixin.close()`, but it is finished with exceptions. ([#156](https://github.com/MobileTeleSystems/onetl/pull/156)) diff --git a/mkdocs/docs/en/changelog/0.9.5.md b/mkdocs/docs/en/changelog/0.9.5.md new file mode 100644 index 000000000..1d7358c0b --- /dev/null +++ b/mkdocs/docs/en/changelog/0.9.5.md @@ -0,0 +1,14 @@ +# 0.9.5 (2023-10-10) + +## Features + +- Add `XML` file format support. ([#163](https://github.com/MobileTeleSystems/onetl/pull/163)) +- Tested compatibility with Spark 3.5.0. `MongoDB` and `Excel` are not supported yet, but other packages do. ([#159](https://github.com/MobileTeleSystems/onetl/pull/159)) + +## Improvements + +- Add check to all DB and FileDF connections that Spark session is alive. ([#164](https://github.com/MobileTeleSystems/onetl/pull/164)) + +## Bug Fixes + +- Fix `Hive.check()` behavior when Hive Metastore is not available. ([#164](https://github.com/MobileTeleSystems/onetl/pull/164)) diff --git a/mkdocs/docs/en/changelog/DRAFT.md b/mkdocs/docs/en/changelog/DRAFT.md new file mode 100644 index 000000000..912b7d7f7 --- /dev/null +++ b/mkdocs/docs/en/changelog/DRAFT.md @@ -0,0 +1,3 @@ +```{eval-rst} +.. 
towncrier-draft-entries:: |release| [UNRELEASED] +``` diff --git a/mkdocs/docs/en/changelog/NEXT_RELEASE.md b/mkdocs/docs/en/changelog/NEXT_RELEASE.md new file mode 100644 index 000000000..a9831f9d1 --- /dev/null +++ b/mkdocs/docs/en/changelog/NEXT_RELEASE.md @@ -0,0 +1 @@ +% towncrier release notes start diff --git a/mkdocs/docs/en/changelog/index.md b/mkdocs/docs/en/changelog/index.md new file mode 100644 index 000000000..04704744d --- /dev/null +++ b/mkdocs/docs/en/changelog/index.md @@ -0,0 +1,29 @@ +# Changelog + +- [0.13.4](../changelog/0.13.4) +- [0.13.3](../changelog/0.13.3) +- [0.13.1](../changelog/0.13.1) +- [0.13.0](../changelog/0.13.0) +- [0.12.5](../changelog/0.12.5) +- [0.12.4](../changelog/0.12.4) +- [0.12.3](../changelog/0.12.3) +- [0.12.2](../changelog/0.12.2) +- [0.12.1](../changelog/0.12.1) +- [0.12.0](../changelog/0.12.0) +- [0.11.2](../changelog/0.11.2) +- [0.11.1](../changelog/0.11.1) +- [0.11.0](../changelog/0.11.0) +- [0.10.2](../changelog/0.10.2) +- [0.10.1](../changelog/0.10.1) +- [0.10.0](../changelog/0.10.0) +- [0.9.5](../changelog/0.9.5) +- [0.9.4](../changelog/0.9.4) +- [0.9.3](../changelog/0.9.3) +- [0.9.2](../changelog/0.9.2) +- [0.9.1](../changelog/0.9.1) +- [0.9.0](../changelog/0.9.0) +- [0.8.1](../changelog/0.8.1) +- [0.8.0](../changelog/0.8.0) +- [0.7.2](../changelog/0.7.2) +- [0.7.1](../changelog/0.7.1) +- [0.7.0](../changelog/0.7.0) diff --git a/mkdocs/docs/en/changelog/next_release/.keep b/mkdocs/docs/en/changelog/next_release/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/mkdocs/docs/en/concepts.md b/mkdocs/docs/en/concepts.md new file mode 100644 index 000000000..070ff02df --- /dev/null +++ b/mkdocs/docs/en/concepts.md @@ -0,0 +1,370 @@ +# Concepts + +Here you can find detailed documentation about each one of the onETL concepts and how to use them. 
+ +## Connection + +### Basics + +onETL is used to pull and push data into other systems, and so it has a first-class `Connection` concept for storing credentials that are used to communicate with external systems. + +A `Connection` is essentially a set of parameters, such as username, password, hostname. + +To create a connection to a specific storage type, you must use a class that matches the storage type. The class name is the same as the storage type name (`Oracle`, `MSSQL`, `SFTP`, etc): + +```python +from onetl.connection import SFTP + +sftp = SFTP( + host="sftp.test.com", + user="onetl", + password="onetl", +) +``` + +All connection types are inherited from the parent class `BaseConnection`. + +### Class diagram + +```mermaid +classDiagram + BaseConnection <|-- DBConnection + DBConnection <|-- Hive + DBConnection <|-- Greenplum + DBConnection <|-- MongoDB + DBConnection <|-- Kafka + DBConnection <|-- JDBCConnection + JDBCConnection <|-- Clickhouse + JDBCConnection <|-- MSSQL + JDBCConnection <|-- MySQL + JDBCConnection <|-- Postgres + JDBCConnection <|-- Oracle + JDBCConnection <|-- Teradata + BaseConnection <|-- FileConnection + FileConnection <|-- FTP + FileConnection <|-- FTPS + FileConnection <|-- HDFS + FileConnection <|-- WebDAV + FileConnection <|-- Samba + FileConnection <|-- SFTP + FileConnection <|-- S3 + BaseConnection <|-- FileDFConnection + FileDFConnection <|-- SparkHDFS + FileDFConnection <|-- SparkLocalFS + FileDFConnection <|-- SparkS3 +``` + +### DBConnection + +Classes inherited from `DBConnection` could be used for accessing databases. + +A `DBConnection` could be instantiated as follows: + +```python +from onetl.connection import MSSQL + +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, +) +``` + +where **spark** is the current SparkSession. +`onETL` uses `Spark` and specific Java connectors under the hood to work with databases. 
+ +For a description of other parameters, see the documentation for the [available DBConnections][db-connections]. + +### FileConnection + +Classes inherited from `FileConnection` could be used to access files stored on the different file systems/file servers + +A `FileConnection` could be instantiated as follows: + +```python +from onetl.connection import SFTP + +sftp = SFTP( + host="sftp.test.com", + user="onetl", + password="onetl", +) +``` + +For a description of other parameters, see the documentation for the [available FileConnections][file-connections-0]. + +### FileDFConnection + +Classes inherited from `FileDFConnection` could be used for accessing files as Spark DataFrames. + +A `FileDFConnection` could be instantiated as follows: + +```python +from onetl.connection import SparkHDFS + +spark_hdfs = SparkHDFS( + host="namenode1.domain.com", + cluster="mycluster", + spark=spark, +) +``` + +where **spark** is the current SparkSession. +`onETL` uses `Spark` and specific Java connectors under the hood to work with DataFrames. + +For a description of other parameters, see the documentation for the [available FileDFConnections][file-dataframe-connections]. + +### Checking connection availability + +Once you have created a connection, you can check the database/filesystem availability using the method `check()`: + +```python +mssql.check() +sftp.check() +spark_hdfs.check() +``` + +It will raise an exception if database/filesystem cannot be accessed. + +This method returns connection itself, so you can create connection and immediately check its availability: + +```Python +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, +).check() # <-- +``` + +## Extract/Load data + +### Basics + +As we said above, onETL is used to extract data from and load data into remote systems. 
+ +onETL provides several classes for this: + +* [DBReader][db-reader] +* [DBWriter][db-writer] +* [FileDFReader][filedf-reader-0] +* [FileDFWriter][filedf-writer-0] +* [FileDownloader][file-downloader-0] +* [FileUploader][file-uploader-0] +* [FileMover][file-mover-0] + +All of these classes have a method `run()` that starts extracting/loading the data: + +```python +from onetl.db import DBReader, DBWriter + +reader = DBReader( + connection=mssql, + source="dbo.demo_table", + columns=["column_1", "column_2"], +) + +# Read data as Spark DataFrame +df = reader.run() + +db_writer = DBWriter( + connection=hive, + target="dl_sb.demo_table", +) + +# Save Spark DataFrame to Hive table +writer.run(df) +``` + +### Extract data + +To extract data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| -- | - | - | - | --- | +| [`DBReader`][db-reader] | Reading data from a database | Any [`DBConnection`][db-connections] | - | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | +| [`FileDFReader`][filedf-reader-0] | Read data from a file or set of files | Any [`FileDFConnection`][file-dataframe-connections] | No input, or List[File path on FileSystem] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | +| [`FileDownloader`][file-downloader-0] | Download files from remote FS to local FS | Any [`FileConnection`][file-connections-0] | No input, or List[File path on remote FileSystem] | [`DownloadResult`][file-downloader-result] | + +### Load data + +To load data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| - | -- | - | --- | -- | +| [`DBWriter`][db-writer] | Writing data from a DataFrame to a database | Any [`DBConnection`][db-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None | +| 
[`FileDFWriter`][filedf-writer-0] | Writing data from a DataFrame to a folder | Any [`FileDFConnection`][file-dataframe-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None | +| [`FileUploader`][file-uploader-0] | Uploading files from a local FS to remote FS | Any [`FileConnection`][file-connections-0] | List[File path on local FileSystem] | [`UploadResult`][file-uploader-result] | + +### Manipulate data + +To manipulate data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| - | - | -- | -- | - | +| [`FileMover`][file-mover-0] | Move files between directories in remote FS | Any [`FileConnection`][file-connections-0] | List[File path on remote FileSystem] | [`MoveResult`][file-mover-result] | + +### Options + +Extract and load classes have a `options` parameter, which has a special meaning: + +> - all other parameters - *WHAT* we extract / *WHERE* we load to +> - `options` parameter - *HOW* we extract/load data + +```python +db_reader = DBReader( + # WHAT do we read: + connection=mssql, + source="dbo.demo_table", # some table from MSSQL + columns=["column_1", "column_2"], # but only specific set of columns + where="column_2 > 1000", # only rows matching the clause + # HOW do we read: + options=MSSQL.ReadOptions( + numPartitions=10, # read in 10 parallel jobs + partitionColumn="id", # balance data read by assigning each job a part of data using `hash(id) mod N` expression + partitioningMode="hash", + fetchsize=1000, # each job will fetch block of 1000 rows each on every read attempt + ), +) + +db_writer = DBWriter( + # WHERE do we write to - to some table in Hive + connection=hive, + target="dl_sb.demo_table", + # HOW do we write - overwrite all the data in the existing table + options=Hive.WriteOptions(if_exists="replace_entire_table"), +) + +file_downloader = FileDownloader( + # WHAT do we download - files from some dir in SFTP + connection=sftp, 
+ source_path="/source", + filters=[Glob("*.csv")], # only CSV files + limits=[MaxFilesCount(1000)], # 1000 files max + # WHERE do we download to - a specific dir on local FS + local_path="/some", + # HOW do we download: + options=FileDownloader.Options( + delete_source=True, # after downloading each file remove it from source_path + if_exists="replace_file", # replace existing files in the local_path + ), +) + +file_uploader = FileUploader( + # WHAT do we upload - files from some local dir + local_path="/source", + # WHERE do we upload to- specific remote dir in HDFS + connection=hdfs, + target_path="/some", + # HOW do we upload: + options=FileUploader.Options( + delete_local=True, # after uploading each file remove it from local_path + if_exists="replace_file", # replace existing files in the target_path + ), +) + +file_mover = FileMover( + # WHAT do we move - files in some remote dir in HDFS + source_path="/source", + connection=hdfs, + # WHERE do we move files to + target_path="/some", # a specific remote dir within the same HDFS connection + # HOW do we load - replace existing files in the target_path + options=FileMover.Options(if_exists="replace_file"), +) + +file_df_reader = FileDFReader( + # WHAT do we read - *.csv files from some dir in S3 + connection=s3, + source_path="/source", + file_format=CSV(), + # HOW do we read - load files from /source/*.csv, not from /source/nested/*.csv + options=FileDFReader.Options(recursive=False), +) + +file_df_writer = FileDFWriter( + # WHERE do we write to - as .csv files in some dir in S3 + connection=s3, + target_path="/target", + file_format=CSV(), + # HOW do we write - replace all existing files in /target, if exists + options=FileDFWriter.Options(if_exists="replace_entire_directory"), +) +``` + +More information about `options` could be found on [`DBConnection`][db-connections] and [`FileDownloader`][file-downloader-0] / [`FileUploader`][file-uploader-0] / [`FileMover`][file-mover-0] / 
[`FileDFReader`][filedf-reader-0] / [`FileDFWriter`][filedf-writer-0] documentation.
+
+
+### Read Strategies
+
+onETL has several built-in strategies for reading data:
+
+1. [Snapshot strategy][snapshot-strategy] (default strategy)
+2. [Incremental strategy][incremental-strategy]
+3. [Snapshot batch strategy][snapshot-batch-strategy]
+4. [Incremental batch strategy][incremental-batch-strategy]
+
+For example, an incremental strategy allows you to get only new data from the table:
+
+```python
+from onetl.strategy import IncrementalStrategy
+
+reader = DBReader(
+    connection=mssql,
+    source="dbo.demo_table",
+    hwm_column="id", # detect new data based on value of "id" column
+)
+
+# first run
+with IncrementalStrategy():
+    df = reader.run()
+
+sleep(3600)
+
+# second run
+with IncrementalStrategy():
+    # only rows, that appeared in the source since previous run
+    df = reader.run()
+```
+
+or get only files which were not downloaded before:
+
+```python
+from onetl.strategy import IncrementalStrategy
+
+file_downloader = FileDownloader(
+    connection=sftp,
+    source_path="/remote",
+    local_path="/local",
+    hwm_type="file_list", # save all downloaded files to a list, and exclude files already present in this list
+)
+
+# first run
+with IncrementalStrategy():
+    files = file_downloader.run()
+
+sleep(3600)
+
+# second run
+with IncrementalStrategy():
+    # only files, that appeared in the source since previous run
+    files = file_downloader.run()
+```
+
+Most strategies are based on [`HWM`][hwm]. Please check each strategy's documentation for more details.
+
+### Why not just use the Connection class for extract/load?
+
+Connections are very simple: they have only a set of some basic operations,
+like `mkdir`, `remove_file`, `get_table_schema`, and so on. 
+ +High-level operations, like + + * [`strategy`][strategy] support + * Handling metadata push/pull + * Handling different options, like `if_exists="replace_file"` in case of file download/upload + +is moved to a separate class which calls the connection object methods to perform some complex logic. diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/connection.md b/mkdocs/docs/en/connection/db_connection/clickhouse/connection.md new file mode 100644 index 000000000..06fa5a462 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/connection.md @@ -0,0 +1,12 @@ +(clickhouse-connection)= + +# Clickhouse connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +``` + +```{eval-rst} +.. autoclass:: Clickhouse + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/execute.md b/mkdocs/docs/en/connection/db_connection/clickhouse/execute.md new file mode 100644 index 000000000..501fed530 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/execute.md @@ -0,0 +1,125 @@ +(clickhouse-execute)= + +# Executing statements in Clickhouse + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Clickhouse.sql ` instead. +``` + +## How to + +There are 2 ways to execute some statement in Clickhouse + +### Use `Clickhouse.fetch` + +Use this method to perform some `SELECT` query which returns **small number or rows**, like reading +Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts {obj}`Clickhouse.FetchOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`clickhouse-types`. 
+``` + +#### Syntax support + +This method supports **any** query syntax supported by Clickhouse, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` - call function +- ✅︎ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) + +df = clickhouse.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Clickhouse.FetchOptions(queryTimeout=10), +) +clickhouse.close() +value = df.collect()[0][0] # get value from first row and first column +``` + +### Use `Clickhouse.execute` + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts {obj}`Clickhouse.ExecuteOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support + +This method supports **any** query syntax supported by Clickhouse, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) + +clickhouse.execute("DROP TABLE schema.table") +clickhouse.execute( + """ + CREATE TABLE schema.table ( + id UInt8, + key String, + value Float32 + ) + ENGINE = MergeTree() + ORDER BY id + """, + options=Clickhouse.ExecuteOptions(queryTimeout=10), +) +``` + +## Notes + +These methods **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + +So it should **NOT** be used to read large amounts of data. 
Use {ref}`DBReader ` or {ref}`Clickhouse.sql ` instead. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.clickhouse.options +``` + +```{eval-rst} +.. autopydantic_model:: ClickhouseFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + +``` + +```{eval-rst} +.. autopydantic_model:: ClickhouseExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/index.md b/mkdocs/docs/en/connection/db_connection/clickhouse/index.md new file mode 100644 index 000000000..28c20dec1 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/index.md @@ -0,0 +1,28 @@ +(clickhouse)= + +# Clickhouse + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/prerequisites.md b/mkdocs/docs/en/connection/db_connection/clickhouse/prerequisites.md new file mode 100644 index 000000000..563dbad3f --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/prerequisites.md @@ -0,0 +1,73 @@ +(clickhouse-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Clickhouse server versions: + : - Officially declared: 22.8 or higher + - Actually tested: 21.1, 25.1 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://clickhouse.com/docs/en/integrations/java#jdbc-driver). + +## Installing PySpark + +To use Clickhouse connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. 
+
+## Connecting to Clickhouse
+
+### Connection port
+
+Connector can only use **HTTP** (usually `8123` port) or **HTTPS** (usually `8443` port) protocol.
+
+TCP and GRPC protocols are NOT supported.
+
+### Connecting to cluster
+
+It is possible to connect to Clickhouse cluster, and use its load balancing capabilities to read or write data in parallel.
+Each Spark executor can connect to random Clickhouse nodes, instead of sending all the data to a node specified in connection params.
+
+This requires all Clickhouse servers to run on different hosts, and **listen on the same HTTP port**.
+Set `auto_discovery=True` to enable this feature (disabled by default):
+
+```python
+Clickhouse(
+    host="node1.of.cluster",
+    port=8123,
+    extra={
+        "auto_discovery": True,
+        "load_balancing_policy": "roundRobin",
+    },
+)
+```
+
+See [official documentation](https://clickhouse.com/docs/en/integrations/java#configuring-node-discovery-load-balancing-and-failover).
+
+### Required grants
+
+Ask your Clickhouse cluster administrator to set the following grants for a user,
+used for creating a connection:
+
+```{eval-rst}
+.. tabs::
+
+    .. code-tab:: sql Read + Write
+
+        -- allow creating tables in the target schema
+        GRANT CREATE TABLE ON myschema.* TO username;
+
+        -- allow read & write access to specific table
+        GRANT SELECT, INSERT ON myschema.mytable TO username;
+
+    .. code-tab:: sql Read only
+
+        -- allow read access to specific table
+        GRANT SELECT ON myschema.mytable TO username;
+```
+
+More details can be found in [official documentation](https://clickhouse.com/docs/en/sql-reference/statements/grant). 
diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/read.md b/mkdocs/docs/en/connection/db_connection/clickhouse/read.md new file mode 100644 index 000000000..2c62132ed --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/read.md @@ -0,0 +1,93 @@ +(clickhouse-read)= + +# Reading from Clickhouse using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`clickhouse-types` +``` + +## Supported DBReader features + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- ❌ `hint` (is not supported by Clickhouse) +- ❌ `df_schema` +- ✅︎ `options` (see {obj}`Clickhouse.ReadOptions `) + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import Clickhouse +from onetl.db import DBReader + +clickhouse = Clickhouse(...) + +reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Clickhouse +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +clickhouse = Clickhouse(...) 
+
+reader = DBReader(
+    connection=clickhouse,
+    source="schema.table",
+    columns=["id", "key", "CAST(value AS String) value", "updated_dt"],
+    where="key = 'something'",
+    hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"),
+    options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10),
+)
+
+with IncrementalStrategy():
+    df = reader.run()
+```
+
+## Recommendations
+
+### Select only required columns
+
+Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Clickhouse to Spark.
+
+### Pay attention to `where` value
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause.
+This both reduces the amount of data sent from Clickhouse to Spark, and may also improve performance of the query.
+Especially if there are indexes or partitions for columns used in `where` clause.
+
+## Options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.clickhouse.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: ClickhouseReadOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+    :model-show-field-summary: false
+    :field-show-constraints: false
+```
diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/sql.md b/mkdocs/docs/en/connection/db_connection/clickhouse/sql.md
new file mode 100644
index 000000000..bbe62adaa
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/clickhouse/sql.md
@@ -0,0 +1,80 @@
+(clickhouse-sql)=
+
+# Reading from Clickhouse using `Clickhouse.sql`
+
+`Clickhouse.sql` allows passing a custom SQL query, but does not support incremental strategies.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`clickhouse-types`
+```
+
+```{eval-rst}
+.. 
warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. +``` + +## Syntax support + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples + +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) +df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS String) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` + +## Recommendations + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Clickhouse to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.clickhouse.options +``` + +```{eval-rst} +.. 
autopydantic_model:: ClickhouseSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/types.md b/mkdocs/docs/en/connection/db_connection/clickhouse/types.md new file mode 100644 index 000000000..91ba16856 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/types.md @@ -0,0 +1,457 @@ +(clickhouse-types)= + +# Clickhouse \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. +``` + +```{eval-rst} +.. note:: + + It is recommended to use `spark-dialect-extension `_ package, + which implements writing Arrays from Spark to Clickhouse, fixes dropping fractions of seconds in `TimestampType`, + and fixes other type conversion issues. +``` + +## Type detection & casting + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from Clickhouse + +This is how Clickhouse connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Clickhouse type. +- Find corresponding `Clickhouse type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing Clickhouse table + +This is how Clickhouse connector performs this: + +- Get names of columns in DataFrame. [^footnote-1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. +- **Find corresponding** `Clickhouse type (read)` → `Spark type` **combination** (see below) for each DataFrame column. 
If no combination is found, raise exception. [^footnote-2]
+- Find corresponding `Spark type` → `Clickhouse type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- If `Clickhouse type (write)` matches `Clickhouse type (read)`, no additional casts will be performed, DataFrame column will be written to Clickhouse as is.
+- If `Clickhouse type (write)` does not match `Clickhouse type (read)`, DataFrame column will be cast to target column type **on Clickhouse side**. For example, you can write a column with text data to an `Int32` column, if the column contains valid integer values within supported value range and precision.
+
+[^footnote-1]: This allows writing data to tables with `DEFAULT` columns - if DataFrame has no such column,
+    it will be populated by Clickhouse.
+
+[^footnote-2]: Yes, this is weird.
+
+### Create new table using Spark
+
+```{eval-rst}
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+```
+
+This is how Clickhouse connector performs this:
+
+- Find corresponding `Spark type` → `Clickhouse type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Generate DDL for creating table in Clickhouse, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But Spark does not have specific dialect for Clickhouse, so Generic JDBC dialect is used.
+Generic dialect is using SQL ANSI type names while creating tables in target database, not database-specific types.
+
+In some cases this may lead to using a wrong column type. For example, Spark creates a column of type `TIMESTAMP`
+which corresponds to Clickhouse type `DateTime32` (precision up to seconds)
+instead of more precise `DateTime64` (precision up to nanoseconds).
+This may lead to accidental precision loss, or sometimes data cannot be written to the created table at all.
+
+So instead of relying on Spark to create tables:
+
+```{eval-rst}
+.. dropdown:: See example
+
+    .. 
code:: python + + writer = DBWriter( + connection=clickhouse, + target="default.target_tbl", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + writer.run(df) +``` + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + clickhouse.execute( + """ + CREATE TABLE default.target_tbl ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + writer = DBWriter( + connection=clickhouse, + target="default.target_tbl", + options=Clickhouse.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +### References + +Here you can find source code with type conversions: + +- [Clickhouse -> JDBC](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L39-L176) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164) +- [JDBC -> Clickhouse](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L185-L311) + +## Supported types + +See [official documentation](https://clickhouse.com/docs/en/sql-reference/data-types) + +### Generic types + +- `LowCardinality(T)` is same as `T` +- `Nullable(T)` is same as `T`, but Spark column is inferred as `nullable=True` + +### Numeric types + +```{eval-rst} ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) 
| ++================================+===================================+===============================+===============================+ +| `Bool` | `BooleanType()` | `Bool` | `UInt64` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal` | `DecimalType(P=10, S=0)` | `Decimal(P=10, S=0)` | `Decimal(P=10, S=0)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal(P=39..76, S=0..76)` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal32(P=0..9)` | `DecimalType(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal64(S=0..18)` | `DecimalType(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Decimal128(S=0..38)` | `DecimalType(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| 
`Decimal256(S=0..76)` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Float32` | `FloatType()` | `Float32` | `Float32` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Float64` | `DoubleType()` | `Float64` | `Float64` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Int8` | `IntegerType()` | `Int32` | `Int32` | ++--------------------------------+ | | | +| `Int16` | | | | ++--------------------------------+ | | | +| `Int32` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Int64` | `LongType()` | `Int64` | `Int64` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `Int128` | unsupported [3]_ | | | ++--------------------------------+ | | | +| `Int256` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `-`` | `ByteType()` | `Int8` | `Int8` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `-`` | `ShortType()` | `Int32` | `Int32` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `UInt8` | `IntegerType()` | `Int32` | `Int32` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `UInt16` | `LongType()` | `Int64` | `Int64` | 
++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `UInt32` | `DecimalType(20,0)` | `Decimal(20,0)` | `Decimal(20,0)` | ++--------------------------------+ | | | +| `UInt64` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `UInt128` | unsupported [3]_ | | | ++--------------------------------+ | | | +| `UInt256` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +``` + +[^footnote-3]: Clickhouse support numeric types up to 256 bit - `Int256`, `UInt256`, `Decimal256(S)`, `Decimal(P=39..76, S=0..76)`. + + But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +### Temporal types + +Notes: +: - Datetime with timezone has the same precision as without timezone + - `DateTime` is alias for `DateTime32` + - `TIMESTAMP` is alias for `DateTime32`, but `TIMESTAMP(N)` is alias for `DateTime64(N)` + +```{eval-rst} ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | ++===================================+======================================+==================================+===============================+ +| `Date` | `DateType()` | `Date` | `Date` | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `Date32` | `DateType()` | `Date` | `Date`, | +| | | | **cannot insert data** [4]_ | 
++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `DateTime32`, seconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `DateTime64(3)`, milliseconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds, | +| | | | **precision loss** [5]_ | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `DateTime64(6)`, microseconds | `TimestampType()`, microseconds | | `DateTime32`, seconds, | ++-----------------------------------+--------------------------------------+ | **precision loss** [7]_ | +| `DateTime64(7..9)`, nanoseconds | `TimestampType()`, microseconds, | | | +| | **precision loss** [6]_ | | | +| | | | | ++-----------------------------------+--------------------------------------+ | | +| `-`` | `TimestampNTZType()`, microseconds | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `DateTime32(TZ)` | unsupported [7]_ | | | ++-----------------------------------+ | | | +| `DateTime64(P, TZ)` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| `IntervalNanosecond` | `LongType()` | `Int64` | `Int64` | ++-----------------------------------+ | | | +| `IntervalMicrosecond` | | | | ++-----------------------------------+ | | | +| `IntervalMillisecond` | | | | ++-----------------------------------+ | | | +| `IntervalSecond` | | | | ++-----------------------------------+ | | | +| `IntervalMinute` | | | | ++-----------------------------------+ | | | +| 
`IntervalHour` | | | | ++-----------------------------------+ | | | +| `IntervalDay` | | | | ++-----------------------------------+ | | | +| `IntervalMonth` | | | | ++-----------------------------------+ | | | +| `IntervalQuarter` | | | | ++-----------------------------------+ | | | +| `IntervalWeek` | | | | ++-----------------------------------+ | | | +| `IntervalYear` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +``` + +```{eval-rst} +.. warning:: + + Note that types in Clickhouse and Spark have different value ranges: + + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Clickhouse type | Min value | Max value | Spark type | Min value | Max value | + +========================+===================================+===================================+=====================+================================+================================+ + | `Date` | `1970-01-01` | `2149-06-06` | `DateType()` | `0001-01-01` | `9999-12-31` | + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | `DateTime32` | `1970-01-01 00:00:00` | `2106-02-07 06:28:15` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | `DateTime64(P=0..8)` | `1900-01-01 00:00:00.00000000` | `2299-12-31 23:59:59.99999999` | | | | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | `DateTime64(P=9)` | `1900-01-01 00:00:00.000000000` | `2262-04-11 23:47:16.999999999` | | | | + 
+------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values in Spark DataFrame can be written to Clickhouse. + + References: + * `Clickhouse Date documentation `_ + * `Clickhouse Datetime32 documentation `_ + * `Clickhouse Datetime64 documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-4]: `Date32` has different bytes representation than `Date`, and inserting value of type `Date32` to `Date` column + leads to errors on Clickhouse side, e.g. `Date(106617) should be between 0 and 65535 inclusive of both values`. + Although Spark does properly read the `Date32` column as `DateType()`, and there should be no difference at all. + Probably this is some bug in Clickhouse driver. + +[^footnote-5]: Generic JDBC dialect generates DDL with Clickhouse type `TIMESTAMP` which is alias for `DateTime32` with precision up to seconds (`23:59:59`). + Inserting data with milliseconds precision (`23:59:59.999`) will lead to **throwing away milliseconds**. + Solution: create table manually, with proper column type. + +[^footnote-6]: Clickhouse support datetime up to nanoseconds precision (`23:59:59.999999999`), + but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`). + Nanoseconds will be lost during read or write operations. + Solution: create table manually, with proper column type. + +[^footnote-7]: Clickhouse will raise an exception that data in format `2001-01-01 23:59:59.999999` has data `.999999` which does not match format `YYYY-MM-DD hh:mm:ss` + of `DateTime32` column type (see [^footnote-5]). + So Spark can create Clickhouse table, but cannot write data to column of this type. + Solution: create table manually, with proper column type. 
+
+### String types
+
+```{eval-rst}
++--------------------------------------+------------------+--------------------------+--------------------------+
+| Clickhouse type (read)               | Spark type       | Clickhouse type (write)  | Clickhouse type (create) |
++======================================+==================+==========================+==========================+
+| `FixedString(N)`                     | `StringType()`   | `String`                 | `String`                 |
++--------------------------------------+                  |                          |                          |
+| `String`                             |                  |                          |                          |
++--------------------------------------+                  |                          |                          |
+| `Enum8`                              |                  |                          |                          |
++--------------------------------------+                  |                          |                          |
+| `Enum16`                             |                  |                          |                          |
++--------------------------------------+                  |                          |                          |
+| `IPv4`                               |                  |                          |                          |
++--------------------------------------+                  |                          |                          |
+| `IPv6`                               |                  |                          |                          |
++--------------------------------------+                  |                          |                          |
+| `UUID`                               |                  |                          |                          |
++--------------------------------------+------------------+                          |                          |
+| `-`                                  | `BinaryType()`   |                          |                          |
++--------------------------------------+------------------+--------------------------+--------------------------+
+```
+
+## Unsupported types
+
+Columns of these Clickhouse types cannot be read by Spark:
+: - `AggregateFunction(func, T)`
+  - `Array(T)`
+  - `JSON`
+  - `Map(K, V)`
+  - `MultiPolygon`
+  - `Nested(field1 T1, ...)`
+  - `Nothing`
+  - `Point`
+  - `Polygon`
+  - `Ring`
+  - `SimpleAggregateFunction(func, T)`
+  - `Tuple(T1, T2, ...)`
+
+Dataframe with these Spark types cannot be written to Clickhouse:
+: - `ArrayType(T)`
+  - `BinaryType()`
+  - `CharType(N)`
+  - `DayTimeIntervalType(P, S)`
+  - `MapType(K, V)`
+  - `NullType()`
+  - `StructType([...])`
+  - `TimestampNTZType()`
+  - `VarcharType(N)`
+
+This is because Spark does not have dedicated Clickhouse dialect, and uses Generic JDBC dialect instead.
+This dialect does not have type conversion between some types, like Clickhouse `Array` -> Spark `ArrayType()`, and vice versa.
+
+There is a way to avoid this - just cast everything to `String`. 
+
+## Explicit type cast
+
+### `DBReader`
+
+Use `CAST` or `toJSONString` to get column data as string in JSON format.
+
+For parsing JSON columns in ClickHouse, use the {obj}`JSON.parse_column ` method.
+
+```python
+from pyspark.sql.types import ArrayType, IntegerType
+
+from onetl.file.format import JSON
+from onetl.connection import ClickHouse
+from onetl.db import DBReader
+
+reader = DBReader(
+    connection=clickhouse,
+    source="default.source_tbl",
+    columns=[
+        "id",
+        "toJSONString(array_column) array_column",
+    ],
+)
+df = reader.run()
+
+# Spark requires all columns to have some specific type, describe it
+column_type = ArrayType(IntegerType())
+
+json = JSON()
+df = df.select(
+    df.id,
+    json.parse_column("array_column", column_type),
+)
+```
+
+### `DBWriter`
+
+For writing JSON data to ClickHouse, use the {obj}`JSON.serialize_column ` method to convert a DataFrame column to JSON format efficiently and write it as a `String` column in Clickhouse.
+
+```python
+from onetl.file.format import JSON
+from onetl.connection import ClickHouse
+from onetl.db import DBWriter
+
+clickhouse = ClickHouse(...) 
+
+clickhouse.execute(
+    """
+    CREATE TABLE default.target_tbl (
+        id Int32,
+        array_column_json String
+    )
+    ENGINE = MergeTree()
+    ORDER BY id
+    """,
+)
+
+writer = DBWriter(
+    connection=clickhouse,
+    target="default.target_tbl",
+)
+
+json = JSON()
+df = df.select(
+    df.id,
+    json.serialize_column(df.array_column).alias("array_column_json"),
+)
+
+writer.run(df)
+```
+
+Then you can parse this column on Clickhouse side - for example, by creating a view:
+
+```sql
+SELECT
+    id,
+    JSONExtract(array_column_json, 'Array(String)') AS array_column
+FROM target_tbl
+```
+
+You can also use [ALIAS](https://clickhouse.com/docs/en/sql-reference/statements/create/table#alias)
+or [MATERIALIZED](https://clickhouse.com/docs/en/sql-reference/statements/create/table#materialized) columns
+to avoid writing such expression in every `SELECT` clause all the time:
+
+```sql
+CREATE TABLE default.target_tbl (
+    id Int32,
+    array_column_json String,
+    -- computed column
+    array_column Array(String) ALIAS JSONExtract(array_column_json, 'Array(String)')
+    -- or materialized column
+    -- array_column Array(String) MATERIALIZED JSONExtract(array_column_json, 'Array(String)')
+)
+ENGINE = MergeTree()
+ORDER BY id
+```
+
+Downsides:
+
+- Using `SELECT JSONExtract(...)` or `ALIAS` column can be expensive, because value is calculated on every row access. This can be especially harmful if such column is used in `WHERE` clause.
+- `ALIAS` and `MATERIALIZED` columns are not included in `SELECT *` clause, they should be added explicitly: `SELECT *, calculated_column FROM table`.
+
+```{eval-rst}
+.. warning::
+
+    `EPHEMERAL <https://clickhouse.com/docs/en/sql-reference/statements/create/table#ephemeral>`_ columns are not supported by Spark
+    because they cannot be selected to determine target column type. 
+``` diff --git a/mkdocs/docs/en/connection/db_connection/clickhouse/write.md b/mkdocs/docs/en/connection/db_connection/clickhouse/write.md new file mode 100644 index 000000000..fa39d81a2 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/clickhouse/write.md @@ -0,0 +1,60 @@ +(clickhouse-write)= + +# Writing to Clickhouse using `DBWriter` + +For writing data to Clickhouse, use {obj}`DBWriter `. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`clickhouse-types` +``` + +```{eval-rst} +.. warning:: + + It is always recommended to create table explicitly using :ref:`Clickhouse.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. +``` + +## Examples + +```python +from onetl.connection import Clickhouse +from onetl.db import DBWriter + +clickhouse = Clickhouse(...) + +df = ... # data is here + +writer = DBWriter( + connection=clickhouse, + target="schema.table", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), +) + +writer.run(df) +``` + +## Options + +Method above accepts {obj}`Clickhouse.WriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.clickhouse.options +``` + +```{eval-rst} +.. autopydantic_model:: ClickhouseWriteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/connection.md b/mkdocs/docs/en/connection/db_connection/greenplum/connection.md new file mode 100644 index 000000000..e95a4e6a8 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/connection.md @@ -0,0 +1,12 @@ +(greenplum-connection)= + +# Greenplum connection + +```{eval-rst} +.. 
currentmodule:: onetl.connection.db_connection.greenplum.connection +``` + +```{eval-rst} +.. autoclass:: Greenplum + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/execute.md b/mkdocs/docs/en/connection/db_connection/greenplum/execute.md new file mode 100644 index 000000000..fbd38ca81 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/execute.md @@ -0,0 +1,159 @@ +(greenplum-execute)= + +# Executing statements in Greenplum + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` instead. +``` + +## How to + +There are 2 ways to execute some statement in Greenplum + +### Use `Greenplum.fetch` + +Use this method to perform some `SELECT` query which returns **small number or rows**, like reading +Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts {obj}`Greenplum.FetchOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +```{eval-rst} +.. warning:: + + `Greenplum.fetch` is implemented using Postgres JDBC connection, + so types are handled a bit differently than in `DBReader`. See :ref:`postgres-types`. +``` + +#### Syntax support + +This method supports **any** query syntax supported by Greenplum, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Greenplum + +greenplum = Greenplum(...) 
+
+df = greenplum.fetch(
+    "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+    options=Greenplum.FetchOptions(queryTimeout=10),
+)
+greenplum.close()
+value = df.collect()[0][0]  # get value from first row and first column
+```
+
+### Use `Greenplum.execute`
+
+Use this method to execute DDL and DML operations. Each method call runs operation in a separate transaction, and then commits it.
+
+Method accepts {obj}`Greenplum.ExecuteOptions `.
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+#### Syntax support
+
+This method supports **any** query syntax supported by Greenplum, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on
+- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on
+- ✅︎ `CALL procedure(arg1, arg2) ...`
+- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions
+- ✅︎ other statements not mentioned here
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples
+
+```python
+from onetl.connection import Greenplum
+
+greenplum = Greenplum(...)
+
+greenplum.execute("DROP TABLE schema.table")
+greenplum.execute(
+    """
+    CREATE TABLE schema.table (
+        id int,
+        key text,
+        value real
+    )
+    DISTRIBUTED BY (id)
+    """,
+    options=Greenplum.ExecuteOptions(queryTimeout=10),
+)
+```
+
+## Interaction schema
+
+Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node,
+without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case.
+
+The only port used while interacting with Greenplum in this case is `5432` (Greenplum master port).
+
+```{eval-rst}
+.. dropdown:: Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch()
+
+    .. 
plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + end box + + box "Greenplum" + participant "Greenplum master" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + == Greenplum.execute(statement) == + "Spark driver" --> "Greenplum master" : EXECUTE statement + "Greenplum master" -> "Spark driver" : RETURN result + + == Greenplum.close() == + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.greenplum.options +``` + +```{eval-rst} +.. autopydantic_model:: GreenplumFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + +``` + +```{eval-rst} +.. autopydantic_model:: GreenplumExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/index.md b/mkdocs/docs/en/connection/db_connection/greenplum/index.md new file mode 100644 index 000000000..1707fc960 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/index.md @@ -0,0 +1,27 @@ +(greenplum)= + +# Greenplum + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +write +execute +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/prerequisites.md b/mkdocs/docs/en/connection/db_connection/greenplum/prerequisites.md new file mode 100644 index 000000000..8a0fc5f73 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/prerequisites.md @@ -0,0 +1,380 @@ +# Prerequisites { 
#greenplum-prerequisites }
+
+## Version Compatibility
+
+- Greenplum server versions:
+  : - Officially declared: 5.x, 6.x, and 7.x (which requires `Greenplum.get_packages(package_version="2.3.0")` or higher)
+    - Actually tested: 6.23, 7.0
+- Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet)
+- Java versions: 8 - 11
+
+See [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.2/greenplum-connector-spark/release_notes.html).
+
+## Installing PySpark
+
+To use Greenplum connector you should have PySpark installed (or injected to `sys.path`)
+BEFORE creating the connector instance.
+
+See {ref}`install-spark` installation instruction for more details.
+
+## Downloading VMware package
+
+To use Greenplum connector you should download connector `.jar` file from
+[VMware website](https://network.tanzu.vmware.com/products/vmware-greenplum#/releases/1413479/file_groups/16966)
+and then pass it to Spark session.
+
+```{eval-rst}
+.. warning::
+
+    Please pay attention to :ref:`Spark & Scala version compatibility `.
+```
+
+```{eval-rst}
+.. warning::
+
+    There are issues with using package of version 2.3.0/2.3.1 with Greenplum 6.x - connector can
+    open transaction with `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks
+    during write.
+```
+
+There are several ways to do that. See {ref}`java-packages` for details.
+
+```{eval-rst}
+.. note::
+
+    If you're uploading package to private package repo, use `groupId=io.pivotal` and `artifactId=greenplum-spark_2.12`
+    (`2.12` is Scala version) to give uploaded package a proper name.
+```
+
+## Connecting to Greenplum
+
+### Interaction schema
+
+Spark executors open ports to listen incoming requests. 
+Greenplum segments are initiating connections to Spark executors using [EXTERNAL TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html)
+functionality, and send/read data using [gpfdist protocol](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-external-g-using-the-greenplum-parallel-file-server--gpfdist-.html#about-gpfdist-setup-and-performance-1).
+
+Data is **not** sent through Greenplum master.
+Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on).
+
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/overview.html).
+
+### Set number of connections
+
+```{eval-rst}
+.. warning::
+
+    This is very important!!!
+
+    If you don't limit number of connections, you can exceed the `max_connections `_
+    limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max,
+    depending on your Greenplum instance settings and using connection balancers like `pgbouncer`.
+
+    Consuming all available connections means **nobody** (even admin users) can connect to Greenplum.
+```
+
+Each job on the Spark executor makes its own connection to Greenplum master node,
+so you need to limit number of connections to avoid opening too many of them.
+
+- Reading about `5-10Gb` of data requires about `3-5` parallel connections.
+- Reading about `20-30Gb` of data requires about `5-10` parallel connections.
+- Reading about `50Gb` of data requires ~ `10-20` parallel connections.
+- Reading about `100+Gb` of data requires `20-30` parallel connections.
+- Opening more than `30-50` connections is not recommended.
+
+Number of connections can be limited in 2 ways:
+
+- By limiting number of Spark executors and number of cores per-executor. 
Max number of parallel jobs is `executors * cores`.
+
+```{eval-rst}
+.. tabs::
+
+    .. code-tab:: py Spark with master=local
+
+        spark = (
+            SparkSession.builder
+            # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks
+            .config("spark.master", "local[5]")
+            .config("spark.executor.cores", 1)
+        ).getOrCreate()
+
+    .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation
+
+        spark = (
+            SparkSession.builder
+            .config("spark.master", "yarn")
+            # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10
+            .config("spark.dynamicAllocation.maxExecutors", 10)
+            .config("spark.executor.cores", 1)
+        ).getOrCreate()
+
+    .. code-tab:: py Spark with master=yarn or master=k8s, static allocation
+
+        spark = (
+            SparkSession.builder
+            .config("spark.master", "yarn")
+            # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10
+            .config("spark.executor.instances", 10)
+            .config("spark.executor.cores", 1)
+        ).getOrCreate()
+```
+
+- By limiting connection pool size used by Spark (**only** for Spark with `master=local`):
+
+```python
+spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate()
+
+# No matter how many executors are started and how many cores they have,
+# number of connections cannot exceed pool size:
+Greenplum(
+    ...,
+    extra={
+        "pool.maxSize": 10,
+    },
+)
+```
+
+See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool)
+documentation.
+
+- By setting {obj}`num_partitions `
+  and {obj}`partition_column ` (not recommended).
+
+### Allowing connection to Greenplum master
+
+Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node,
+e.g. by updating `pg_hba.conf` file. 
+
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#allowing-connections-to-greenplum-database-0).
+
+### Set connection port
+
+#### Spark with `master=k8s`
+
+Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg)
+
+#### Spark with `master=yarn` or `master=local`
+
+To read data from Greenplum using Spark, following ports should be opened in firewall between Spark and Greenplum:
+
+- Spark driver and all Spark executors -> port `5432` on Greenplum master node.
+
+  This port number should be set while connecting to Greenplum:
+
+  ```python
+  greenplum = Greenplum(host="master.host", port=5432, ...)
+  ```
+
+- Greenplum segments -> some port range (e.g. `41000-42000`) **listened by Spark executors**.
+
+  This range should be set in `extra` option:
+
+  ```python
+  greenplum = Greenplum(
+      ...,
+      extra={
+          "server.port": "41000-42000",
+      },
+  )
+  ```
+
+  Number of ports in this range is `number of parallel running Spark sessions` * `number of parallel connections per session`.
+
+  Number of connections per session is usually less than `30` (see above).
+
+  Number of sessions depends on your environment:
+  : - For `master=local` only a few or tens of sessions can be started on the same host, depending on available RAM and CPU.
+    - For `master=yarn` hundreds or thousands of sessions can be started simultaneously,
+      but they are executing on different cluster nodes, so one port can be opened on different nodes at the same time. 
+ +More details can be found in official documentation: +: - [port requirements](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/sys_reqs.html#network-port-requirements) + - [format of server.port value](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.port) + - [port troubleshooting](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#port-errors) + +### Set connection host + +#### Spark with `master=k8s` + +Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg) + +#### Spark with `master=local` + +By default, Greenplum connector tries to resolve IP of current host, and then pass it as `gpfdist` URL to Greenplum segment. +This may fail in some cases. + +For example, IP can be resolved using `/etc/hosts` content like this: + +```text +127.0.0.1 localhost real-host-name +``` + +```bash +$ hostname -f +localhost + +$ hostname -i +127.0.0.1 +``` + +Reading/writing data to Greenplum will fail with following exception: + +```text +org.postgresql.util.PSQLException: ERROR: connection with gpfdist failed for +"gpfdist://127.0.0.1:49152/local-1709739764667/exec/driver", +effective url: "http://127.0.0.1:49152/local-1709739764667/exec/driver": +error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=123456) +``` + +There are 2 ways to fix that: + +- Explicitly pass your host IP address to connector, like this + + ```python + import os + + # pass here real host IP (accessible from GP segments) + os.environ["HOST_IP"] = "192.168.1.1" + + greenplum = Greenplum( + ..., + extra={ + # connector will read IP from this environment variable + "server.hostEnv": "env.HOST_IP", + }, + spark=spark, + ) + ``` + + More details can be found in [official 
documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.hostenv). + +- Update `/etc/hosts` file to include real host IP: + + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 driver-host-name + ``` + + So Greenplum connector will properly resolve host IP. + +#### Spark with `master=yarn` + +The same issue with resolving IP address can occur on Hadoop cluster node, but it's tricky to fix, because each node has a different IP. + +There are 3 ways to fix that: + +- Pass node hostname to `gpfdist` URL. So IP will be resolved on segment side: + + ```python + greenplum = Greenplum( + ..., + extra={ + "server.useHostname": "true", + }, + ) + ``` + + But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. + + More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.usehostname). + +- Set specific network interface to get IP address from: + + ```python + greenplum = Greenplum( + ..., + extra={ + "server.nic": "eth0", + }, + ) + ``` + + You can get list of network interfaces using this command. + + ```{eval-rst} + .. note:: + + This command should be executed on Hadoop cluster node, **not** Spark driver host! + ``` + + ```bash + $ ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 + valid_lft 83457sec preferred_lft 83457sec + ``` + + Note that in this case **each** Hadoop cluster node node should have network interface with name `eth0`. 
+ + More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.nic). + +- Update `/etc/hosts` on each Hadoop cluster node to include real node IP: + + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 cluster-node-name + ``` + + So Greenplum connector will properly resolve node IP. + +### Set required grants + +Ask your Greenplum cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. code-tab:: sql Read + Write + + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source/target table + GRANT USAGE ON SCHEMA myschema TO username; + GRANT CREATE ON SCHEMA myschema TO username; + ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- allow read access to specific table (to get column types) + -- allow write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + .. 
code-tab:: sql Read only + + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source table + GRANT USAGE ON SCHEMA schema_to_read TO username; + GRANT CREATE ON SCHEMA schema_to_read TO username; + -- yes, `writable` for reading from GP, because data is written from Greenplum to Spark executor. + ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- allow read access to specific table + GRANT SELECT ON schema_to_read.table_to_read TO username; +``` + +More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/install_cfg.html#role-privileges). diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/read.md b/mkdocs/docs/en/connection/db_connection/greenplum/read.md new file mode 100644 index 000000000..fd6a9fe59 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/read.md @@ -0,0 +1,386 @@ +(greenplum-read)= + +# Reading from Greenplum using `DBReader` + +Data can be read from Greenplum to Spark using {obj}`DBReader `. +It also supports {ref}`strategy` for incremental data reading. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`greenplum-types`. +``` + +```{eval-rst} +.. note:: + + Unlike JDBC connectors, *Greenplum connector for Spark* does not support + executing **custom** SQL queries using `.sql` method. Connector can be used to only read data from a table or view. 
+```
+
+## Supported DBReader features
+
+- ✅︎ `columns` (see note below)
+- ✅︎ `where` (see note below)
+- ✅︎ `hwm` (see note below), supported strategies:
+- - ✅︎ {ref}`snapshot-strategy`
+- - ✅︎ {ref}`incremental-strategy`
+- - ✅︎ {ref}`snapshot-batch-strategy`
+- - ✅︎ {ref}`incremental-batch-strategy`
+- ❌ `hint` (is not supported by Greenplum)
+- ❌ `df_schema`
+- ✅︎ `options` (see {obj}`Greenplum.ReadOptions `)
+
+```{eval-rst}
+.. warning::
+
+    In case of Greenplum connector, `DBReader` does not generate raw `SELECT` query. Instead it relies on Spark SQL syntax
+    which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL.
+
+    So `columns`, `where` and `hwm.expression` should be specified in `Spark SQL `_ syntax,
+    not Greenplum SQL.
+
+    This is OK:
+
+    .. code-block:: python
+
+        DBReader(
+            columns=[
+                "some_column",
+                # this cast is executed on Spark side
+                "CAST(another_column AS STRING)",
+            ],
+            # this predicate is parsed by Spark, and can be pushed down to Greenplum
+            where="some_column LIKE 'val1%'",
+        )
+
+    This will fail:
+
+    .. code-block:: python
+
+        DBReader(
+            columns=[
+                "some_column",
+                # Spark does not have `text` type
+                "CAST(another_column AS text)",
+            ],
+            # Spark does not support ~ syntax for regexp matching
+            where="some_column ~ 'val1.*'",
+        )
+```
+
+## Examples
+
+Snapshot strategy:
+
+```python
+from onetl.connection import Greenplum
+from onetl.db import DBReader
+
+greenplum = Greenplum(...)
+
+reader = DBReader(
+    connection=greenplum,
+    source="schema.table",
+    columns=["id", "key", "CAST(value AS string) value", "updated_dt"],
+    where="key = 'something'",
+)
+df = reader.run()
+```
+
+Incremental strategy:
+
+```python
+from onetl.connection import Greenplum
+from onetl.db import DBReader
+from onetl.strategy import IncrementalStrategy
+
+greenplum = Greenplum(...) 
+ +reader = DBReader( + connection=greenplum, + source="schema.table", + columns=["id", "key", "CAST(value AS string) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="greenplum_hwm", expression="updated_dt"), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Interaction schema + +High-level schema is described in {ref}`greenplum-prerequisites`. You can find detailed interaction schema below. + +```{eval-rst} +.. dropdown:: Spark <-> Greenplum interaction during DBReader.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE EXISTS + "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table + "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) + + == DBReader.run() == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) 
USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) 
USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN + + "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + + "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2 + "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN + + == Spark.stop() == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml +``` + +## Recommendations + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Greenplum to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Greenplum to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+ +### Read data in parallel + +`DBReader` in case of Greenplum connector requires view or table to have a column which is used by Spark +for parallel reads. + +Choosing proper column allows each Spark executor to read only part of data stored in the specified segment, +avoiding moving large amounts of data between segments, which improves reading performance. + +#### Using `gp_segment_id` + +By default, `DBReader` will use [gp_segment_id](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#reading-from-a-view) +column for parallel data reading. Each DataFrame partition will contain data of a specific Greenplum segment. + +This allows each Spark executor read only data from specific Greenplum segment, avoiding moving large amounts of data between segments. + +If view is used, it is recommended to include `gp_segment_id` column to this view: + +```{eval-rst} +.. dropdown:: Reading from view with gp_segment_id column + + .. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE VIEW schema.view_with_gp_segment_id AS + SELECT + id, + some_column, + another_column, + gp_segment_id -- IMPORTANT + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_gp_segment_id", + ) + df = reader.run() +``` + +#### Using custom `partition_column` + +Sometimes table or view is lack of `gp_segment_id` column, but there is some column +with value range correlated with Greenplum segment distribution. + +In this case, custom column can be used instead: + +```{eval-rst} +.. dropdown:: Reading from view with custom partition_column + + .. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) 
+ + greenplum.execute( + """ + CREATE VIEW schema.view_with_partition_column AS + SELECT + id, + some_column, + part_column -- correlated to greenplum segment ID + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_partition_column", + options=Greenplum.ReadOptions( + # parallelize data using specified column + partitionColumn="part_column", + # create 10 Spark tasks, each will read only part of table data + partitions=10, + ), + ) + df = reader.run() +``` + +#### Reading `DISTRIBUTED REPLICATED` tables + +Replicated tables do not have `gp_segment_id` column at all, so you need to set `partition_column` to some column name +of type integer/bigint/smallint. + +### Parallel `JOIN` execution + +In case of using views which require some data motion between Greenplum segments, like `JOIN` queries, another approach should be used. + +Each Spark executor N will run the same query, so each of N query will start its own JOIN process, leading to really heavy load on Greenplum segments. +**This should be avoided**. + +Instead is recommended to run `JOIN` query on Greenplum side, save the result to an intermediate table, +and then read this table using `DBReader`: + +```{eval-rst} +.. dropdown:: Reading from view using intermediate table + + .. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE UNLOGGED TABLE schema.intermediate_table AS + SELECT + id, + tbl1.col1, + tbl1.data, + tbl2.another_data + FROM + schema.table1 as tbl1 + JOIN + schema.table2 as tbl2 + ON + tbl1.col1 = tbl2.col2 + WHERE ... + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.intermediate_table", + ) + df = reader.run() + + # write dataframe somethere + + greenplum.execute( + """ + DROP TABLE schema.intermediate_table + """, + ) +``` + +```{eval-rst} +.. warning:: + + **NEVER** do that: + + .. 
code-block:: python + + df1 = DBReader(connection=greenplum, target="public.table1", ...).run() + df2 = DBReader(connection=greenplum, target="public.table2", ...).run() + + joined_df = df1.join(df2, on="col") + + This will lead to sending all the data from both `table1` and `table2` to Spark executor memory, and then `JOIN`` + will be performed on Spark side, not inside Greenplum. This is **VERY** inefficient. +``` + +#### `TEMPORARY` tables notice + +Someone could think that writing data from view or result of `JOIN` to `TEMPORARY` table, +and then passing it to `DBReader`, is an efficient way to read data from Greenplum. This is because temp tables are not generating WAL files, +and are automatically deleted after finishing the transaction. + +That will **NOT** work. Each Spark executor establishes its own connection to Greenplum. +And each connection starts its own transaction which means that every executor will read empty temporary table. + +You should use [UNLOGGED](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) tables +to write data to intermediate table without generating WAL logs. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.greenplum.options +``` + +```{eval-rst} +.. autopydantic_model:: GreenplumReadOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/types.md b/mkdocs/docs/en/connection/db_connection/greenplum/types.md new file mode 100644 index 000000000..596be0fd0 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/types.md @@ -0,0 +1,406 @@ +(greenplum-types)= + +# Greenplum \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.2.4, and may differ on other Spark versions. 
+``` + +## Type detection & casting + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from Greenplum + +This is how Greenplum connector performs this: + +- Execute query `SELECT * FROM table LIMIT 0` [^footnote-1]. +- For each column in query result get column name and Greenplum type. +- Find corresponding `Greenplum type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Use Spark column projection and predicate pushdown features to build a final query. +- Create DataFrame from generated query with inferred schema. + +[^footnote-1]: Yes, **all columns of a table**, not just selected ones. + This means that if source table **contains** columns with unsupported type, the entire table cannot be read. + +### Writing to some existing Greenplum table + +This is how Greenplum connector performs this: + +- Get names of columns in DataFrame. +- Perform `SELECT * FROM table LIMIT 0` query. +- For each column in query result get column name and Greenplum type. +- Match table columns with DataFrame columns (by name, case insensitive). + If some column is present only in target table, but not in DataFrame (like `DEFAULT` or `SERIAL` column), and vice versa, raise an exception. + See [Explicit type cast]. +- Find corresponding `Spark type` → `Greenplumtype (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `Greenplumtype (write)` match `Greenplum type (read)`, no additional casts will be performed, DataFrame column will be written to Greenplum as is. +- If `Greenplumtype (write)` does not match `Greenplum type (read)`, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to `json` column which Greenplum connector currently does not support. 
+ +### Create new table using Spark + +```{eval-rst} +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! +``` + +This is how Greenplum connector performs this: + +- Find corresponding `Spark type` → `Greenplum type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Greenplum, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +More details [can be found here](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/write_to_gpdb.html). + +But Greenplum connector support only limited number of types and almost no custom clauses (like `PARTITION BY`). +So instead of relying on Spark to create tables: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=greenplum, + target="public.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + writer.run(df) +``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + greenplum.execute( + """ + CREATE TABLE public.table ( + id int32, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (business_dt) + DISTRIBUTED BY id + """, + ) + + writer = DBWriter( + connection=greenplum, + target="public.table", + options=Greenplum.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +See Greenplum [CREATE TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) documentation. 
+ +## Supported types + +See: +: - [official connector documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/reference-datatype_mapping.html) + - [list of Greenplum types](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-data_types.html) + +### Numeric types + +```{eval-rst} ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++==================================+===================================+===============================+=========================+ +| `decimal` | `DecimalType(P=38, S=18)` | `decimal(P=38, S=18)` | `decimal` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| `decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `decimal(P=39.., S=0..)` | unsupported [2]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `real` | `FloatType()` | `real` | `real` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `double precision` | `DoubleType()` | `double precision` | `double precision` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `-`` | `ByteType()` | unsupported | unsupported | 
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `smallint` | `ShortType()` | `smallint` | `smallint` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `integer` | `IntegerType()` | `integer` | `integer` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `bigint` | `LongType()` | `bigint` | `bigint` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `money` | unsupported | | | ++----------------------------------+ | | | +| `int4range` | | | | ++----------------------------------+ | | | +| `int8range` | | | | ++----------------------------------+ | | | +| `numrange` | | | | ++----------------------------------+ | | | +| `int2vector` | | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +``` + +[^footnote-2]: Greenplum support decimal types with unlimited precision. + + But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
+ +### Temporal types + +```{eval-rst} ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++====================================+=========================+=======================+=========================+ +| `date` | `DateType()` | `date` | `date` | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| `time` | `TimestampType()`, | `timestamp` | `timestamp` | ++------------------------------------+ time format quirks [3]_ | | | +| `time(0..6)` | | | | ++------------------------------------+ | | | +| `time with time zone` | | | | ++------------------------------------+ | | | +| `time(0..6) with time zone` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| `timestamp` | `TimestampType()` | `timestamp` | `timestamp` | ++------------------------------------+ | | | +| `timestamp(0..6)` | | | | ++------------------------------------+ | | | +| `timestamp with time zone` | | | | ++------------------------------------+ | | | +| `timestamp(0..6) with time zone` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| `interval` or any precision | unsupported | | | ++------------------------------------+ | | | +| `daterange` | | | | ++------------------------------------+ | | | +| `tsrange` | | | | ++------------------------------------+ | | | +| `tstzrange` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +``` + +```{eval-rst} +.. 
warning:: + + Note that types in Greenplum and Spark have different value ranges: + + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Greenplum type | Min value | Max value | Spark type | Min value | Max value | + +================+=================================+==================================+=====================+================================+================================+ + | `date` | `-4713-01-01` | `5874897-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | `timestamp` | `-4713-01-01 00:00:00.000000` | `294276-12-31 23:59:59.999999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +----------------+---------------------------------+----------------------------------+ | | | + | `time` | `00:00:00.000000` | `24:00:00.000000` | | | | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Greenplum to Spark. + + References: + * `Greenplum types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-3]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from Postgres like `23:59:59` + it is actually read `1970-01-01 23:59:59`, and vice versa. 
+ +### String types + +```{eval-rst} ++-----------------------------+------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++=============================+==================+=======================+=========================+ +| `character` | `StringType()` | `text` | `text` | ++-----------------------------+ | | | +| `character(N)` | | | | ++-----------------------------+ | | | +| `character varying` | | | | ++-----------------------------+ | | | +| `character varying(N)` | | | | ++-----------------------------+ | | | +| `text` | | | | ++-----------------------------+ | | | +| `xml` | | | | ++-----------------------------+ | | | +| `CREATE TYPE ... AS ENUM` | | | | ++-----------------------------+------------------+-----------------------+-------------------------+ +| `json` | unsupported | | | ++-----------------------------+ | | | +| `jsonb` | | | | ++-----------------------------+------------------+-----------------------+-------------------------+ +``` + +### Binary types + +```{eval-rst} ++--------------------------+-------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++==========================+===================+=======================+=========================+ +| `boolean` | `BooleanType()` | `boolean` | `boolean` | ++--------------------------+-------------------+-----------------------+-------------------------+ +| `bit` | unsupported | | | ++--------------------------+ | | | +| `bit(N)` | | | | ++--------------------------+ | | | +| `bit varying` | | | | ++--------------------------+ | | | +| `bit varying(N)` | | | | ++--------------------------+-------------------+-----------------------+-------------------------+ +| `bytea` | unsupported [4]_ | | | 
++--------------------------+-------------------+-----------------------+-------------------------+ +| `-`` | `BinaryType()` | `bytea` | `bytea` | ++--------------------------+-------------------+-----------------------+-------------------------+ +``` + +[^footnote-4]: Yes, that's weird. + +### Struct types + +```{eval-rst} ++--------------------------------+------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++================================+==================+=======================+=========================+ +| `T[]` | unsupported | | | ++--------------------------------+------------------+-----------------------+-------------------------+ +| `-`` | `ArrayType()` | unsupported | | ++--------------------------------+------------------+-----------------------+-------------------------+ +| `CREATE TYPE sometype (...)` | `StringType()` | `text` | `text` | ++--------------------------------+------------------+-----------------------+-------------------------+ +| `-`` | `StructType()` | unsupported | | ++--------------------------------+------------------+ | | +| `-`` | `MapType()` | | | ++--------------------------------+------------------+-----------------------+-------------------------+ +``` + +## Unsupported types + +Columns of these types cannot be read/written by Spark: +: - `cidr` + - `inet` + - `macaddr` + - `macaddr8` + - `circle` + - `box` + - `line` + - `lseg` + - `path` + - `point` + - `polygon` + - `tsvector` + - `tsquery` + - `uuid` + +The is a way to avoid this - just cast unsupported types to `text`. But the way this can be done is not a straightforward. + +## Explicit type cast + +### `DBReader` + +Direct casting of Greenplum types is not supported by DBReader due to the connector’s implementation specifics. 
+ +```python +reader = DBReader( + connection=greenplum, + # will fail + columns=["CAST(unsupported_column AS text)"], +) +``` + +But there is a workaround - create a view with casting unsupported column to text (or any other supported type). +For example, you can use [to_json](https://www.postgresql.org/docs/current/functions-json.html) Postgres function to convert column of any type to string representation and then parse this column on Spark side using {obj}`JSON.parse_column ` method. + +```python +from pyspark.sql.types import ArrayType, IntegerType + +from onetl.connection import Greenplum +from onetl.db import DBReader +from onetl.file.format import JSON + +greenplum = Greenplum(...) + +greenplum.execute( + """ + CREATE VIEW schema.view_with_json_column AS + SELECT + id, + supported_column, + to_json(array_column) array_column_as_json, + gp_segment_id -- ! important ! + FROM + schema.table_with_unsupported_columns + """, +) + +# create dataframe using this view +reader = DBReader( + connection=greenplum, + source="schema.view_with_json_column", +) +df = reader.run() + +# Define the schema for the JSON data +json_scheme = ArrayType(IntegerType()) + +df = df.select( + df.id, + df.supported_column, + JSON().parse_column(df.array_column_as_json, json_scheme).alias("array_column"), +) +``` + +### `DBWriter` + +To write data to a `text` or `json` column in a Greenplum table, use {obj}`JSON.serialize_column ` method. + +```python +from onetl.connection import Greenplum +from onetl.db import DBWriter +from onetl.file.format import JSON + +greenplum = Greenplum(...) 
+ +greenplum.execute( + """ + CREATE TABLE schema.target_table ( + id int, + supported_column timestamp, + array_column_as_json jsonb, -- or text + ) + DISTRIBUTED BY id + """, +) + +write_df = df.select( + df.id, + df.supported_column, + JSON().serialize_column(df.array_column).alias("array_column_json"), +) + +writer = DBWriter( + connection=greenplum, + target="schema.target_table", +) +writer.run(write_df) +``` + +Then you can parse this column on Greenplum side: + +```sql +SELECT + id, + supported_column, + -- access first item of an array + array_column_as_json->0 +FROM + schema.target_table +``` diff --git a/mkdocs/docs/en/connection/db_connection/greenplum/write.md b/mkdocs/docs/en/connection/db_connection/greenplum/write.md new file mode 100644 index 000000000..af367ad75 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/greenplum/write.md @@ -0,0 +1,140 @@ +(greenplum-write)= + +# Writing to Greenplum using `DBWriter` + +For writing data to Greenplum, use {obj}`DBWriter ` +with {obj}`GreenplumWriteOptions `. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`greenplum-types`. +``` + +```{eval-rst} +.. warning:: + + It is always recommended to create table explicitly using :ref:`Greenplum.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different types than it is expected. +``` + +## Examples + +```python +from onetl.connection import Greenplum +from onetl.db import DBWriter + +greenplum = Greenplum(...) + +df = ... # data is here + +writer = DBWriter( + connection=greenplum, + target="schema.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), +) + +writer.run(df) +``` + +## Interaction schema + +High-level schema is described in {ref}`greenplum-prerequisites`. You can find detailed interaction schema below. + +```{eval-rst} +.. 
dropdown:: Spark <-> Greenplum interaction during DBWriter.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS + + == DBWriter.run(df) == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) 
USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN + + "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 + "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + deactivate "Greenplum segment1" + + "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 + "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 + deactivate "Greenplum segment2" + + "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN + "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN + deactivate "Greenplum segmentN" + + == Finished == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.greenplum.options +``` + +```{eval-rst} +.. 
autopydantic_model:: GreenplumWriteOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/connection.md b/mkdocs/docs/en/connection/db_connection/hive/connection.md new file mode 100644 index 000000000..a9a0cfc55 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/connection.md @@ -0,0 +1,13 @@ +(hive-connection)= + +# Hive Connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.hive.connection +``` + +```{eval-rst} +.. autoclass:: Hive + :members: get_current, check + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/execute.md b/mkdocs/docs/en/connection/db_connection/hive/execute.md new file mode 100644 index 000000000..d7b0694c4 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/execute.md @@ -0,0 +1,55 @@ +(hive-execute)= + +# Executing statements in Hive + +Use `Hive.execute(...)` to execute DDL and DML operations. + +## Syntax support + +This method supports **any** query syntax supported by Hive, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `LOAD DATA ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, and so on +- ✅︎ `MSCK REPAIR TABLE ...`, and so on +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +```{eval-rst} +.. warning:: + + Actually, query should be written using `SparkSQL `_ syntax, not HiveQL. +``` + +## Examples + +```python +from onetl.connection import Hive + +hive = Hive(...) + +hive.execute("DROP TABLE schema.table") +hive.execute( + """ + CREATE TABLE schema.table ( + id NUMBER, + key VARCHAR, + value DOUBLE + ) + PARTITION BY (business_date DATE) + STORED AS orc + """ +) +``` + +### Details + +```{eval-rst} +.. 
currentmodule:: onetl.connection.db_connection.hive.connection +``` + +```{eval-rst} +.. automethod:: Hive.execute +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/index.md b/mkdocs/docs/en/connection/db_connection/hive/index.md new file mode 100644 index 000000000..bb8e5ebcc --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/index.md @@ -0,0 +1,28 @@ +(hive)= + +# Hive + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +slots +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/prerequisites.md b/mkdocs/docs/en/connection/db_connection/hive/prerequisites.md new file mode 100644 index 000000000..79ff0e5dc --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/prerequisites.md @@ -0,0 +1,134 @@ +(hive-prerequisites)= + +# Prerequisites + +```{eval-rst} +.. note:: + + onETL's Hive connection is actually SparkSession with access to `Hive Thrift Metastore `_ + and HDFS/S3. + All data motion is made using Spark. Hive Metastore is used only to store tables and partitions metadata. + + This connector does **NOT** require Hive server. It also does **NOT** use Hive JDBC connector. +``` + +## Version Compatibility + +- Hive Metastore version: + : - Officially declared: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) + - Actually tested: 1.2.100, 2.3.10, 3.1.3 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html). + +## Installing PySpark + +To use Hive connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to Hive Metastore + +```{eval-rst} +.. 
note:: + + If you're using a managed Hadoop cluster, skip this step, as all Spark configs should already be present on the host. +``` + +Create `$SPARK_CONF_DIR/hive-site.xml` with Hive Metastore URL: + +```xml + + + + + hive.metastore.uris + thrift://metastore.host.name:9083 + + +``` + +Create `$SPARK_CONF_DIR/core-site.xml` with warehouse location, e.g. HDFS IPC port of Hadoop namenode, or S3 bucket address & credentials: + +```{eval-rst} +.. tabs:: + + .. code-tab:: xml HDFS + + + + + + fs.defaultFS + hdfs://myhadoopcluster:9820 + + + + .. code-tab:: xml S3 + + + + + !-- See https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html#General_S3A_Client_configuration + + fs.defaultFS + s3a://mys3bucket/ + + + fs.s3a.bucket.mybucket.endpoint + http://s3.somain + + + fs.s3a.bucket.mybucket.connection.ssl.enabled + false + + + fs.s3a.bucket.mybucket.path.style.access + true + + + fs.s3a.bucket.mybucket.aws.credentials.provider + org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + + + fs.s3a.bucket.mybucket.access.key + some-user + + + fs.s3a.bucket.mybucket.secret.key + mysecrettoken + + +``` + +## Using Kerberos + +Some Hadoop managed clusters use Kerberos authentication. In this case, you should call the [kinit](https://web.mit.edu/kerberos/krb5-1.12/doc/user/user_commands/kinit.html) command +**BEFORE** starting the Spark session to generate a Kerberos ticket. See {ref}`install-kerberos`. + +Sometimes it is also required to pass a keytab file to Spark config, allowing Spark executors to generate their own Kerberos tickets: + +```{eval-rst} +.. tabs:: + + .. code-tab:: python Spark 3 + + SparkSession.builder + .option("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.kerberos.principal", "user") + .option("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + ..
code-tab:: python Spark 2 + + SparkSession.builder + .option("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.yarn.principal", "user") + .option("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() +``` + +See [Spark security documentation](https://spark.apache.org/docs/latest/security.html#kerberos) +for more details. diff --git a/mkdocs/docs/en/connection/db_connection/hive/read.md b/mkdocs/docs/en/connection/db_connection/hive/read.md new file mode 100644 index 000000000..fbecc2cf6 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/read.md @@ -0,0 +1,95 @@ +(hive-read)= + +# Reading from Hive using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +## Supported DBReader features + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- ❌ `hint` (is not supported by Hive) +- ❌ `df_schema` +- ❌ `options` (only Spark config params are used) + +```{eval-rst} +.. warning:: + + Actually, `columns`, `where` and `hwm.expression` should be written using `SparkSQL `_ syntax, + not HiveQL. +``` + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import Hive +from onetl.db import DBReader + +hive = Hive(...) + +reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Hive +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +hive = Hive(...)
+ +reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="hive_hwm", expression="updated_dt"), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations + +### Use column-based write formats + +Prefer these write formats: +: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) + - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) + - [Iceberg](https://iceberg.apache.org/spark-quickstart/) + - [Hudi](https://hudi.apache.org/docs/quick-start-guide/) + - [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) + +For column-based write formats, each file contains separated sections where column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by a specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +### Use partition columns in `where` clause + +Queries should include a `WHERE` clause with filters on Hive partitioning columns. +This allows Spark to read only a small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance. + +Supported operators are: `=`, `>`, `<` and `BETWEEN`, and only against some **static** value.
diff --git a/mkdocs/docs/en/connection/db_connection/hive/slots.md b/mkdocs/docs/en/connection/db_connection/hive/slots.md new file mode 100644 index 000000000..8db1f7610 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/slots.md @@ -0,0 +1,13 @@ +(hive-slots)= + +# Hive Slots + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.hive.slots +``` + +```{eval-rst} +.. autoclass:: HiveSlots + :members: normalize_cluster_name, get_known_clusters, get_current_cluster + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/sql.md b/mkdocs/docs/en/connection/db_connection/hive/sql.md new file mode 100644 index 000000000..9b035f8ff --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/sql.md @@ -0,0 +1,79 @@ +(hive-sql)= + +# Reading from Hive using `Hive.sql` + +`Hive.sql` allows passing custom SQL query, but does not support incremental strategies. + +## Syntax support + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +```{eval-rst} +.. warning:: + + Actually, query should be written using `SparkSQL `_ syntax, not HiveQL. +``` + +## Examples + +```python +from onetl.connection import Hive + +hive = Hive(...) 
+df = hive.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """ +) +``` + +## Recommendations + +### Use column-based write formats + +Prefer these write formats: +: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) + - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) + - [Iceberg](https://iceberg.apache.org/spark-quickstart/) + - [Hudi](https://hudi.apache.org/docs/quick-start-guide/) + - [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) + +For column-based write formats, each file contains separated sections where column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by a specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +### Use partition columns in `where` clause + +Queries should include a `WHERE` clause with filters on Hive partitioning columns. +This allows Spark to read only a small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance. + +Supported operators are: `=`, `>`, `<` and `BETWEEN`, and only against some **static** value. + +## Details + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.hive.connection +``` + +```{eval-rst} +..
automethod:: Hive.sql +``` diff --git a/mkdocs/docs/en/connection/db_connection/hive/write.md b/mkdocs/docs/en/connection/db_connection/hive/write.md new file mode 100644 index 000000000..d04fd4852 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/hive/write.md @@ -0,0 +1,180 @@ +(hive-write)= + +# Writing to Hive using `DBWriter` + +For writing data to Hive, use {obj}`DBWriter `. + +## Examples + +```python +from onetl.connection import Hive +from onetl.db import DBWriter + +hive = Hive(...) + +df = ... # data is here + +# Create dataframe with specific number of Spark partitions. +# Use the Hive partitioning columns to group the data. Create max 20 files per Hive partition. +# Also sort the data by column which most data is correlated with (e.g. user_id), reducing files size. + +num_files_per_partition = 20 +partition_columns = ["country", "business_date"] +sort_columns = ["user_id"] +write_df = df.repartition( + num_files_per_partition, + *partition_columns, + *sort_columns, +).sortWithinPartitions(*partition_columns, *sort_columns) + +writer = DBWriter( + connection=hive, + target="schema.table", + options=Hive.WriteOptions( + if_exists="append", + # Hive partitioning columns. + partitionBy=partition_columns, + ), +) + +writer.run(write_df) +``` + +## Recommendations + +### Use column-based write formats + +Prefer these write formats: +: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) (**default**) + - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) + - [Iceberg](https://iceberg.apache.org/spark-quickstart/) + - [Hudi](https://hudi.apache.org/docs/quick-start-guide/) + - [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) + +```{eval-rst} +.. warning:: + When using `DBWriter`, the default spark data format configured in `spark.sql.sources.default` is ignored, as `Hive.WriteOptions(format=...)` default value is explicitly set to `orc`. 
+``` + +For column-based write formats, each file contains separated sections where column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by a specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +### Use partitioning + +#### How does it work + +Hive supports splitting data into partitions, which are different directories in the filesystem with names like `some_col=value1/another_col=value2`. + +For example, dataframe with content like this: + +| country: string | business_date: date | user_id: int | bytes: long | +| --------------- | ------------------- | ------------ | ----------- | +| RU | 2024-01-01 | 1234 | 25325253525 | +| RU | 2024-01-01 | 2345 | 23234535243 | +| RU | 2024-01-02 | 1234 | 62346634564 | +| US | 2024-01-01 | 5678 | 4252345354 | +| US | 2024-01-02 | 5678 | 5474575745 | +| US | 2024-01-03 | 5678 | 3464574567 | + +With `partitionBy=["country", "business_date"]` data will be stored as files in the following subfolders: +: - `/country=RU/business_date=2024-01-01/` + - `/country=RU/business_date=2024-01-02/` + - `/country=US/business_date=2024-01-01/` + - `/country=US/business_date=2024-01-02/` + - `/country=US/business_date=2024-01-03/` + +A separate subdirectory is created for each distinct combination of column values in the dataframe. + +Please do not confuse Spark dataframe partitions (a.k.a. batches of data handled by Spark executors, usually in parallel) +and Hive partitioning (storing data in different subdirectories). +Number of Spark dataframe partitions is correlated with the number of files created in **each** Hive partition. +For example, a Spark dataframe with 10 partitions and 5 distinct values of Hive partition columns will be saved as 5 subfolders with 10 files each = 50 files in total.
+Without Hive partitioning, all the files are placed into one flat directory. + +#### But why? + +Queries which have a `WHERE` clause with filters on Hive partitioning columns, like `WHERE country = 'RU' AND business_date='2024-01-01'`, will +read only files from these exact partitions, like `/country=RU/business_date=2024-01-01/`, and skip files from other partitions. + +This drastically increases performance and reduces the amount of memory used by Spark. +Consider using Hive partitioning in all tables. + +#### Which columns should I use? + +Usually Hive partitioning columns are based on event date or location, like `country: string`, `business_date: date`, `run_date: date` and so on. + +**Partition columns should contain data with low cardinality.** +Dates, small integers, strings with low number of possible values are OK. +But timestamp, float, decimals, longs (like user id), strings with lots of unique values (like user name or email) should **NOT** be used as Hive partitioning columns. +Unlike some other databases, range and hash-based partitions are not supported. + +Partition column should be a part of a dataframe. If you want to partition values by date component of `business_dt: timestamp` column, +add a new column to dataframe like this: `df.withColumn("business_date", to_date(df.business_dt))`. + +### Use compression + +Using compression algorithms like `snappy`, `lz4` or `zstd` can reduce the size of files (up to 10x). + +### Prefer creating large files + +Storing millions of small files is not what HDFS and S3 are designed for. Minimal file size should be at least 10Mb, but usually it is like 128Mb+ or 256Mb+ (HDFS block size). +**NEVER** create files only a few Kbytes in size. + +Number of files can be different in different cases. +On one hand, Spark Adaptive Query Execution (AQE) can merge small Spark dataframe partitions into one larger partition. +On the other hand, dataframes with skewed data can produce a larger number of files than expected.
+ +To create small amount of large files, you can reduce number of Spark dataframe partitions. +Use [df.repartition(N, columns...)](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.repartition.html) function, +like this: `df.repartition(20, "col1", "col2")`. +This creates new Spark dataframe with partitions using `hash(df.col1 + df.col2) mod 20` expression, avoiding data skew. + +Note: larger dataframe partitions requires more resources (CPU, RAM) on Spark executor. The exact number of partitions +should be determined empirically, as it depends on the amount of data and available resources. + +### Sort data before writing + +Dataframe with sorted content: + +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | +| --------------- | ------------------- | ------------ | ----------------------- | ----------- | +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | +| RU | 2024-01-01 | 1234 | 2024-01-01T12:23:44.567 | 25325253525 | +| RU | 2024-01-02 | 1234 | 2024-01-01T13:25:56.789 | 34335645635 | +| US | 2024-01-01 | 2345 | 2024-01-01T10:00:00.000 | 12341 | +| US | 2024-01-02 | 2345 | 2024-01-01T15:11:22.345 | 13435 | +| US | 2024-01-03 | 2345 | 2024-01-01T20:22:33.567 | 14564 | + +Has a much better compression rate than unsorted one, e.g. 
2x or even higher: + +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | +| --------------- | ------------------- | ------------ | ----------------------- | ----------- | +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | +| RU | 2024-01-01 | 6345 | 2024-12-01T23:03:44.567 | 25365 | +| RU | 2024-01-02 | 5234 | 2024-07-01T06:10:56.789 | 45643456747 | +| US | 2024-01-01 | 4582 | 2024-04-01T17:59:00.000 | 362546475 | +| US | 2024-01-02 | 2345 | 2024-09-01T04:24:22.345 | 3235 | +| US | 2024-01-03 | 3575 | 2024-03-01T21:37:33.567 | 346345764 | + +Choosing columns to sort data by is really depends on the data. If data is correlated with some specific +column, like in example above the amount of traffic is correlated with both `user_id` and `timestamp`, +use `df.sortWithinPartitions("user_id", "timestamp")` before writing the data. + +If `df.repartition(N, repartition_columns...)` is used in combination with `df.sortWithinPartitions(sort_columns...)`, +then `sort_columns` should start with `repartition_columns` or be equal to it. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.hive.options +``` + +```{eval-rst} +.. 
autopydantic_model:: HiveWriteOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/index.md b/mkdocs/docs/en/connection/db_connection/index.md new file mode 100644 index 000000000..4977cf5d5 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/index.md @@ -0,0 +1,19 @@ +(db-connections)= + +# DB Connections + +```{toctree} +:caption: DB Connections +:maxdepth: 1 + +Clickhouse +Greenplum +Kafka +Hive +MongoDB +MSSQL +MySQL +Oracle +Postgres +Teradata +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/auth.md b/mkdocs/docs/en/connection/db_connection/kafka/auth.md new file mode 100644 index 000000000..7d112d59a --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/auth.md @@ -0,0 +1,13 @@ +(kafka-auth)= + +# Kafka Auth + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_auth +``` + +```{eval-rst} +.. autoclass:: KafkaAuth + :members: get_options, cleanup + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/basic_auth.md b/mkdocs/docs/en/connection/db_connection/kafka/basic_auth.md new file mode 100644 index 000000000..cb3cded81 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/basic_auth.md @@ -0,0 +1,14 @@ +(kafka-basic-auth)= + +# Kafka BasicAuth + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_basic_auth +``` + +```{eval-rst} +.. autopydantic_model:: KafkaBasicAuth + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/connection.md b/mkdocs/docs/en/connection/db_connection/kafka/connection.md new file mode 100644 index 000000000..169b5b701 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/connection.md @@ -0,0 +1,12 @@ +(kafka-connection)= + +# Kafka Connection + +```{eval-rst} +.. 
currentmodule:: onetl.connection.db_connection.kafka.connection +``` + +```{eval-rst} +.. autoclass:: Kafka + :members: get_packages, get_exclude_packages, check, close +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/index.md b/mkdocs/docs/en/connection/db_connection/kafka/index.md new file mode 100644 index 000000000..903feea96 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/index.md @@ -0,0 +1,46 @@ +(kafka)= + +# Kafka + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +troubleshooting +``` + +```{toctree} +:caption: Protocols +:maxdepth: 1 + +plaintext_protocol +ssl_protocol +``` + +```{toctree} +:caption: Auth methods +:maxdepth: 1 + +basic_auth +kerberos_auth +scram_auth +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +write +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +auth +protocol +slots +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/kerberos_auth.md b/mkdocs/docs/en/connection/db_connection/kafka/kerberos_auth.md new file mode 100644 index 000000000..637c9e16e --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/kerberos_auth.md @@ -0,0 +1,14 @@ +(kafka-kerberos-auth)= + +# Kafka KerberosAuth + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_kerberos_auth +``` + +```{eval-rst} +.. autopydantic_model:: KafkaKerberosAuth + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/plaintext_protocol.md b/mkdocs/docs/en/connection/db_connection/kafka/plaintext_protocol.md new file mode 100644 index 000000000..5bc9ae035 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/plaintext_protocol.md @@ -0,0 +1,14 @@ +(kafka-plaintext-protocol)= + +# Kafka PlaintextProtocol + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_plaintext_protocol +``` + +```{eval-rst} +.. 
autopydantic_model:: KafkaPlaintextProtocol + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/prerequisites.md b/mkdocs/docs/en/connection/db_connection/kafka/prerequisites.md new file mode 100644 index 000000000..3020648e4 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/prerequisites.md @@ -0,0 +1,65 @@ +(kafka-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Kafka server versions: + : - Officially declared: 0.10 or higher + - Actually tested: 3.2.3, 3.9.0 (only Kafka 3.x supports message headers) +- Spark versions: 2.4.x - 3.5.x +- Java versions: 8 - 17 + +See [official documentation](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html). + +## Installing PySpark + +To use Kafka connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to Kafka + +### Connection address + +Kafka is a distributed service, and usually has a list of brokers you can connect to (unlike other connectors, where only one host+port can be set). +Please contact your Kafka administrator to get addresses of these brokers, as there are no defaults. + +Also Kafka has a feature called *advertised listeners* - a client connects to one broker, and receives a list of other brokers in the cluster. +So you don't have to pass all brokers to `addresses`, it can be some subset. Other broker addresses will be fetched directly from the cluster. + +### Connection protocol + +Kafka can support different connection protocols. List of currently supported protocols: +: - {obj}`PLAINTEXT ` (not secure) + - {obj}`SSL ` (secure, recommended) + +Note that a specific port can listen for only one of these protocols, so it is important to set +a proper port number + protocol combination.
+ +### Authentication mechanism + +Kafka can support different authentication mechanisms (also known as [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer)). + +List of currently supported mechanisms: +: - {obj}`PLAIN `. To not confuse this with the `PLAINTEXT` connection protocol, onETL uses name `BasicAuth`. + - {obj}`GSSAPI `. To simplify naming, onETL uses name `KerberosAuth`. + - {obj}`SCRAM-SHA-256 or SCRAM-SHA-512 ` (recommended). + +Different mechanisms use different types of credentials (login + password, keytab file, and so on). + +Note that connection protocol and auth mechanism are set in pairs: +: - If you see `SASL_PLAINTEXT` this means `PLAINTEXT` connection protocol + some auth mechanism. + - If you see `SASL_SSL` this means `SSL` connection protocol + some auth mechanism. + - If you see just `PLAINTEXT` or `SSL` (**no** `SASL`), this means that authentication is disabled (anonymous access). + +Please contact your Kafka administrator to get details about the enabled auth mechanism in a specific Kafka instance. + +### Required grants + +Ask your Kafka administrator to set the following grants for a user, *if the Kafka instance uses ACL*: +: - `Describe` + `Read` for reading data from Kafka (Consumer). + - `Describe` + `Write` for writing data to Kafka (Producer). + +More details can be found in [documentation](https://kafka.apache.org/documentation/#operations_in_kafka). diff --git a/mkdocs/docs/en/connection/db_connection/kafka/protocol.md b/mkdocs/docs/en/connection/db_connection/kafka/protocol.md new file mode 100644 index 000000000..b9278d009 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/protocol.md @@ -0,0 +1,13 @@ +(kafka-protocol)= + +# Kafka Protocol + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_protocol +``` + +```{eval-rst} +..
autoclass:: KafkaProtocol + :members: get_options, cleanup + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/read.md b/mkdocs/docs/en/connection/db_connection/kafka/read.md new file mode 100644 index 000000000..05ffa75c4 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/read.md @@ -0,0 +1,137 @@ +(kafka-read)= + +# Reading from Kafka + +Data can be read from Kafka to Spark using {obj}`DBReader `. +It also supports {ref}`strategy` for incremental data reading. + +## Supported DBReader features + +- ❌ `columns` (is not supported by Kafka) +- ❌ `where` (is not supported by Kafka) +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ❌ {ref}`snapshot-batch-strategy` +- - ❌ {ref}`incremental-batch-strategy` +- ❌ `hint` (is not supported by Kafka) +- ❌ `df_schema` (see note below) +- ✅︎ `options` (see {obj}`Kafka.ReadOptions `) + +## Dataframe schema + +Unlike other DB connections, Kafka does not have concept of columns. +All the topics messages have the same set of fields, see structure below: + +```text +root +|-- key: binary (nullable = true) +|-- value: binary (nullable = true) +|-- topic: string (nullable = false) +|-- partition: integer (nullable = false) +|-- offset: integer (nullable = false) +|-- timestamp: timestamp (nullable = false) +|-- timestampType: integer (nullable = false) +|-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) +``` + +`headers` field is present in the dataframe only if `Kafka.ReadOptions(include_headers=True)` is passed (compatibility with Kafka 1.x). + +## Value deserialization + +To read `value` or `key` of other type than bytes (e.g. struct or integer), users have to deserialize values manually. 
+ +This could be done using the following methods: +: - {obj}`Avro.parse_column ` + - {obj}`JSON.parse_column ` + - {obj}`CSV.parse_column ` + - {obj}`XML.parse_column ` + +## Examples + +Snapshot strategy, `value` is Avro binary data: + +```python +from onetl.connection import Kafka +from onetl.db import DBReader, DBWriter +from onetl.file.format import Avro +from pyspark.sql.functions import decode + +# read all topic data from Kafka +kafka = Kafka(...) +reader = DBReader(connection=kafka, source="avro_topic") +read_df = reader.run() + +# parse Avro format to Spark struct +avro = Avro( + schema_dict={ + "type": "record", + "name": "Person", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"}, + ], + } +) +deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + avro.parse_column("value"), +) +``` + +Incremental strategy, `value` is JSON string: + +```{eval-rst} +.. note:: + + Currently the Kafka connector supports only HWMs based on the `offset` field. Other fields, like `timestamp`, are not yet supported. +``` + +```python +from onetl.connection import Kafka +from onetl.db import DBReader, DBWriter +from onetl.file.format import JSON +from onetl.strategy import IncrementalStrategy +from pyspark.sql.functions import decode +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +kafka = Kafka(...) + +# read only new data from Kafka topic +reader = DBReader( + connection=kafka, + source="topic_name", + hwm=DBReader.AutoDetectHWM(name="kafka_hwm", expression="offset"), +) + +with IncrementalStrategy(): + read_df = reader.run() + +# parse JSON format to Spark struct +json = JSON() +schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ], +) +deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + json.parse_column("value", schema), +) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.options +``` + +```{eval-rst} +..
autopydantic_model:: KafkaReadOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/scram_auth.md b/mkdocs/docs/en/connection/db_connection/kafka/scram_auth.md new file mode 100644 index 000000000..5a27a46ea --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/scram_auth.md @@ -0,0 +1,14 @@ +(kafka-scram-auth)= + +# Kafka ScramAuth + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_scram_auth +``` + +```{eval-rst} +.. autopydantic_model:: KafkaScramAuth + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/slots.md b/mkdocs/docs/en/connection/db_connection/kafka/slots.md new file mode 100644 index 000000000..0c58413fb --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/slots.md @@ -0,0 +1,13 @@ +(kafka-slots)= + +# Kafka Slots + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.slots +``` + +```{eval-rst} +.. autoclass:: KafkaSlots + :members: normalize_cluster_name, get_known_clusters, normalize_address, get_cluster_addresses + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/ssl_protocol.md b/mkdocs/docs/en/connection/db_connection/kafka/ssl_protocol.md new file mode 100644 index 000000000..6e1a7641d --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/ssl_protocol.md @@ -0,0 +1,14 @@ +(kafka-ssl-protocol)= + +# Kafka SSLProtocol + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.kafka_ssl_protocol +``` + +```{eval-rst} +.. 
autopydantic_model:: KafkaSSLProtocol + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/kafka/troubleshooting.md b/mkdocs/docs/en/connection/db_connection/kafka/troubleshooting.md new file mode 100644 index 000000000..7a3536fbf --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/troubleshooting.md @@ -0,0 +1,13 @@ +(kafka-troubleshooting)= + +# Kafka Troubleshooting + +```{eval-rst} +.. note:: + + General guide: :ref:`troubleshooting`. +``` + +## Cannot connect using `SSL` protocol + +Please check that certificate files are not Base-64 encoded. diff --git a/mkdocs/docs/en/connection/db_connection/kafka/write.md b/mkdocs/docs/en/connection/db_connection/kafka/write.md new file mode 100644 index 000000000..cb3e3019a --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/kafka/write.md @@ -0,0 +1,75 @@ +(kafka-write)= + +# Writing to Kafka + +For writing data to Kafka, use {obj}`DBWriter ` with specific options (see below). + +## Dataframe schema + +Unlike other DB connections, Kafka does not have concept of columns. +All the topics messages have the same set of fields. Only some of them can be written: + +```text +root +|-- key: binary (nullable = true) +|-- value: binary (nullable = true) +|-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) +``` + +`headers` can be passed only with `Kafka.WriteOptions(include_headers=True)` (compatibility with Kafka 1.x). + +Field `topic` should not be present in the dataframe, as it is passed to `DBWriter(target=...)`. + +Other fields, like `partition`, `offset`, `timestamp` are set by Kafka, and cannot be passed explicitly. + +## Value serialization + +To write `value` or `key` of other type than bytes (e.g. struct or integer), users have to serialize values manually. 
+ +This could be done using following methods: +: - {obj}`Avro.serialize_column ` + - {obj}`JSON.serialize_column ` + - {obj}`CSV.serialize_column ` + +## Examples + +Convert `value` to JSON string, and write to Kafka: + +```python +from onetl.connection import Kafka +from onetl.db import DBWriter +from onetl.file.format import JSON + +df = ... # original data is here + +# serialize struct data as JSON +json = JSON() +write_df = df.select( + df.key, + json.serialize_column(df.value), +) + +# write data to Kafka +kafka = Kafka(...) + +writer = DBWriter( + connection=kafka, + target="topic_name", +) +writer.run(write_df) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.kafka.options +``` + +```{eval-rst} +.. autopydantic_model:: KafkaWriteOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/connection.md b/mkdocs/docs/en/connection/db_connection/mongodb/connection.md new file mode 100644 index 000000000..b81c56ffc --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/connection.md @@ -0,0 +1,13 @@ +(mongodb-connection)= + +# MongoDB Connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mongodb.connection +``` + +```{eval-rst} +.. 
autoclass:: MongoDB + :members: get_packages, check + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/index.md b/mkdocs/docs/en/connection/db_connection/mongodb/index.md new file mode 100644 index 000000000..fb9656117 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/index.md @@ -0,0 +1,27 @@ +(mongodb)= + +# MongoDB + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +pipeline +write +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/pipeline.md b/mkdocs/docs/en/connection/db_connection/mongodb/pipeline.md new file mode 100644 index 000000000..2cc30458c --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/pipeline.md @@ -0,0 +1,41 @@ +(mongodb-sql)= + +# Reading from MongoDB using `MongoDB.pipeline` + +{obj}`MongoDB.sql ` allows passing custom pipeline, +but does not support incremental strategies. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mongodb-types` +``` + +## Recommendations + +### Pay attention to `pipeline` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `mongodb.pipeline(..., pipeline={"$match": {"column": {"$eq": "value"}}})` value. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in `pipeline` value. + +## References + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mongodb.connection +``` + +```{eval-rst} +.. automethod:: MongoDB.pipeline +``` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mongodb.options +``` + +```{eval-rst} +.. 
autopydantic_model:: MongoDBPipelineOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/prerequisites.md b/mkdocs/docs/en/connection/db_connection/mongodb/prerequisites.md new file mode 100644 index 000000000..f491ca10e --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/prerequisites.md @@ -0,0 +1,72 @@ +(mongodb-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- MongoDB server versions: + : - Officially declared: 4.0 or higher + - Actually tested: 4.0.0, 8.0.4 +- Spark versions: 3.2.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://www.mongodb.com/docs/spark-connector/). + +## Installing PySpark + +To use MongoDB connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to MongoDB + +### Connection host + +It is possible to connect to MongoDB host by using either DNS name of host or it's IP address. + +It is also possible to connect to MongoDB shared cluster: + +```python +mongo = MongoDB( + host="master.host.or.ip", + user="user", + password="*****", + database="target_database", + spark=spark, + extra={ + # read data from secondary cluster node, switch to primary if not available + "readPreference": "secondaryPreferred", + }, +) +``` + +Supported `readPreference` values are described in [official documentation](https://www.mongodb.com/docs/manual/core/read-preference/). + +### Connection port + +Connection is usually performed to port `27017`. Port may differ for different MongoDB instances. +Please ask your MongoDB administrator to provide required information. + +### Required grants + +Ask your MongoDB cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. 
code-tab:: js Read + Write + + // allow writing data to specific database + db.grantRolesToUser("username", [{db: "somedb", role: "readWrite"}]) + + .. code-tab:: js Read only + + // allow reading data from specific database + db.grantRolesToUser("username", [{db: "somedb", role: "read"}]) +``` + +See: +: - [db.grantRolesToUser documentation](https://www.mongodb.com/docs/manual/reference/method/db.grantRolesToUser) + - [MongoDB builtin roles](https://www.mongodb.com/docs/manual/reference/built-in-roles) diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/read.md b/mkdocs/docs/en/connection/db_connection/mongodb/read.md new file mode 100644 index 000000000..15cf34bb1 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/read.md @@ -0,0 +1,141 @@ +(mongodb-read)= + +# Reading from MongoDB using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom pipelines, e.g. aggregation. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mongodb-types` +``` + +## Supported DBReader features + +- ❌ `columns` (for now, all document fields are read) +- ✅︎ `where` (passed to `{"$match": ...}` aggregation pipeline) +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- - Note that `expression` field of HWM can only be a field name, not a custom expression +- ✅︎ `hint` (see [official documentation](https://www.mongodb.com/docs/v5.0/reference/operator/meta/hint/)) +- ✅︎ `df_schema` (mandatory) +- ✅︎ `options` (see {obj}`MongoDB.ReadOptions `) + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import MongoDB +from onetl.db import DBReader + +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, +) + +mongodb = MongoDB(...) 
+
+# mandatory
+df_schema = StructType(
+    [
+        StructField("_id", StringType()),
+        StructField("some", StringType()),
+        StructField(
+            "field",
+            StructType(
+                [
+                    StructField("nested", IntegerType()),
+                ],
+            ),
+        ),
+        StructField("updated_dt", TimestampType()),
+    ]
+)
+
+reader = DBReader(
+    connection=mongodb,
+    source="some_collection",
+    df_schema=df_schema,
+    where={"field": {"$eq": 123}},
+    hint={"field": 1},
+    options=MongoDB.ReadOptions(batchSize=10000),
+)
+df = reader.run()
+```
+
+Incremental strategy:
+
+```python
+from onetl.connection import MongoDB
+from onetl.db import DBReader
+from onetl.strategy import IncrementalStrategy
+
+from pyspark.sql.types import (
+    StructType,
+    StructField,
+    IntegerType,
+    StringType,
+    TimestampType,
+)
+
+mongodb = MongoDB(...)
+
+# mandatory
+df_schema = StructType(
+    [
+        StructField("_id", StringType()),
+        StructField("some", StringType()),
+        StructField(
+            "field",
+            StructType(
+                [
+                    StructField("nested", IntegerType()),
+                ],
+            ),
+        ),
+        StructField("updated_dt", TimestampType()),
+    ]
+)
+
+reader = DBReader(
+    connection=mongodb,
+    source="some_collection",
+    df_schema=df_schema,
+    where={"field": {"$eq": 123}},
+    hint={"field": 1},
+    hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"),
+    options=MongoDB.ReadOptions(batchSize=10000),
+)
+
+with IncrementalStrategy():
+    df = reader.run()
+```
+
+## Recommendations
+
+### Pay attention to `where` value
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where={"column": {"$eq": "value"}})` clause.
+This both reduces the amount of data sent from MongoDB to Spark, and may also improve performance of the query.
+Especially if there are indexes for columns used in `where` clause.
+
+## Read options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.mongodb.options
+```
+
+```{eval-rst}
+.. 
autopydantic_model:: MongoDBReadOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/types.md b/mkdocs/docs/en/connection/db_connection/mongodb/types.md new file mode 100644 index 000000000..b75673c26 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/types.md @@ -0,0 +1,269 @@ +(mongodb-types)= + +# MongoDB \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. +``` + +## Type detection & casting + +Spark's DataFrames always have a `schema` which is a list of fields with corresponding Spark types. All operations on a field are performed using field type. + +MongoDB is, by design, \_\_schemaless\_\_. So there are 2 ways how this can be handled: + +- User provides DataFrame schema explicitly: + + ```{eval-rst} + .. dropdown:: See example + + .. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] + ), + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) + ``` + +- Rely on MongoDB connector schema infer: + + ```python + df = mongodb.pipeline(collection="some_collection") + ``` + + In this case MongoDB connector read a sample of collection documents, and build DataFrame schema based on document fields and values. + +It is highly recommended to pass `df_schema` explicitly, to avoid type conversion issues. 
+ +### References + +Here you can find source code with type conversions: + +- [MongoDB -> Spark](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/InferSchema.java#L214-L260) +- [Spark -> MongoDB](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/RowToBsonDocumentConverter.java#L157-L260) + +## Supported types + +See [official documentation](https://www.mongodb.com/docs/manual/reference/bson-types/) + +### Numeric types + +```{eval-rst} ++---------------------+-----------------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=============================+======================+ +| `Decimal128` | `DecimalType(P=34, S=32)` | `Decimal128` | ++---------------------+-----------------------------+----------------------+ +| `-`` | `FloatType()` | `Double` | ++---------------------+-----------------------------+ | +| `Double` | `DoubleType()` | | ++---------------------+-----------------------------+----------------------+ +| `-`` | `ByteType()` | `Int32` | ++---------------------+-----------------------------+ | +| `-`` | `ShortType()` | | ++---------------------+-----------------------------+ | +| `Int32` | `IntegerType()` | | ++---------------------+-----------------------------+----------------------+ +| `Int64` | `LongType()` | `Int64` | ++---------------------+-----------------------------+----------------------+ +``` + +### Temporal types + +```{eval-rst} ++------------------------+-----------------------------------+-------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++========================+===================================+=========================+ +| `-`` | `DateType()`, days | `Date`, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| `Date`, milliseconds | `TimestampType()`, 
microseconds | `Date`, milliseconds, | +| | | **precision loss** [2]_ | ++------------------------+-----------------------------------+-------------------------+ +| `Timestamp`, seconds | `TimestampType()`, microseconds | `Date`, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| `-`` | `TimestampNTZType()` | unsupported | ++------------------------+-----------------------------------+ | +| `-`` | `DayTimeIntervalType()` | | ++------------------------+-----------------------------------+-------------------------+ +``` + +```{eval-rst} +.. warning:: + + Note that types in MongoDB and Spark have different value ranges: + + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MongoDB type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | `Date` | -290 million years | 290 million years | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +---------------+--------------------------------+--------------------------------+ | | | + | `Timestamp` | `1970-01-01 00:00:00` | `2106-02-07 09:28:16` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all values can be read from MongoDB to Spark, and can written from Spark DataFrame to MongoDB. + + References: + * `MongoDB Date type documentation `_ + * `MongoDB Timestamp documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-1]: MongoDB `Date` type has precision up to milliseconds (`23:59:59.999`). 
+ Inserting data with microsecond precision (`23:59:59.999999`) + will lead to **throwing away microseconds**. + +### String types + +Note: fields of deprecated MongoDB type `Symbol` are excluded during read. + +```{eval-rst} ++---------------------+------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+==================+======================+ +| `String` | `StringType()` | `String` | ++---------------------+ | | +| `Code` | | | ++---------------------+ | | +| `RegExp` | | | ++---------------------+------------------+----------------------+ +``` + +### Binary types + +| MongoDB type (read) | Spark type | MongoDB type (write) | +| ------------------- | --------------- | -------------------- | +| `Boolean` | `BooleanType()` | `Boolean` | +| `Binary` | `BinaryType()` | `Binary` | + +### Struct types + +```{eval-rst} ++---------------------+-----------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=======================+======================+ +| `Array[T]` | `ArrayType(T)` | `Array[T]` | ++---------------------+-----------------------+----------------------+ +| `Object[...]` | `StructType([...])` | `Object[...]` | ++---------------------+-----------------------+ | +| `-`` | `MapType(...)` | | ++---------------------+-----------------------+----------------------+ +``` + +### Special types + +```{eval-rst} ++---------------------+---------------------------------------------------------+---------------------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=========================================================+=======================================+ +| `ObjectId` | `StringType()` | `String` | ++---------------------+ | | +| `MaxKey` | | | ++---------------------+ | | +| `MinKey` | | | 
++---------------------+---------------------------------------------------------+---------------------------------------+ +| `Null` | `NullType()` | `Null` | ++---------------------+ | | +| `Undefined` | | | ++---------------------+---------------------------------------------------------+---------------------------------------+ +| `DBRef` | `StructType([$ref: StringType(), $id: StringType()])` | `Object[$ref: String, $id: String]` | ++---------------------+---------------------------------------------------------+---------------------------------------+ +``` + +## Explicit type cast + +### `DBReader` + +Currently it is not possible to cast field types using `DBReader`. But this can be done using `MongoDB.pipeline`. + +### `MongoDB.pipeline` + +You can use `$project` aggregation to cast field types: + +```python +from pyspark.sql.types import IntegerType, StructField, StructType + +from onetl.connection import MongoDB +from onetl.db import DBReader + +mongodb = MongoDB(...) + +df = mongodb.pipeline( + collection="my_collection", + pipeline=[ + { + "$project": { + # convert unsupported_field to string + "unsupported_field_str": { + "$convert": { + "input": "$unsupported_field", + "to": "string", + }, + }, + # skip unsupported_field from result + "unsupported_field": 0, + } + } + ], +) + +# cast field content to proper Spark type +df = df.select( + df.id, + df.supported_field, + # explicit cast + df.unsupported_field_str.cast("integer").alias("parsed_integer"), +) +``` + +### `DBWriter` + +Convert dataframe field to string on Spark side, and then write it to MongoDB: + +```python +df = df.select( + df.id, + df.unsupported_field.cast("string").alias("array_field_json"), +) + +writer.run(df) +``` diff --git a/mkdocs/docs/en/connection/db_connection/mongodb/write.md b/mkdocs/docs/en/connection/db_connection/mongodb/write.md new file mode 100644 index 000000000..5b44d8613 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mongodb/write.md @@ -0,0 +1,47 @@ 
+(mongodb-write)= + +# Writing to MongoDB using `DBWriter` + +For writing data to MongoDB, use {obj}`DBWriter `. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mongodb-types` +``` + +## Examples + +```python +from onetl.connection import MongoDB +from onetl.db import DBWriter + +mongodb = MongoDB(...) + +df = ... # data is here + +writer = DBWriter( + connection=mongodb, + target="schema.table", + options=MongoDB.WriteOptions( + if_exists="append", + ), +) + +writer.run(df) +``` + +## Write options + +Method above accepts {obj}`MongoDB.WriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mongodb.options +``` + +```{eval-rst} +.. autopydantic_model:: MongoDBWriteOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mssql/connection.md b/mkdocs/docs/en/connection/db_connection/mssql/connection.md new file mode 100644 index 000000000..e35ef6ed9 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/connection.md @@ -0,0 +1,12 @@ +(mssql-connection)= + +# MSSQL connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mssql.connection +``` + +```{eval-rst} +.. autoclass:: MSSQL + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/mssql/execute.md b/mkdocs/docs/en/connection/db_connection/mssql/execute.md new file mode 100644 index 000000000..278d84229 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/execute.md @@ -0,0 +1,117 @@ +(mssql-execute)= + +# Executing statements in MSSQL + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MSSQL.sql ` instead. 
+```
+
+## How to
+
+There are 2 ways to execute some statement in MSSQL
+
+### Use `MSSQL.fetch`
+
+Use this method to perform some `SELECT` query which returns **small number of rows**, like reading
+MSSQL config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts {obj}`MSSQL.FetchOptions `.
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`mssql-types`.
+```
+
+#### Syntax support
+
+This method supports **any** query syntax supported by MSSQL, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SELECT func(arg1, arg2)` - call function
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples
+
+```python
+from onetl.connection import MSSQL
+
+mssql = MSSQL(...)
+
+df = mssql.fetch(
+    "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+    options=MSSQL.FetchOptions(queryTimeout=10),
+)
+mssql.close()
+value = df.collect()[0][0]  # get value from first row and first column
+```
+
+### Use `MSSQL.execute`
+
+Use this method to execute DDL and DML operations. Each method call runs operation in a separate transaction, and then commits it.
+
+Method accepts {obj}`MSSQL.ExecuteOptions `.
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+#### Syntax support
+
+This method supports **any** query syntax supported by MSSQL, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ... AS SELECT ...`
+- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on
+- ✅︎ `EXEC procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure
+- ✅︎ `DECLARE ... BEGIN ... 
END` - execute T-SQL statement
+- ✅︎ other statements not mentioned here
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples
+
+```python
+from onetl.connection import MSSQL
+
+mssql = MSSQL(...)
+
+mssql.execute("DROP TABLE schema.table")
+mssql.execute(
+    """
+    CREATE TABLE schema.table (
+        id bigint IDENTITY(1,1),
+        key VARCHAR(4000),
+        value NUMERIC
+    )
+    """,
+    options=MSSQL.ExecuteOptions(queryTimeout=10),
+)
+```
+
+## Options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.mssql.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: MSSQLFetchOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+    :model-show-field-summary: false
+    :field-show-constraints: false
+```
+
+```{eval-rst}
+.. autopydantic_model:: MSSQLExecuteOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+    :model-show-field-summary: false
+    :field-show-constraints: false
+```
diff --git a/mkdocs/docs/en/connection/db_connection/mssql/index.md b/mkdocs/docs/en/connection/db_connection/mssql/index.md
new file mode 100644
index 000000000..16448c6a1
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/mssql/index.md
@@ -0,0 +1,28 @@
+(mssql)=
+
+# MSSQL
+
+```{toctree}
+:caption: Connection
+:maxdepth: 1
+
+prerequisites
+connection
+```
+
+```{toctree}
+:caption: Operations
+:maxdepth: 1
+
+read
+sql
+write
+execute
+```
+
+```{toctree}
+:caption: Troubleshooting
+:maxdepth: 1
+
+types
+```
diff --git a/mkdocs/docs/en/connection/db_connection/mssql/prerequisites.md b/mkdocs/docs/en/connection/db_connection/mssql/prerequisites.md
new file mode 100644
index 000000000..89e62f322
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/mssql/prerequisites.md
@@ -0,0 +1,76 @@
+(mssql-prerequisites)=
+
+# Prerequisites
+
+## Version Compatibility
+
+- SQL Server versions:
+  : - Officially declared: 2016 - 2022
+    - Actually tested: 2017, 2022
+- Spark versions: 2.3.x - 3.5.x
+- Java 
versions: 8 - 20 + +See [official documentation](https://learn.microsoft.com/en-us/sql/connect/jdbc/system-requirements-for-the-jdbc-driver) +and [official compatibility matrix](https://learn.microsoft.com/en-us/sql/connect/jdbc/microsoft-jdbc-driver-for-sql-server-support-matrix). + +## Installing PySpark + +To use MSSQL connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to MSSQL + +### Connection port + +Connection is usually performed to port 1433. Port may differ for different MSSQL instances. +Please ask your MSSQL administrator to provide required information. + +For named MSSQL instances (`instanceName` option), [port number is optional](https://learn.microsoft.com/en-us/sql/connect/jdbc/building-the-connection-url?view=sql-server-ver16#named-and-multiple-sql-server-instances), and could be omitted. + +### Connection host + +It is possible to connect to MSSQL by using either DNS name of host or it's IP address. + +If you're using MSSQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Required grants + +Ask your MSSQL cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. code-tab:: sql Read + Write (schema is owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema + GRANT ALTER ON username.mytable TO username; + + .. 
code-tab:: sql Read + Write (schema is not owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow managing tables in specific schema, and inserting data to tables + GRANT ALTER, SELECT, INSERT ON SCHEMA::someschema TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; +``` + +More details can be found in official documentation: +: - [GRANT ON DATABASE](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-database-permissions-transact-sql) + - [GRANT ON OBJECT](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-object-permissions-transact-sql) + - [GRANT ON SCHEMA](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-schema-permissions-transact-sql) diff --git a/mkdocs/docs/en/connection/db_connection/mssql/read.md b/mkdocs/docs/en/connection/db_connection/mssql/read.md new file mode 100644 index 000000000..924d10e2f --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/read.md @@ -0,0 +1,93 @@ +(mssql-read)= + +# Reading from MSSQL using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mssql-types` +``` + +## Supported DBReader features + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- ❌ `hint` (MSSQL does support hints, but DBReader not, at least for now) +- ❌ `df_schema` +- ✅︎ `options` (see {obj}`MSSQL.ReadOptions `) + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import MSSQL +from onetl.db import DBReader + +mssql = MSSQL(...) 
+ +reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import MSSQL +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +mssql = MSSQL(...) + +reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), + options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from MSSQL to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mssql.options +``` + +```{eval-rst} +.. 
autopydantic_model:: MSSQLReadOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mssql/sql.md b/mkdocs/docs/en/connection/db_connection/mssql/sql.md new file mode 100644 index 000000000..de932e2d3 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/sql.md @@ -0,0 +1,80 @@ +(mssql-sql)= + +# Reading from MSSQL using `MSSQL.sql` + +`MSSQL.sql` allows passing custom SQL query, but does not support incremental strategies. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mssql-types` +``` + +```{eval-rst} +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. +``` + +## Syntax support + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ❌ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples + +```python +from onetl.connection import MSSQL + +mssql = MSSQL(...) +df = mssql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MSSQL.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` + +## Recommendations + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from MSSQL to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. 
+Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mssql.options +``` + +```{eval-rst} +.. autopydantic_model:: MSSQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mssql/types.md b/mkdocs/docs/en/connection/db_connection/mssql/types.md new file mode 100644 index 000000000..286352de5 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/types.md @@ -0,0 +1,376 @@ +(mssql-types)= + +# MSSQL \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. +``` + +## Type detection & casting + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from MSSQL + +This is how MSSQL connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and MSSQL type. +- Find corresponding `MSSQL type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing MSSQL table + +This is how MSSQL connector performs this: + +- Get names of columns in DataFrame. [^footnote-1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. +- Find corresponding `Spark type` → `MSSQL type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
+- If `MSSQL type (write)` matches `MSSQL type (read)`, no additional casts will be performed, DataFrame column will be written to MSSQL as is.
+- If `MSSQL type (write)` does not match `MSSQL type (read)`, DataFrame column will be cast to target column type **on MSSQL side**.
+  For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision [^footnote-2].
+
+[^footnote-1]: This allows writing data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column,
+    it will be populated by MSSQL.
+
+[^footnote-2]: This is true only if DataFrame column is a `StringType()`, because text value is parsed automatically to target column type.
+
+    But other types cannot be silently converted, like `int -> text`. This requires explicit casting, see [DBWriter].
+
+### Create new table using Spark
+
+```{eval-rst}
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+```
+
+This is how MSSQL connector performs this:
+
+- Find corresponding `Spark type` → `MSSQL type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Generate DDL for creating table in MSSQL, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But in some cases this may lead to using wrong column type. For example, Spark creates column of type `datetime`
+which has precision up to milliseconds only,
+instead of more precise `datetime2(6)` (precision up to microseconds).
+This may lead to incidental precision loss, or sometimes data cannot be written to created table at all.
+
+So instead of relying on Spark to create tables:
+
+```{eval-rst}
+.. dropdown:: See example
+
+    .. 
code:: python + + writer = DBWriter( + connection=mssql, + target="myschema.target_tbl", + options=MSSQL.WriteOptions( + if_exists="append", + ), + ) + writer.run(df) +``` + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + mssql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value datetime2(6) -- specific type and precision + ) + """, + ) + + writer = DBWriter( + connection=mssql, + target="myschema.target_tbl", + options=MSSQL.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +### References + +Here you can find source code with type conversions: + +- [MSSQL -> JDBC](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerResultSetMetaData.java#L117-L170) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L135-L152) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L154-L163) +- [JDBC -> MSSQL](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/DataTypes.java#L625-L676) + +## Supported types + +See [official documentation](https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql) + +### Numeric types + +```{eval-rst} ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===============================+===================================+===============================+===============================+ +| `decimal` | `DecimalType(P=18, S=0)` | `decimal(P=18, S=0)` | `decimal(P=18, S=0)` | 
++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `real` | `FloatType()` | `real` | `real` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `float` | `DoubleType()` | `float` | `float` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `smallint` | `ShortType()` | `smallint` | `smallint` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `tinyint` | `IntegerType()` | `int` | `int` | ++-------------------------------+ | | | +| `int` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `bigint` | `LongType()` | `bigint` | `bigint` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +``` + +### Temporal types + +```{eval-rst} +.. note:: + + MSSQL `timestamp` type is alias for `rowversion` (see `Special types`_). It is not a temporal type! 
+``` + +```{eval-rst} ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++==========================================+======================================+===================================+===============================+ +| `date` | `DateType()` | `date` | `date` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `smalldatetime`, minutes | `TimestampType()`, microseconds | `datetime2(6)`, microseconds | `datetime`, milliseconds | ++------------------------------------------+ | | | +| `datetime`, milliseconds | | | | ++------------------------------------------+ | | | +| `datetime2(0)`, seconds | | | | ++------------------------------------------+ | | | +| `datetime2(3)`, milliseconds | | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `datetime2(6)`, microseconds | `TimestampType()`, microseconds | `datetime2(6)`, microseconds | `datetime`, milliseconds, | ++------------------------------------------+--------------------------------------+-----------------------------------+ **precision loss** [3]_ | +| `datetime2(7)`, 100s of nanoseconds | `TimestampType()`, microseconds, | `datetime2(6)`, microseconds, | | +| | **precision loss** [4]_ | **precision loss** [4]_ | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `time(0)`, seconds | `TimestampType()`, microseconds, | `datetime2(6)`, microseconds | `datetime`, milliseconds | ++------------------------------------------+ with time format quirks [5]_ | | | +| `time(3)`, milliseconds | | | | 
++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `time(6)`, microseconds | `TimestampType()`, microseconds, | `datetime2(6)`, microseconds | `datetime`, milliseconds, | ++ | with time format quirks [5]_ | | **precision loss** [3]_ | ++------------------------------------------+--------------------------------------+-----------------------------------+ + +| `time`, 100s of nanoseconds | `TimestampType()`, microseconds, | `datetime2(6)`, microseconds | | ++------------------------------------------+ **precision loss** [4]_, | **precision loss** [3]_ | | +| `time(7)`, 100s of nanoseconds | with time format quirks [5]_ | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `datetimeoffset` | `StringType()` | `nvarchar` | `nvarchar` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +``` + +```{eval-rst} +.. 
warning::
+
+    Note that types in MSSQL and Spark have different value ranges:
+
+    +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+
+    | MSSQL type        | Min value                      | Max value                      | Spark type          | Min value                      | Max value                      |
+    +===================+================================+================================+=====================+================================+================================+
+    | `smalldatetime`   | `1900-01-01 00:00:00`          | `2079-06-06 23:59:00`          | `TimestampType()`   | `0001-01-01 00:00:00.000000`   | `9999-12-31 23:59:59.999999`   |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | `datetime`        | `1753-01-01 00:00:00.000`      | `9999-12-31 23:59:59.997`      |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | `datetime2`       | `0001-01-01 00:00:00.000000`   | `9999-12-31 23:59:59.999999`   |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | `time`            | `00:00:00.0000000`             | `23:59:59.9999999`             |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+
+
+    So not all values in Spark DataFrame can be written to MSSQL.
+
+    References:
+    * `MSSQL date & time types documentation <https://learn.microsoft.com/en-us/sql/t-sql/functions/date-and-time-data-types-and-functions-transact-sql>`_
+    * `Spark DateType documentation <https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.DateType.html>`_
+    * `Spark TimestampType documentation <https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.TimestampType.html>`_
+```
+
+[^footnote-3]: MSSQL dialect for Spark generates DDL with type `datetime` which has precision up to milliseconds (`23:59:59.999`, 10{superscript}`-3` seconds).
+    Inserting data with microsecond and higher precision (`23:59:59.999999` .. `23:59:59.9999999`, 10{superscript}`-6` .. 10{superscript}`-7` seconds)
+    will lead to **throwing away microseconds**.
+ +[^footnote-4]: MSSQL support timestamp up to 100s of nanoseconds precision (`23:59:59.9999999999`, 10{superscript}`-7` seconds), + but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`, 10{superscript}`-6` seconds). + Last digit will be lost during read or write operations. + +[^footnote-5]: `time` type is the same as `datetime2` with date `1970-01-01`. So instead of reading data from MSSQL like `23:59:59.999999` + it is actually read `1970-01-01 23:59:59.999999`, and vice versa. + +### String types + +```{eval-rst} ++-------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===================+==================+====================+=====================+ +| `char` | `StringType()` | `nvarchar` | `nvarchar` | ++-------------------+ | | | +| `char(N)` | | | | ++-------------------+ | | | +| `nchar` | | | | ++-------------------+ | | | +| `nchar(N)` | | | | ++-------------------+ | | | +| `varchar` | | | | ++-------------------+ | | | +| `varchar(N)` | | | | ++-------------------+ | | | +| `nvarchar` | | | | ++-------------------+ | | | +| `nvarchar(N)` | | | | ++-------------------+ | | | +| `mediumtext` | | | | ++-------------------+ | | | +| `text` | | | | ++-------------------+ | | | +| `ntext` | | | | ++-------------------+ | | | +| `xml` | | | | ++-------------------+------------------+--------------------+---------------------+ +``` + +### Binary types + +```{eval-rst} ++--------------------+-------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++====================+===================+====================+=====================+ +| `bit` | `BooleanType()` | `bit` | `bit` | ++--------------------+-------------------+--------------------+---------------------+ +| `binary` | `BinaryType()` | `varbinary` | `varbinary` | 
++--------------------+ | | | +| `binary(N)` | | | | ++--------------------+ | | | +| `varbinary` | | | | ++--------------------+ | | | +| `varbinary(N)` | | | | ++--------------------+ | | | +| `image` | | | | ++--------------------+-------------------+--------------------+---------------------+ +``` + +### Special types + +```{eval-rst} ++---------------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===========================+==================+====================+=====================+ +| `geography` | `BinaryType()` | `varbinary` | `varbinary` | ++---------------------------+ | | | +| `geometry` | | | | ++---------------------------+ | | | +| `hierarchyid` | | | | ++---------------------------+ | | | +| `rowversion` | | | | ++---------------------------+------------------+--------------------+---------------------+ +| `sql_variant` | unsupported | | | ++---------------------------+------------------+--------------------+---------------------+ +| `sysname` | `StringType()` | `nvarchar` | `nvarchar` | ++---------------------------+ | | | +| `uniqueidentifier` | | | | ++---------------------------+------------------+--------------------+---------------------+ +``` + +## Explicit type cast + +### `DBReader` + +It is possible to explicitly cast column type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS text)` to convert data to string representation on MSSQL side, and so it will be read as Spark's `StringType()`: + +```python +from onetl.connection import MSSQL +from onetl.db import DBReader + +mssql = MSSQL(...) 
+
+reader = DBReader(
+    connection=mssql,
+    columns=[
+        "id",
+        "supported_column",
+        "CAST(unsupported_column AS text) unsupported_column_str",
+    ],
+)
+df = reader.run()
+
+# cast column content to proper Spark type
+df = df.select(
+    df.id,
+    df.supported_column,
+    # explicit cast
+    df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+)
+```
+
+### `DBWriter`
+
+Convert dataframe column to JSON using [to_json](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.to_json.html),
+and write it as `text` column in MSSQL:
+
+```python
+mssql.execute(
+    """
+    CREATE TABLE schema.target_tbl (
+        id bigint,
+        struct_column_json text -- any string type, actually
+    )
+    """,
+)
+
+from pyspark.sql.functions import to_json
+
+df = df.select(
+    df.id,
+    to_json(df.struct_column).alias("struct_column_json"),
+)
+
+writer.run(df)
+```
+
+Then you can parse this column on MSSQL side - for example, by creating a view:
+
+```sql
+SELECT
+    id,
+    JSON_VALUE(struct_column_json, '$.nested.field') AS nested_field
+FROM target_tbl
+```
+
+Or by using [computed column](https://learn.microsoft.com/en-us/sql/relational-databases/tables/specify-computed-columns-in-a-table):
+
+```sql
+CREATE TABLE schema.target_table (
+    id bigint,
+    supported_column datetime2(6),
+    struct_column_json text, -- any string type, actually
+    -- computed column
+    nested_field AS (JSON_VALUE(struct_column_json, '$.nested.field'))
+    -- or persisted column
+    -- nested_field AS (JSON_VALUE(struct_column_json, '$.nested.field')) PERSISTED
+)
+```
+
+By default, column value is calculated on every table read.
+Column marked as `PERSISTED` is calculated during insert, but this requires additional space.
diff --git a/mkdocs/docs/en/connection/db_connection/mssql/write.md b/mkdocs/docs/en/connection/db_connection/mssql/write.md new file mode 100644 index 000000000..0b6d8db19 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mssql/write.md @@ -0,0 +1,56 @@ +(mssql-write)= + +# Writing to MSSQL using `DBWriter` + +For writing data to MSSQL, use {obj}`DBWriter `. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mssql-types` +``` + +```{eval-rst} +.. warning:: + + It is always recommended to create table explicitly using :ref:`MSSQL.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. +``` + +## Examples + +```python +from onetl.connection import MSSQL +from onetl.db import DBWriter + +mssql = MSSQL(...) + +df = ... # data is here + +writer = DBWriter( + connection=mssql, + target="schema.table", + options=MSSQL.WriteOptions(if_exists="append"), +) + +writer.run(df) +``` + +## Options + +Method above accepts {obj}`MSSQL.WriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mssql.options +``` + +```{eval-rst} +.. autopydantic_model:: MSSQLWriteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mysql/connection.md b/mkdocs/docs/en/connection/db_connection/mysql/connection.md new file mode 100644 index 000000000..1b2ba94c3 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/connection.md @@ -0,0 +1,12 @@ +(mysql-connection)= + +# MySQL connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mysql.connection +``` + +```{eval-rst} +.. 
autoclass:: MySQL + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/mysql/execute.md b/mkdocs/docs/en/connection/db_connection/mysql/execute.md new file mode 100644 index 000000000..99c86be5b --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/execute.md @@ -0,0 +1,115 @@ +(mysql-execute)= + +# Executing statements in MySQL + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MySQL.sql ` instead. +``` + +## How to + +There are 2 ways to execute some statement in MySQL + +### Use `MySQL.fetch` + +Use this method to perform some `SELECT` query which returns **small number or rows**, like reading +MySQL config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts {obj}`MySQL.FetchOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`mysql-types`. +``` + +#### Syntax support + +This method supports **any** query syntax supported by MySQL, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{?= call func(arg1, arg2)}` - special syntax for calling function +- ✅︎ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import MySQL + +mysql = MySQL(...) + +df = mysql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MySQL.FetchOptions(queryTimeout=10), +) +mysql.close() +value = df.collect()[0][0] # get value from first row and first column +``` + +### Use `MySQL.execute` + +Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. + +Method accepts {obj}`MySQL.ExecuteOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support + +This method supports **any** query syntax supported by MySQL, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import MySQL + +mysql = MySQL(...) + +mysql.execute("DROP TABLE schema.table") +mysql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value float + ) + ENGINE = InnoDB + """, + options=MySQL.ExecuteOptions(queryTimeout=10), +) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mysql.options +``` + +```{eval-rst} +.. autopydantic_model:: MySQLFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + +``` + +```{eval-rst} +.. 
autopydantic_model:: MySQLExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/mysql/index.md b/mkdocs/docs/en/connection/db_connection/mysql/index.md new file mode 100644 index 000000000..6b69e1c1b --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/index.md @@ -0,0 +1,28 @@ +(mysql)= + +# MySQL + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/mysql/prerequisites.md b/mkdocs/docs/en/connection/db_connection/mysql/prerequisites.md new file mode 100644 index 000000000..6292e7761 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/prerequisites.md @@ -0,0 +1,61 @@ +(mysql-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- MySQL server versions: + : - Officially declared: 8.0 - 9.2 + - Actually tested: 5.7.13, 9.2.0 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://dev.mysql.com/doc/connector-j/en/connector-j-versions.html). + +## Installing PySpark + +To use MySQL connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to MySQL + +### Connection host + +It is possible to connect to MySQL by using either DNS name of host or it's IP address. + +If you're using MySQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Connection port + +Connection is usually performed to port 3306. Port may differ for different MySQL instances. 
+Please ask your MySQL administrator to provide required information. + +### Required grants + +Ask your MySQL cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE ON myschema.* TO username@'192.168.1.%'; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username@'192.168.1.%'; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username@'192.168.1.%'; +``` + +In example above `'192.168.1.%''` is a network subnet `192.168.1.0 - 192.168.1.255` +where Spark driver and executors are running. To allow connecting user from any IP, use `'%'` (not secure!). + +More details can be found in [official documentation](https://dev.mysql.com/doc/refman/en/grant.html). diff --git a/mkdocs/docs/en/connection/db_connection/mysql/read.md b/mkdocs/docs/en/connection/db_connection/mysql/read.md new file mode 100644 index 000000000..c3751dca2 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/read.md @@ -0,0 +1,93 @@ +(mysql-read)= + +# Reading from MySQL using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +```{eval-rst} +.. 
warning::
+
+    Please take into account :ref:`mysql-types`
+```
+
+## Supported DBReader features
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+- - ✅︎ {ref}`snapshot-strategy`
+- - ✅︎ {ref}`incremental-strategy`
+- - ✅︎ {ref}`snapshot-batch-strategy`
+- - ✅︎ {ref}`incremental-batch-strategy`
+- ✅︎ `hint` (see [official documentation](https://dev.mysql.com/doc/refman/en/optimizer-hints.html))
+- ❌ `df_schema`
+- ✅︎ `options` (see {obj}`MySQL.ReadOptions <onetl.connection.db_connection.mysql.options.MySQLReadOptions>`)
+
+## Examples
+
+Snapshot strategy:
+
+```python
+from onetl.connection import MySQL
+from onetl.db import DBReader
+
+mysql = MySQL(...)
+
+reader = DBReader(
+    connection=mysql,
+    source="schema.table",
+    columns=["id", "key", "CAST(value AS text) value", "updated_dt"],
+    where="key = 'something'",
+    hint="SKIP_SCAN(schema.table key_index)",
+    options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10),
+)
+df = reader.run()
+```
+
+Incremental strategy:
+
+```python
+from onetl.connection import MySQL
+from onetl.db import DBReader
+from onetl.strategy import IncrementalStrategy
+
+mysql = MySQL(...)
+
+reader = DBReader(
+    connection=mysql,
+    source="schema.table",
+    columns=["id", "key", "CAST(value AS text) value", "updated_dt"],
+    where="key = 'something'",
+    hint="SKIP_SCAN(schema.table key_index)",
+    hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"),
+    options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10),
+)
+
+with IncrementalStrategy():
+    df = reader.run()
+```
+
+## Recommendations
+
+### Select only required columns
+
+Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from MySQL to Spark.
+
+### Pay attention to `where` value
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause.
+This both reduces the amount of data sent from MySQL to Spark, and may also improve performance of the query.
+Especially if there are indexes for columns used in `where` clause.
+
+## Options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.mysql.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: MySQLReadOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+```
diff --git a/mkdocs/docs/en/connection/db_connection/mysql/sql.md b/mkdocs/docs/en/connection/db_connection/mysql/sql.md
new file mode 100644
index 000000000..7d2dd4e51
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/mysql/sql.md
@@ -0,0 +1,81 @@
+(mysql-sql)=
+
+# Reading from MySQL using `MySQL.sql`
+
+`MySQL.sql` allows passing custom SQL query, but does not support incremental strategies.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`mysql-types`
+```
+
+```{eval-rst}
+.. warning::
+
+    Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside,
+    they can change data in your database.
+```
+
+## Syntax support
+
+Only queries with the following syntax are supported:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ❌ `SHOW ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+## Examples
+
+```python
+from onetl.connection import MySQL
+
+mysql = MySQL(...)
+df = mysql.sql(
+    """
+    SELECT
+        id,
+        key,
+        CAST(value AS text) value,
+        updated_at
+    FROM
+        some.mytable
+    WHERE
+        key = 'something'
+    """,
+    options=MySQL.SQLOptions(
+        partitionColumn="id",
+        numPartitions=10,
+        lowerBound=0,
+        upperBound=1000,
+    ),
+)
+```
+
+## Recommendations
+
+### Select only required columns
+
+Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`.
+This reduces the amount of data passed from MySQL to Spark.
+ +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.mysql.options +``` + +```{eval-rst} +.. autopydantic_model:: MySQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/mysql/types.md b/mkdocs/docs/en/connection/db_connection/mysql/types.md new file mode 100644 index 000000000..4e3ae5d51 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/mysql/types.md @@ -0,0 +1,382 @@ +(mysql-types)= + +# MySQL \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. +``` + +## Type detection & casting + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from MySQL + +This is how MySQL connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and MySQL type. +- Find corresponding `MySQL type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing MySQL table + +This is how MySQL connector performs this: + +- Get names of columns in DataFrame. [^footnote-1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). 
For each found column get MySQL type. +- Find corresponding `Spark type` → `MySQL type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `MySQL type (write)` match `MySQL type (read)`, no additional casts will be performed, DataFrame column will be written to MySQL as is. +- If `MySQL type (write)` does not match `MySQL type (read)`, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision. + +[^footnote-1]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column, + it will be populated by MySQL. + +### Create new table using Spark + +```{eval-rst} +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! +``` + +This is how MySQL connector performs this: + +- Find corresponding `Spark type` → `MySQL type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in MySQL, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +But some cases this may lead to using wrong column type. For example, Spark creates column of type `timestamp` +which corresponds to MySQL type `timestamp(0)` (precision up to seconds) +instead of more precise `timestamp(6)` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=mysql, + target="myschema.target_tbl", + options=MySQL.WriteOptions( + if_exists="append", + createTableOptions="ENGINE = InnoDB", + ), + ) + writer.run(df) +``` + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +```{eval-rst} +.. 
dropdown:: See example + + .. code:: python + + mysql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value timestamp(6) -- specific type and precision + ) + ENGINE = InnoDB + """, + ) + + writer = DBWriter( + connection=mysql, + target="myschema.target_tbl", + options=MySQL.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +### References + +Here you can find source code with type conversions: + +- [MySQL -> JDBC](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L44-L623) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211) +- [JDBC -> MySQL](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L625-L867) + +## Supported types + +See [official documentation](https://dev.mysql.com/doc/refman/en/data-types.html) + +### Numeric types + +```{eval-rst} ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+===================================+===============================+===============================+ +| `decimal` | `DecimalType(P=10, S=0)` | `decimal(P=10, S=0)` | `decimal(P=10, S=0)` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `decimal(P=0..38, S=0..30)` | 
`DecimalType(P=0..38, S=0..30)` | `decimal(P=0..38, S=0..30)` | `decimal(P=0..38, S=0..30)` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `decimal(P=39..65, S=...)` | unsupported [2]_ | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `float` | `DoubleType()` | `double` | `double` | ++-------------------------------+ | | | +| `double` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `tinyint` | `IntegerType()` | `int` | `int` | ++-------------------------------+ | | | +| `smallint` | | | | ++-------------------------------+ | | | +| `mediumint` | | | | ++-------------------------------+ | | | +| `int` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| `bigint` | `LongType()` | `bigint` | `bigint` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +``` + +[^footnote-2]: MySQL support decimal types with precision `P` up to 65. + + But Spark's `DecimalType(P, S)` supports maximum `P=38`. It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
+ +### Temporal types + +```{eval-rst} ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================================+======================================+===================================+===============================+ +| `year` | `DateType()` | `date` | `date` | ++-----------------------------------+ | | | +| `date` | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `datetime`, seconds | `TimestampType()`, microseconds | `timestamp(6)`, microseconds | `timestamp(0)`, seconds | ++-----------------------------------+ | | | +| `timestamp`, seconds | | | | ++-----------------------------------+ | | | +| `datetime(0)`, seconds | | | | ++-----------------------------------+ | | | +| `timestamp(0)`, seconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `datetime(3)`, milliseconds | `TimestampType()`, microseconds | `timestamp(6)`, microseconds | `timestamp(0)`, seconds, | ++-----------------------------------+ | | **precision loss** [4]_, | +| `timestamp(3)`, milliseconds | | | | ++-----------------------------------+ | | | +| `datetime(6)`, microseconds | | | | ++-----------------------------------+ | | | +| `timestamp(6)`, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `time`, seconds | `TimestampType()`, microseconds, | `timestamp(6)`, microseconds | `timestamp(0)`, seconds | ++-----------------------------------+ with time format quirks [5]_ | | | +| `time(0)`, seconds | | | | 
++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| `time(3)`, milliseconds | `TimestampType()`, microseconds | `timestamp(6)`, microseconds | `timestamp(0)`, seconds, | ++-----------------------------------+ with time format quirks [5]_ | | **precision loss** [4]_, | +| `time(6)`, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +``` + +```{eval-rst} +.. warning:: + + Note that types in MySQL and Spark have different value ranges: + + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | `year` | `1901` | `2155` | `DateType()` | `0001-01-01` | `9999-12-31` | + +---------------+--------------------------------+--------------------------------+ | | | + | `date` | `1000-01-01` | `9999-12-31` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | `datetime` | `1000-01-01 00:00:00.000000` | `9999-12-31 23:59:59.499999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +---------------+--------------------------------+--------------------------------+ | | | + | `timestamp` | `1970-01-01 00:00:01.000000` | `9999-12-31 23:59:59.499999` | | | | + +---------------+--------------------------------+--------------------------------+ | | | + | `time` | `-838:59:59.000000` | `838:59:59.000000` | | | | + 
+---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So Spark can read all the values from MySQL, but not all of values in Spark DataFrame can be written to MySQL. + + References: + * `MySQL year documentation `_ + * `MySQL date, datetime & timestamp documentation `_ + * `MySQL time documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-3]: MySQL dialect generates DDL with MySQL type `timestamp` which is alias for `timestamp(0)` with precision up to seconds (`23:59:59`). + Inserting data with microseconds precision (`23:59:59.999999`) will lead to **throwing away microseconds**. + +[^footnote-4]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from MySQL like `23:59:59` + it is actually read `1970-01-01 23:59:59`, and vice versa. + +### String types + +```{eval-rst} ++-------------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+==================+====================+=====================+ +| `char` | `StringType()` | `longtext` | `longtext` | ++-------------------------------+ | | | +| `char(N)` | | | | ++-------------------------------+ | | | +| `varchar(N)` | | | | ++-------------------------------+ | | | +| `mediumtext` | | | | ++-------------------------------+ | | | +| `text` | | | | ++-------------------------------+ | | | +| `longtext` | | | | ++-------------------------------+ | | | +| `json` | | | | ++-------------------------------+ | | | +| `enum("val1", "val2", ...)` | | | | ++-------------------------------+ | | | +| `set("val1", "val2", ...)` | | | | ++-------------------------------+------------------+--------------------+---------------------+ +``` + +### Binary types + 
+```{eval-rst} ++-------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================+==================+====================+=====================+ +| `binary` | `BinaryType()` | `blob` | `blob` | ++-------------------+ | | | +| `binary(N)` | | | | ++-------------------+ | | | +| `varbinary(N)` | | | | ++-------------------+ | | | +| `mediumblob` | | | | ++-------------------+ | | | +| `blob` | | | | ++-------------------+ | | | +| `longblob` | | | | ++-------------------+------------------+--------------------+---------------------+ +``` + +### Geometry types + +```{eval-rst} ++------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++========================+==================+====================+=====================+ +| `point` | `BinaryType()` | `blob` | `blob` | ++------------------------+ | | | +| `linestring` | | | | ++------------------------+ | | | +| `polygon` | | | | ++------------------------+ | | | +| `geometry` | | | | ++------------------------+ | | | +| `multipoint` | | | | ++------------------------+ | | | +| `multilinestring` | | | | ++------------------------+ | | | +| `multipolygon` | | | | ++------------------------+ | | | +| `geometrycollection` | | | | ++------------------------+------------------+--------------------+---------------------+ +``` + +## Explicit type cast + +### `DBReader` + +It is possible to explicitly cast column type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS text)` to convert data to string representation on MySQL side, and so it will be read as Spark's `StringType()`. + +It is also possible to use [JSON_OBJECT](https://dev.mysql.com/doc/refman/en/json.html) MySQL function and parse JSON columns in MySQL with the {obj}`JSON.parse_column ` method. 
+
+```python
+from pyspark.sql.types import IntegerType, StructType, StructField
+
+from onetl.connection import MySQL
+from onetl.db import DBReader
+from onetl.file.format import JSON
+
+mysql = MySQL(...)
+
+reader = DBReader(
+    connection=mysql,
+    columns=[
+        "id",
+        "supported_column",
+        "CAST(unsupported_column AS text) unsupported_column_str",
+        # or
+        "JSON_OBJECT('key', value_column) json_column",
+    ],
+)
+df = reader.run()
+
+json_scheme = StructType([StructField("key", IntegerType())])
+
+df = df.select(
+    df.id,
+    df.supported_column,
+    # explicit cast
+    df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+    JSON().parse_column("json_column", json_scheme).alias("struct_column"),
+)
+```
+
+### `DBWriter`
+
+To write JSON data to a `json` or `text` column in a MySQL table, use the {obj}`JSON.serialize_column ` method.
+
+```python
+from onetl.connection import MySQL
+from onetl.db import DBWriter
+from onetl.file.format import JSON
+
+mysql.execute(
+    """
+    CREATE TABLE schema.target_tbl (
+        id bigint,
+        array_column_json json -- any string type, actually
+    )
+    ENGINE = InnoDB
+    """,
+)
+
+df = df.select(
+    df.id,
+    JSON().serialize_column(df.array_column).alias("array_column_json"),
+)
+
+writer.run(df)
+```
+
+Then you can parse this column on MySQL side - for example, by creating a view:
+
+```sql
+SELECT
+    id,
+    array_column_json->"$[0]" AS array_item
+FROM target_tbl
+```
+
+Or by using [GENERATED column](https://dev.mysql.com/doc/refman/en/create-table-generated-columns.html):
+
+```sql
+CREATE TABLE schema.target_table (
+    id bigint,
+    supported_column timestamp,
+    array_column_json json, -- any string type, actually
+    -- virtual column
+    array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]") VIRTUAL
+    -- or stored column
+    -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]") STORED
+)
+```
+
+`VIRTUAL` column value is calculated on every table read.
+`STORED` column value is calculated during insert, but this requires additional space.
diff --git a/mkdocs/docs/en/connection/db_connection/mysql/write.md b/mkdocs/docs/en/connection/db_connection/mysql/write.md
new file mode 100644
index 000000000..7f7f2a5d2
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/mysql/write.md
@@ -0,0 +1,60 @@
+(mysql-write)=
+
+# Writing to MySQL using `DBWriter`
+
+For writing data to MySQL, use {obj}`DBWriter `.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`mysql-types`
+```
+
+```{eval-rst}
+.. warning::
+
+    It is always recommended to create table explicitly using :ref:`MySQL.execute `
+    instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different precision and types than it is expected,
+    causing precision loss or other issues.
+```
+
+## Examples
+
+```python
+from onetl.connection import MySQL
+from onetl.db import DBWriter
+
+mysql = MySQL(...)
+
+df = ... # data is here
+
+writer = DBWriter(
+    connection=mysql,
+    target="schema.table",
+    options=MySQL.WriteOptions(
+        if_exists="append",
+        # ENGINE can be set explicitly via createTableOptions
+        createTableOptions="ENGINE = InnoDB",
+    ),
+)
+
+writer.run(df)
+```
+
+## Options
+
+Method above accepts {obj}`MySQL.WriteOptions `
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.mysql.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: MySQLWriteOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+    :model-show-field-summary: false
+    :field-show-constraints: false
+```
diff --git a/mkdocs/docs/en/connection/db_connection/oracle/connection.md b/mkdocs/docs/en/connection/db_connection/oracle/connection.md
new file mode 100644
index 000000000..26fc8a01e
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/oracle/connection.md
@@ -0,0 +1,12 @@
+(oracle-connection)=
+
+# Oracle connection
+
+```{eval-rst}
+.. 
currentmodule:: onetl.connection.db_connection.oracle.connection +``` + +```{eval-rst} +.. autoclass:: Oracle + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/oracle/execute.md b/mkdocs/docs/en/connection/db_connection/oracle/execute.md new file mode 100644 index 000000000..28b5baa39 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/oracle/execute.md @@ -0,0 +1,115 @@ +(oracle-execute)= + +# Executing statements in Oracle + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Oracle.sql ` instead. +``` + +## How to + +There are 2 ways to execute some statement in Oracle + +### Use `Oracle.fetch` + +Use this method to execute some `SELECT` query which returns **small number or rows**, like reading +Oracle config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts {obj}`Oracle.FetchOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`oracle-types`. +``` + +#### Syntax support + +This method supports **any** query syntax supported by Oracle, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2) FROM DUAL` - call function +- ✅︎ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Oracle + +oracle = Oracle(...) + +df = oracle.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Oracle.FetchOptions(queryTimeout=10), +) +oracle.close() +value = df.collect()[0][0] # get value from first row and first column +``` + +### Use `Oracle.execute` + +Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. + +Method accepts {obj}`Oracle.ExecuteOptions `. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support + +This method supports **any** query syntax supported by Oracle, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...` +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ `DECLARE ... BEGIN ... END` - execute PL/SQL statement +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Oracle + +oracle = Oracle(...) + +oracle.execute("DROP TABLE schema.table") +oracle.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=Oracle.ExecuteOptions(queryTimeout=10), +) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.oracle.options +``` + +```{eval-rst} +.. autopydantic_model:: OracleFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + +``` + +```{eval-rst} +.. 
autopydantic_model:: OracleExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/oracle/index.md b/mkdocs/docs/en/connection/db_connection/oracle/index.md new file mode 100644 index 000000000..5218cc287 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/oracle/index.md @@ -0,0 +1,28 @@ +(oracle)= + +# Oracle + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/oracle/prerequisites.md b/mkdocs/docs/en/connection/db_connection/oracle/prerequisites.md new file mode 100644 index 000000000..5a350937b --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/oracle/prerequisites.md @@ -0,0 +1,108 @@ +(oracle-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Oracle Server versions: + : - Officially declared: 19c, 21c, 23ai + - Actually tested: 11.2, 23.5 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://www.oracle.com/cis/database/technologies/appdev/jdbc-downloads.html). + +## Installing PySpark + +To use Oracle connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to Oracle + +### Connection port + +Connection is usually performed to port 1521. Port may differ for different Oracle instances. +Please ask your Oracle administrator to provide required information. + +### Connection host + +It is possible to connect to Oracle by using either DNS name of host or it's IP address. + +If you're using Oracle cluster, it is currently possible to connect only to **one specific node**. 
+Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Connect as proxy user + +It is possible to connect to database as another user without knowing this user password. + +This can be enabled by granting user a special `CONNECT THROUGH` permission: + +```sql +ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; +``` + +Then you can connect to Oracle using credentials of `proxy_user` but specify that you need permissions of `schema_owner`: + +```python +oracle = Oracle( + ..., + user="proxy_user[schema_owner]", + password="proxy_user password", +) +``` + +See [official documentation](https://oracle-base.com/articles/misc/proxy-users-and-connect-through). + +### Required grants + +Ask your Oracle cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. code-tab:: sql Read + Write (schema is owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in user schema + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + .. code-tab:: sql Read + Write (schema is not owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT CREATE ANY TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON someschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT DROP ANY TABLE TO username; + + .. 
code-tab:: sql Read only
+
+        -- allow user to log in
+        GRANT CREATE SESSION TO username;
+
+        -- allow read access to specific table
+        GRANT SELECT ON someschema.mytable TO username;
+```
+
+More details can be found in official documentation:
+: - [GRANT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/GRANT.html)
+  - [SELECT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/SELECT.html)
+  - [CREATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/CREATE-TABLE.html)
+  - [INSERT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/INSERT.html)
+  - [TRUNCATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/TRUNCATE-TABLE.html)
diff --git a/mkdocs/docs/en/connection/db_connection/oracle/read.md b/mkdocs/docs/en/connection/db_connection/oracle/read.md
new file mode 100644
index 000000000..4f11e86e7
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/oracle/read.md
@@ -0,0 +1,93 @@
+(oracle-read)=
+
+# Reading from Oracle using `DBReader`
+
+{obj}`DBReader ` supports {ref}`strategy` for incremental data reading,
+but does not support custom queries, like `JOIN`.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`oracle-types`
+```
+
+## Supported DBReader features
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+- - ✅︎ {ref}`snapshot-strategy`
+- - ✅︎ {ref}`incremental-strategy`
+- - ✅︎ {ref}`snapshot-batch-strategy`
+- - ✅︎ {ref}`incremental-batch-strategy`
+- ✅︎ `hint` (see [official documentation](https://docs.oracle.com/cd/B10500_01/server.920/a96533/hintsref.htm))
+- ❌ `df_schema`
+- ✅︎ `options` (see {obj}`Oracle.ReadOptions `)
+
+## Examples
+
+Snapshot strategy:
+
+```python
+from onetl.connection import Oracle
+from onetl.db import DBReader
+
+oracle = Oracle(...)
+ +reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Oracle +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +oracle = Oracle(...) + +reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), + options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.oracle.options +``` + +```{eval-rst} +.. 
autopydantic_model:: OracleReadOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/oracle/sql.md b/mkdocs/docs/en/connection/db_connection/oracle/sql.md new file mode 100644 index 000000000..73d82e739 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/oracle/sql.md @@ -0,0 +1,81 @@ +(oracle-sql)= + +# Reading from Oracle using `Oracle.sql` + +`Oracle.sql` allows passing custom SQL query, but does not support incremental strategies. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`oracle-types` +``` + +```{eval-rst} +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. +``` + +## Syntax support + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples + +```python +from onetl.connection import Oracle + +oracle = Oracle(...) +df = oracle.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR2(4000)) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Oracle.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` + +## Recommendations + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Oracle to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+
+## Options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.oracle.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: OracleSQLOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+    :model-show-field-summary: false
+    :field-show-constraints: false
+```
diff --git a/mkdocs/docs/en/connection/db_connection/oracle/types.md b/mkdocs/docs/en/connection/db_connection/oracle/types.md
new file mode 100644
index 000000000..329e89ff5
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/oracle/types.md
@@ -0,0 +1,400 @@
+(oracle-types)=
+
+# Oracle \<-> Spark type mapping
+
+```{eval-rst}
+.. note::
+
+    The results below are valid for Spark 3.5.5, and may differ on other Spark versions.
+```
+
+## Type detection & casting
+
+Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+### Reading from Oracle
+
+This is how Oracle connector performs this:
+
+- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Oracle type.
+- Find corresponding `Oracle type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Create DataFrame from query with specific column names and Spark types.
+
+### Writing to some existing Oracle table
+
+This is how Oracle connector performs this:
+
+- Get names of columns in DataFrame. [^footnote-1]
+- Perform `SELECT * FROM table LIMIT 0` query.
+- Take only columns present in DataFrame (by name, case insensitive). For each found column get Oracle type.
+- **Find corresponding** `Oracle type (read)` → `Spark type` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [^footnote-2]
+- Find corresponding `Spark type` → `Oracle type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- If `Oracle type (write)` match `Oracle type (read)`, no additional casts will be performed, DataFrame column will be written to Oracle as is. +- If `Oracle type (write)` does not match `Oracle type (read)`, DataFrame column will be casted to target column type **on Oracle side**. + For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision. + +[^footnote-1]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column, + it will be populated by Oracle. + +[^footnote-2]: Yes, this is weird. + +### Create new table using Spark + +```{eval-rst} +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! +``` + +This is how Oracle connector performs this: + +- Find corresponding `Spark type` → `Oracle type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Oracle, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +But Oracle connector support only limited number of types and almost no custom clauses (like `PARTITION BY`, `INDEX`, etc). +So instead of relying on Spark to create tables: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=oracle, + target="public.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +```{eval-rst} +.. dropdown:: See example + + .. 
code:: python + + oracle.execute( + """ + CREATE TABLE username.table ( + id NUMBER, + business_dt TIMESTAMP(6), + value VARCHAR2(2000) + ) + """, + ) + + writer = DBWriter( + connection=oracle, + target="public.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +See Oracle [CREATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/SELECT.html) documentation. + +## Supported types + +### References + +See [List of Oracle types](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/Data-Types.html). + +Here you can find source code with type conversions: + +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L83-L109) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L111-L123) + +### Numeric types + +```{eval-rst} ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==================================+===================================+===============================+===========================+ +| `NUMBER` | `DecimalType(P=38, S=10)` | `NUMBER(P=38, S=10)` | `NUMBER(P=38, S=10)` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `NUMBER(P=0..38)` | `DecimalType(P=0..38, S=0)` | `NUMBER(P=0..38, S=0)` | `NUMBER(P=38, S=0)` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `NUMBER(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `NUMBER(P=0..38, S=0..38)` | `NUMBER(P=38, S=0..38)` | 
++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `NUMBER(P=..., S=-127..-1)` | unsupported [3]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `FLOAT` | `DecimalType(P=38, S=10)` | `NUMBER(P=38, S=10)` | `NUMBER(P=38, S=10)` | ++----------------------------------+ | | | +| `FLOAT(N)` | | | | ++----------------------------------+ | | | +| `REAL` | | | | ++----------------------------------+ | | | +| `DOUBLE PRECISION` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `BINARY_FLOAT` | `FloatType()` | `NUMBER(P=19, S=4)` | `NUMBER(P=19, S=4)` | ++----------------------------------+-----------------------------------+ | | +| `BINARY_DOUBLE` | `DoubleType()` | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `SMALLINT` | `DecimalType(P=38, S=0)` | `NUMBER(P=38, S=0)` | `NUMBER(P=38, S=0)` | ++----------------------------------+ | | | +| `INTEGER` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| `LONG` | `StringType()` | `CLOB` | `CLOB` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +``` + +[^footnote-3]: Oracle support decimal types with negative scale, like `NUMBER(38, -10)`. Spark doesn't. 
+ +### Temporal types + +```{eval-rst} ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++============================================+====================================+=================================+=================================+ +| `DATE`, days | `TimestampType()`, microseconds | `TIMESTAMP(6)`, microseconds | `TIMESTAMP(6)`, microseconds | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| `TIMESTAMP`, microseconds | `TimestampType()`, microseconds | `TIMESTAMP(6)`, microseconds | `TIMESTAMP(6)`, microseconds | ++--------------------------------------------+ | | | +| `TIMESTAMP(0)`, seconds | | | | ++--------------------------------------------+ | | | +| `TIMESTAMP(3)`, milliseconds | | | | ++--------------------------------------------+ | | | +| `TIMESTAMP(6)`, microseconds | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| `TIMESTAMP(9)`, nanoseconds | `TimestampType()`, microseconds, | `TIMESTAMP(6)`, microseconds, | `TIMESTAMP(6)`, microseconds, | +| | **precision loss** [4]_ | **precision loss** | **precision loss** | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| `TIMESTAMP WITH TIME ZONE` | unsupported | | | ++--------------------------------------------+ | | | +| `TIMESTAMP(N) WITH TIME ZONE` | | | | ++--------------------------------------------+ | | | +| `TIMESTAMP WITH LOCAL TIME ZONE` | | | | ++--------------------------------------------+ | | | +| `TIMESTAMP(N) WITH LOCAL TIME ZONE` | | | | 
++--------------------------------------------+ | | | +| `INTERVAL YEAR TO MONTH` | | | | ++--------------------------------------------+ | | | +| `INTERVAL DAY TO SECOND` | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +``` + +```{eval-rst} +.. warning:: + + Note that types in Oracle and Spark have different value ranges: + + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Oracle type | Min value | Max value | Spark type | Min value | Max value | + +===============+====================================+===================================+=====================+================================+================================+ + | `date` | `-4712-01-01` | `9999-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | `timestamp` | `-4712-01-01 00:00:00.000000000` | `9999-12-31 23:59:59.999999999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Oracle to Spark. + + References: + * `Oracle date, timestamp and intervals documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-4]: Oracle support timestamp up to nanoseconds precision (`23:59:59.999999999`), + but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`). + Nanoseconds will be lost during read or write operations. 
+ +### String types + +```{eval-rst} ++-----------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=============================+==================+=====================+======================+ +| `CHAR` | `StringType()` | `CLOB` | `CLOB` | ++-----------------------------+ | | | +| `CHAR(N CHAR)` | | | | ++-----------------------------+ | | | +| `CHAR(N BYTE)` | | | | ++-----------------------------+ | | | +| `NCHAR` | | | | ++-----------------------------+ | | | +| `NCHAR(N)` | | | | ++-----------------------------+ | | | +| `VARCHAR(N)` | | | | ++-----------------------------+ | | | +| `LONG VARCHAR` | | | | ++-----------------------------+ | | | +| `VARCHAR2(N CHAR)` | | | | ++-----------------------------+ | | | +| `VARCHAR2(N BYTE)` | | | | ++-----------------------------+ | | | +| `NVARCHAR2(N)` | | | | ++-----------------------------+ | | | +| `CLOB` | | | | ++-----------------------------+ | | | +| `NCLOB` | | | | ++-----------------------------+------------------+---------------------+----------------------+ +``` + +### Binary types + +```{eval-rst} ++--------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==========================+==================+=====================+======================+ +| `RAW(N)` | `BinaryType()` | `BLOB` | `BLOB` | ++--------------------------+ | | | +| `LONG RAW` | | | | ++--------------------------+ | | | +| `BLOB` | | | | ++--------------------------+------------------+---------------------+----------------------+ +| `BFILE` | unsupported | | | ++--------------------------+------------------+---------------------+----------------------+ +``` + +### Struct types + +```{eval-rst} ++-------------------------------------+------------------+---------------------+----------------------+ +| 
Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=====================================+==================+=====================+======================+ +| `XMLType` | `StringType()` | `CLOB` | `CLOB` | ++-------------------------------------+ | | | +| `URIType` | | | | ++-------------------------------------+ | | | +| `DBURIType` | | | | ++-------------------------------------+ | | | +| `XDBURIType` | | | | ++-------------------------------------+ | | | +| `HTTPURIType` | | | | ++-------------------------------------+ | | | +| `CREATE TYPE ... AS OBJECT (...)` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ +| `JSON` | unsupported | | | ++-------------------------------------+ | | | +| `CREATE TYPE ... AS VARRAY ...` | | | | ++-------------------------------------+ | | | +| `CREATE TYPE ... AS TABLE OF ...` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ +``` + +### Special types + +```{eval-rst} ++--------------------+-------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++====================+===================+=====================+======================+ +| `BOOLEAN` | `BooleanType()` | `BOOLEAN` | `NUMBER(P=1, S=0)` | ++--------------------+-------------------+---------------------+----------------------+ +| `ROWID` | `StringType()` | `CLOB` | `CLOB` | ++--------------------+ | | | +| `UROWID` | | | | ++--------------------+ | | | +| `UROWID(N)` | | | | ++--------------------+-------------------+---------------------+----------------------+ +| `ANYTYPE` | unsupported | | | ++--------------------+ | | | +| `ANYDATA` | | | | ++--------------------+ | | | +| `ANYDATASET` | | | | ++--------------------+-------------------+---------------------+----------------------+ +``` + +## Explicit type cast + +### 
`DBReader`
+
+It is possible to explicitly cast column of unsupported type using `DBReader(columns=...)` syntax.
+
+For example, you can use `CAST(column AS CLOB)` to convert data to string representation on Oracle side, and so it will be read as Spark's `StringType()`.
+
+It is also possible to use [JSON_ARRAY](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_ARRAY.html)
+or [JSON_OBJECT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_OBJECT.html) Oracle functions
+to convert column of any type to string representation. This JSON string can then be effectively parsed using the {obj}`JSON.parse_column ` method.
+
+```python
+from onetl.file.format import JSON
+from pyspark.sql.types import IntegerType, StructType, StructField
+
+from onetl.connection import Oracle
+from onetl.db import DBReader
+
+oracle = Oracle(...)
+
+reader = DBReader(
+    connection=oracle,
+    columns=[
+        "id",
+        "supported_column",
+        "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str",
+        # or
+        "JSON_ARRAY(array_column) array_column_json",
+    ],
+)
+df = reader.run()
+
+json_scheme = StructType([StructField("key", IntegerType())])
+
+df = df.select(
+    df.id,
+    df.supported_column,
+    df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+    JSON().parse_column("array_column_json", json_scheme).alias("array_column"),
+)
+```
+
+### `DBWriter`
+
+It is always possible to convert data on Spark side to string, and then write it to text column in Oracle table.
+
+To serialize and write JSON data to a `text` or `json` column in an Oracle table use the {obj}`JSON.serialize_column ` method.
+
+```python
+from onetl.connection import Oracle
+from onetl.db import DBWriter
+from onetl.file.format import JSON
+
+oracle = Oracle(...)
+ +oracle.execute( + """ + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000) -- any string type, actually + ) + """, +) + +write_df = df.select( + df.id, + df.supported_column, + JSON().serialize_column(df.unsupported_column).alias("array_column_json"), +) + +writer = DBWriter( + connection=oracle, + target="schema.target_table", +) +writer.run(write_df) +``` + +Then you can parse this column on Oracle side - for example, by creating a view: + +```sql +SELECT + id, + supported_column, + JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0 +FROM + schema.target_table +``` + +Or by using [VIRTUAL column](https://oracle-base.com/articles/11g/virtual-columns-11gr1): + +```sql +CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000), -- any string type, actually + array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL +) +``` + +But data will be parsed on each table read in any case, as Oracle does no support `GENERATED ALWAYS AS (...) STORED` columns. diff --git a/mkdocs/docs/en/connection/db_connection/oracle/write.md b/mkdocs/docs/en/connection/db_connection/oracle/write.md new file mode 100644 index 000000000..09e021c79 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/oracle/write.md @@ -0,0 +1,56 @@ +(oracle-write)= + +# Writing to Oracle using `DBWriter` + +For writing data to Oracle, use {obj}`DBWriter `. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`oracle-types` +``` + +```{eval-rst} +.. warning:: + + It is always recommended to create table explicitly using :ref:`Oracle.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. 
+``` + +## Examples + +```python +from onetl.connection import Oracle +from onetl.db import DBWriter + +oracle = Oracle(...) + +df = ... # data is here + +writer = DBWriter( + connection=oracle, + target="schema.table", + options=Oracle.WriteOptions(if_exists="append"), +) + +writer.run(df) +``` + +## Options + +Method above accepts {obj}`OracleWriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.oracle.options +``` + +```{eval-rst} +.. autopydantic_model:: OracleWriteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/connection.md b/mkdocs/docs/en/connection/db_connection/postgres/connection.md new file mode 100644 index 000000000..80c59021c --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/connection.md @@ -0,0 +1,12 @@ +(postgres-connection)= + +# Postgres connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.postgres.connection +``` + +```{eval-rst} +.. autoclass:: Postgres + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/execute.md b/mkdocs/docs/en/connection/db_connection/postgres/execute.md new file mode 100644 index 000000000..6964b43e9 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/execute.md @@ -0,0 +1,113 @@ +(postgres-execute)= + +# Executing statements in Postgres + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Postgres.sql ` instead. 
+```
+
+## How to
+
+There are 2 ways to execute some statement in Postgres:
+
+### Use `Postgres.fetch`
+
+Use this method to execute some `SELECT` query which returns **small number of rows**, like reading
+Postgres config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts {obj}`Postgres.FetchOptions `.
+
+Connection opened using this method should then be closed with `connection.close()` or `with connection:`.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`postgres-types`.
+```
+
+#### Syntax support
+
+This method supports **any** query syntax supported by Postgres, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples
+
+```python
+from onetl.connection import Postgres
+
+postgres = Postgres(...)
+
+df = postgres.fetch(
+    "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+    options=Postgres.FetchOptions(queryTimeout=10),
+)
+postgres.close()
+value = df.collect()[0][0]  # get value from first row and first column
+```
+
+### Use `Postgres.execute`
+
+Use this method to execute DDL and DML operations. Each method call runs operation in a separate transaction, and then commits it.
+
+Method accepts {obj}`Postgres.ExecuteOptions `.
+
+Connection opened using this method should then be closed with `connection.close()` or `with connection:`.
+
+#### Syntax support
+
+This method supports **any** query syntax supported by Postgres, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ...
SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Postgres + +postgres = Postgres(...) + +postgres.execute("DROP TABLE schema.table") +postgres.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key text, + value real + ) + """, + options=Postgres.ExecuteOptions(queryTimeout=10), +) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.postgres.options +``` + +```{eval-rst} +.. autopydantic_model:: PostgresFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + +``` + +```{eval-rst} +.. autopydantic_model:: PostgresExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/index.md b/mkdocs/docs/en/connection/db_connection/postgres/index.md new file mode 100644 index 000000000..56442bfb0 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/index.md @@ -0,0 +1,28 @@ +(postgres)= + +# Postgres + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` + +```{toctree} +:caption: Troubleshooting +:maxdepth: 1 + +types +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/prerequisites.md b/mkdocs/docs/en/connection/db_connection/postgres/prerequisites.md new file mode 100644 index 000000000..d2221552d --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/prerequisites.md @@ -0,0 +1,71 @@ +(postgres-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- 
PostgreSQL server versions: + : - Officially declared: 8.2 - 17 + - Actually tested: 9.4.26, 17.3 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://jdbc.postgresql.org/). + +## Installing PySpark + +To use Postgres connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to Postgres + +### Allowing connection to Postgres instance + +Ask your Postgres administrator to allow your user (and probably IP) to connect to instance, +e.g. by updating `pg_hba.conf` file. + +See [official documentation](https://www.postgresql.org/docs/current/auth-pg-hba-conf.html). + +### Connection port + +Connection is usually performed to port 5432. Port may differ for different Postgres instances. +Please ask your Postgres administrator to provide required information. + +### Connection host + +It is possible to connect to Postgres by using either DNS name of host or it's IP address. + +If you're using Postgres cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Required grants + +Ask your Postgres cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in specific schema + GRANT USAGE, CREATE ON SCHEMA myschema TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + GRANT TRUNCATE ON myschema.mytable TO username; + + .. 
code-tab:: sql Read only + + -- allow creating tables in specific schema + GRANT USAGE ON SCHEMA myschema TO username; + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; +``` + +More details can be found in [official documentation](https://www.postgresql.org/docs/current/sql-grant.html). diff --git a/mkdocs/docs/en/connection/db_connection/postgres/read.md b/mkdocs/docs/en/connection/db_connection/postgres/read.md new file mode 100644 index 000000000..71db1908c --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/read.md @@ -0,0 +1,91 @@ +(postgres-read)= + +# Reading from Postgres using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`postgres-types` +``` + +## Supported DBReader features + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- ❌ `hint` (is not supported by Postgres) +- ❌ `df_schema` +- ✅︎ `options` (see {obj}`Postgres.ReadOptions `) + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import Postgres +from onetl.db import DBReader + +postgres = Postgres(...) + +reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Postgres +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +postgres = Postgres(...) 
+ +reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Postgres to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.postgres.options +``` + +```{eval-rst} +.. autopydantic_model:: PostgresReadOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/sql.md b/mkdocs/docs/en/connection/db_connection/postgres/sql.md new file mode 100644 index 000000000..adceab0dc --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/sql.md @@ -0,0 +1,80 @@ +(postgres-sql)= + +# Reading from Postgres using `Postgres.sql` + +`Postgres.sql` allows passing custom SQL query, but does not support incremental strategies. + +```{eval-rst} +.. warning:: + + Please take into account :ref:`postgres-types` +``` + +```{eval-rst} +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. 
+``` + +## Syntax support + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples + +```python +from onetl.connection import Postgres + +postgres = Postgres(...) +df = postgres.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Postgres.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` + +## Recommendations + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Postgres to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.postgres.options +``` + +```{eval-rst} +.. autopydantic_model:: PostgresSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/postgres/types.md b/mkdocs/docs/en/connection/db_connection/postgres/types.md new file mode 100644 index 000000000..4fe06220e --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/postgres/types.md @@ -0,0 +1,490 @@ +(postgres-types)= + +# Postgres \<-> Spark type mapping + +```{eval-rst} +.. note:: + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. 
+```
+
+## Type detection & casting
+
+Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+### Reading from Postgres
+
+This is how Postgres connector performs this:
+
+- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Postgres type.
+- Find corresponding `Postgres type (read)` → `Spark type` combination (see below) for each DataFrame column [^footnote-1]. If no combination is found, raise exception.
+- Create DataFrame from query with specific column names and Spark types.
+
+[^footnote-1]: All Postgres types that don't have a corresponding Java type are converted to `String`.
+
+### Writing to some existing Postgres table
+
+This is how Postgres connector performs this:
+
+- Get names of columns in DataFrame. [^footnote-2]
+- Perform `SELECT * FROM table LIMIT 0` query.
+- Take only columns present in DataFrame (by name, case insensitive) [^footnote-2]. For each found column get Postgres type.
+- Find corresponding `Spark type` → `Postgres type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- If `Postgres type (write)` match `Postgres type (read)`, no additional casts will be performed, DataFrame column will be written to Postgres as is.
+- If `Postgres type (write)` does not match `Postgres type (read)`, DataFrame column will be casted to target column type **on Postgres side**.
+  For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision [^footnote-3].
+
+[^footnote-2]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column,
+    it will be populated by Postgres.
+
+[^footnote-3]: This is true only if either DataFrame column is a `StringType()`, or target column is `text` type.
+ + But other types cannot be silently converted, like `bytea -> bit(N)`. This requires explicit casting, see [Manual conversion to string]. + +### Create new table using Spark + +```{eval-rst} +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! +``` + +This is how Postgres connector performs this: + +- Find corresponding `Spark type` → `Postgres type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Postgres, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +But Postgres connector support only limited number of types and almost no custom clauses (like `PARTITION BY`, `INDEX`, etc). +So instead of relying on Spark to create tables: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=postgres, + target="public.table", + options=Postgres.WriteOptions( + if_exists="append", + createTableOptions="PARTITION BY RANGE (id)", + ), + ) + writer.run(df) +``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +```{eval-rst} +.. dropdown:: See example + + .. code:: python + + postgres.execute( + """ + CREATE TABLE public.table ( + id bigint, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (Id) + """, + ) + + writer = DBWriter( + connection=postgres, + target="public.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + writer.run(df) +``` + +See Postgres [CREATE TABLE](https://www.postgresql.org/docs/current/sql-createtable.html) documentation. + +## Supported types + +### References + +See [List of Postgres types](https://www.postgresql.org/docs/current/datatype.html). 
+ +Here you can find source code with type conversions: + +- [Postgres \<-> JDBC](https://github.com/pgjdbc/pgjdbc/blob/REL42.6.0/pgjdbc/src/main/java/org/postgresql/jdbc/TypeInfoCache.java#L78-L112) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L52-L108) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L118-L132) + +### Numeric types + +```{eval-rst} ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==================================+===================================+===============================+=========================+ +| `decimal` | `DecimalType(P=38, S=18)` | `decimal(P=38, S=18)` | `decimal` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| `decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `decimal(P=39.., S=0..)` | unsupported [4]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `decimal(P=.., S=..-1)` | unsupported [5]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| `real` | `FloatType()` | `real` | `real` | 
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+| `double precision` | `DoubleType()` | `double precision` | `double precision` |
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+| `smallint` | `ShortType()` | `smallint` | `smallint` |
++----------------------------------+-----------------------------------+ | |
+| `-`` | `ByteType()` | | |
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+| `integer` | `IntegerType()` | `integer` | `integer` |
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+| `bigint` | `LongType()` | `bigint` | `bigint` |
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+| `money` | `StringType()` [1]_ | `text` | `text` |
++----------------------------------+ | | |
+| `int4range` | | | |
++----------------------------------+ | | |
+| `int8range` | | | |
++----------------------------------+ | | |
+| `numrange` | | | |
++----------------------------------+ | | |
+| `int2vector` | | | |
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+
+```
+
+[^footnote-4]: Postgres supports decimal types with unlimited precision.
+
+    But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision;
+    this leads to an exception.
+
+[^footnote-5]: Postgres supports decimal types with negative scale, like `decimal(38, -10)`. Spark doesn't.
+ +### Temporal types + +```{eval-rst} ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++====================================+==============================+=======================+=========================+ +| `date` | `DateType()` | `date` | `date` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `time` | `TimestampType()`, | `timestamp(6)` | `timestamp(6)` | ++------------------------------------+ with time format quirks [6]_ | | | +| `time(0..6)` | | | | ++------------------------------------+ | | | +| `time with time zone` | | | | ++------------------------------------+ | | | +| `time(0..6) with time zone` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `timestamp` | `TimestampType()` | `timestamp(6)` | `timestamp(6)` | ++------------------------------------+ | | | +| `timestamp(0..6)` | | | | ++------------------------------------+ | | | +| `timestamp with time zone` | | | | ++------------------------------------+ | | | +| `timestamp(0..6) with time zone` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `-`` | `TimestampNTZType()` | `timestamp(6)` | `timestamp(6)` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `interval` of any precision | `StringType()` [1]_ | `text` | `text` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `-`` | `DayTimeIntervalType()` | unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| 
`-`` | `YearMonthIntervalType()` | unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| `daterange` | `StringType()` [1]_ | `text` | `text` | ++------------------------------------+ | | | +| `tsrange` | | | | ++------------------------------------+ | | | +| `tstzrange` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +``` + +```{eval-rst} +.. warning:: + + Note that types in Postgres and Spark have different value ranges: + + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Postgres type | Min value | Max value | Spark type | Min value | Max value | + +===============+=================================+==================================+=====================+================================+================================+ + | `date` | `-4713-01-01` | `5874897-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | `timestamp` | `-4713-01-01 00:00:00.000000` | `294276-12-31 23:59:59.999999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + +---------------+---------------------------------+----------------------------------+ | | | + | `time` | `00:00:00.000000` | `24:00:00.000000` | | | | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Postgres to Spark. 
+ + References: + * `Postgres date/time types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ +``` + +[^footnote-6]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from Postgres like `23:59:59` + it is actually read `1970-01-01 23:59:59`, and vice versa. + +### String types + +```{eval-rst} ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++=============================+=======================+=======================+=========================+ +| `character` | `StringType()` | `text` | `text` | ++-----------------------------+ | | | +| `character(N)` | | | | ++-----------------------------+ | | | +| `character varying` | | | | ++-----------------------------+ | | | +| `character varying(N)` | | | | ++-----------------------------+ | | | +| `text` | | | | ++-----------------------------+ | | | +| `json` | | | | ++-----------------------------+ | | | +| `jsonb` | | | | ++-----------------------------+ | | | +| `xml` | | | | ++-----------------------------+-----------------------+ | | +| `CREATE TYPE ... 
AS ENUM` | `StringType()` [1]_ | | | ++-----------------------------+ | | | +| `tsvector` | | | | ++-----------------------------+ | | | +| `tsquery` | | | | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| `-`` | `CharType()` | `unsupported` | `unsupported` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| `-`` | `VarcharType()` | `unsupported` | `unsupported` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +``` + +### Binary types + +```{eval-rst} ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==========================+=======================+=============================+=========================+ +| `boolean` | `BooleanType()` | `boolean` | `boolean` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| `bit` | `BooleanType()` | `bool`, | `bool` | ++--------------------------+ | **cannot insert data** [3]_ | | +| `bit(N=1)` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| `bit(N=2..)` | `ByteType()` | `bytea`, | `bytea` | +| | | **cannot insert data** [3]_ | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| `bit varying` | `StringType()` [1]_ | `text` | `text` | ++--------------------------+ | | | +| `bit varying(N)` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| `bytea` | `BinaryType()` | `bytea` | `bytea` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +``` + +### Struct types + +```{eval-rst} 
++--------------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++================================+=======================+=======================+=========================+ +| `T[]` | `ArrayType(T)` | `T[]` | `T[]` | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| `T[][]` | unsupported | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| `CREATE TYPE sometype (...)` | `StringType()` [1]_ | `text` | `text` | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| `-`` | `StructType()` | unsupported | | ++--------------------------------+-----------------------+ | | +| `-`` | `MapType()` | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +``` + +### Network types + +```{eval-rst} ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ +| `cidr` | `StringType()` [1]_ | `text` | `text` | ++----------------------+ | | | +| `inet` | | | | ++----------------------+ | | | +| `macaddr` | | | | ++----------------------+ | | | +| `macaddr8` | | | | ++----------------------+-----------------------+-----------------------+-------------------------+ +``` + +### Geo types + +```{eval-rst} ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ 
+| `circle` | `StringType()` [1]_ | `text` | `text` |
++----------------------+ | | |
+| `box` | | | |
++----------------------+ | | |
+| `line` | | | |
++----------------------+ | | |
+| `lseg` | | | |
++----------------------+ | | |
+| `path` | | | |
++----------------------+ | | |
+| `point` | | | |
++----------------------+ | | |
+| `polygon` | | | |
++----------------------+-----------------------+-----------------------+-------------------------+
+```
+
+## Explicit type cast
+
+### `DBReader`
+
+It is possible to explicitly cast a column of an unsupported type using `DBReader(columns=...)` syntax.
+
+For example, you can use `CAST(column AS text)` to convert data to its string representation on the Postgres side, and so it will be read as Spark's `StringType()`.
+
+It is also possible to use the [to_json](https://www.postgresql.org/docs/current/functions-json.html) Postgres function to convert a column of any type to its string representation, and then parse this column on the Spark side using the {obj}`JSON.parse_column ` method:
+
+```python
+from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+from onetl.connection import Postgres
+from onetl.db import DBReader
+from onetl.file.format import JSON
+
+postgres = Postgres(...)
+
+reader = DBReader(
+    connection=postgres,
+    columns=[
+        "id",
+        "supported_column",
+        "CAST(unsupported_column AS text) unsupported_column_str",
+        # or
+        "to_json(unsupported_column) array_column_json",
+    ],
+)
+df = reader.run()
+
+json_schema = StructType(
+    [
+        StructField("id", IntegerType(), nullable=True),
+        StructField("name", StringType(), nullable=True),
+        ...,
+    ]
+)
+df = df.select(
+    df.id,
+    df.supported_column,
+    # explicit cast
+    df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+    JSON().parse_column("array_column_json", json_schema).alias("json_string"),
+)
+```
+
+### `DBWriter`
+
+It is always possible to convert data on the Spark side to a string, and then write it to a text column in a Postgres table.
+
+#### Using JSON.serialize_column
+
+You can use the {obj}`JSON.serialize_column ` method for data serialization:
+
+```python
+from onetl.file.format import JSON
+from pyspark.sql.functions import col
+
+from onetl.connection import Postgres
+from onetl.db import DBWriter
+
+postgres = Postgres(...)
+
+postgres.execute(
+    """
+    CREATE TABLE schema.target_table (
+        id int,
+        supported_column timestamp,
+        array_column_json jsonb -- any column type, actually
+    )
+    """,
+)
+
+write_df = df.select(
+    df.id,
+    df.supported_column,
+    JSON().serialize_column(df.unsupported_column).alias("array_column_json"),
+)
+
+writer = DBWriter(
+    connection=postgres,
+    target="schema.target_table",
+)
+writer.run(write_df)
+```
+
+Then you can parse this column on the Postgres side (for example, by creating a view):
+
+```sql
+SELECT
+    id,
+    supported_column,
+    array_column_json->'0' AS array_item_0
+FROM
+    schema.target_table
+```
+
+To avoid casting the value on every table read you can use a [GENERATED ALWAYS STORED](https://www.postgresql.org/docs/current/ddl-generated-columns.html) column, but this requires 2x space (for original and parsed value).
+
+#### Manual conversion to string
+
+The Postgres connector also supports converting a text value directly to the target column type, if this value has a proper format.
+
+For example, you can write data like `[123, 345)` to `int8range` type because Postgres allows the cast `'[123, 345)'::int8range`:
+
+```python
+from pyspark.sql.types import StringType
+from pyspark.sql.functions import udf
+
+from onetl.connection import Postgres
+from onetl.db import DBWriter
+
+postgres = Postgres(...)
+
+postgres.execute(
+    """
+    CREATE TABLE schema.target_table (
+        id int,
+        range_column int8range -- any column type, actually
+    )
+    """,
+)
+
+
+@udf(returnType=StringType())
+def array_to_range(value: tuple):
+    """This UDF converts tuple[start, end] to Postgres' range format"""
+    start, end = value
+    return f"[{start},{end})"
+
+
+write_df = df.select(
+    df.id,
+    array_to_range(df.range_column).alias("range_column"),
+)
+
+writer = DBWriter(
+    connection=postgres,
+    target="schema.target_table",
+)
+writer.run(write_df)
+```
+
+This can be tricky to implement and may lead to a longer write process.
+But this does not require extra space on the Postgres side, and allows avoiding an explicit value cast on every table read.
diff --git a/mkdocs/docs/en/connection/db_connection/postgres/write.md b/mkdocs/docs/en/connection/db_connection/postgres/write.md
new file mode 100644
index 000000000..8f01de26a
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/postgres/write.md
@@ -0,0 +1,56 @@
+(postgres-write)=
+
+# Writing to Postgres using `DBWriter`
+
+For writing data to Postgres, use {obj}`DBWriter `.
+
+```{eval-rst}
+.. warning::
+
+    Please take into account :ref:`postgres-types`
+```
+
+```{eval-rst}
+.. warning::
+
+    It is always recommended to create table explicitly using :ref:`Postgres.execute `
+    instead of relying on Spark's table DDL generation.
+ + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. +``` + +## Examples + +```python +from onetl.connection import Postgres +from onetl.db import DBWriter + +postgres = Postgres(...) + +df = ... # data is here + +writer = DBWriter( + connection=postgres, + target="schema.table", + options=Postgres.WriteOptions(if_exists="append"), +) + +writer.run(df) +``` + +## Options + +Method above accepts {obj}`Postgres.WriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.postgres.options +``` + +```{eval-rst} +.. autopydantic_model:: PostgresWriteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/teradata/connection.md b/mkdocs/docs/en/connection/db_connection/teradata/connection.md new file mode 100644 index 000000000..0804437cf --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/connection.md @@ -0,0 +1,12 @@ +(teradata-connection)= + +# Teradata connection + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.teradata.connection +``` + +```{eval-rst} +.. autoclass:: Teradata + :members: get_packages, check +``` diff --git a/mkdocs/docs/en/connection/db_connection/teradata/execute.md b/mkdocs/docs/en/connection/db_connection/teradata/execute.md new file mode 100644 index 000000000..de1b018fe --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/execute.md @@ -0,0 +1,110 @@ +(teradata-execute)= + +# Executing statements in Teradata + +```{eval-rst} +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Teradata.sql ` instead. 
+```
+
+## How to
+
+There are 2 ways to execute some statement in Teradata:
+
+### Use `Teradata.fetch`
+
+Use this method to execute some `SELECT` query which returns a **small number of rows**, like reading
+Teradata config, or reading data from some reference table. The method returns a Spark DataFrame.
+
+Method accepts {obj}`Teradata.FetchOptions `.
+
+Connection opened using this method should then be closed with `connection.close()` or `with connection:`.
+
+#### Syntax support
+
+This method supports **any** query syntax supported by Teradata, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SHOW ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples
+
+```python
+from onetl.connection import Teradata
+
+teradata = Teradata(...)
+
+df = teradata.fetch(
+    "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+    options=Teradata.FetchOptions(queryTimeout=10),
+)
+teradata.close()
+value = df.collect()[0][0]  # get value from first row and first column
+```
+
+### Use `Teradata.execute`
+
+Use this method to execute DDL and DML operations. Each method call runs the operation in a separate transaction, and then commits it.
+
+Method accepts {obj}`Teradata.ExecuteOptions `.
+
+Connection opened using this method should then be closed with `connection.close()` or `with connection:`.
+
+#### Syntax support
+
+This method supports **any** query syntax supported by Teradata, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ... 
SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ `EXECUTE macro(arg1, arg2)` +- ✅︎ `EXECUTE FUNCTION ...` +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples + +```python +from onetl.connection import Teradata + +teradata = Teradata(...) + +teradata.execute("DROP TABLE database.table") +teradata.execute( + """ + CREATE MULTISET TABLE database.table AS ( + id BIGINT, + key VARCHAR, + value REAL + ) + NO PRIMARY INDEX + """, + options=Teradata.ExecuteOptions(queryTimeout=10), +) +``` + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.teradata.options +``` + +```{eval-rst} +.. autopydantic_model:: TeradataFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + +``` + +```{eval-rst} +.. 
autopydantic_model:: TeradataExecuteOptions + :inherited-members: GenericOptions + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/db_connection/teradata/index.md b/mkdocs/docs/en/connection/db_connection/teradata/index.md new file mode 100644 index 000000000..f86e25726 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/index.md @@ -0,0 +1,21 @@ +(teradata)= + +# Teradata + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +connection +``` + +```{toctree} +:caption: Operations +:maxdepth: 1 + +read +sql +write +execute +``` diff --git a/mkdocs/docs/en/connection/db_connection/teradata/prerequisites.md b/mkdocs/docs/en/connection/db_connection/teradata/prerequisites.md new file mode 100644 index 000000000..d180ffdeb --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/prerequisites.md @@ -0,0 +1,57 @@ +(teradata-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Teradata server versions: + : - Officially declared: 16.10 - 20.0 + - Actually tested: 16.10 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://teradata-docs.s3.amazonaws.com/doc/connectivity/jdbc/reference/current/platformMatrix.html). + +## Installing PySpark + +To use Teradata connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to Teradata + +### Connection host + +It is possible to connect to Teradata by using either DNS name Parsing Engine (PE) host, or it's IP address. + +### Connection port + +Connection is usually performed to port `1025`. Port may differ for different Teradata instances. +Please ask your Teradata administrator to provide required information. + +### Required grants + +Ask your Teradata cluster administrator to set following grants for a user, +used for creating a connection: + +```{eval-rst} +.. 
tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE TABLE ON database TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON database.mytable TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON database.mytable TO username; +``` + +See: +: - [Teradata access rights](https://www.dwhpro.com/teradata-access-rights/) + - [GRANT documentation](https://teradata.github.io/presto/docs/0.167-t/sql/grant.html) diff --git a/mkdocs/docs/en/connection/db_connection/teradata/read.md b/mkdocs/docs/en/connection/db_connection/teradata/read.md new file mode 100644 index 000000000..c57b09119 --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/read.md @@ -0,0 +1,117 @@ +(teradata-read)= + +# Reading from Teradata using `DBReader` + +{obj}`DBReader ` supports {ref}`strategy` for incremental data reading, +but does not support custom queries, like `JOIN`. + +## Supported DBReader features + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: +- - ✅︎ {ref}`snapshot-strategy` +- - ✅︎ {ref}`incremental-strategy` +- - ✅︎ {ref}`snapshot-batch-strategy` +- - ✅︎ {ref}`incremental-batch-strategy` +- ❌ `hint` (is not supported by Teradata) +- ❌ `df_schema` +- ✅︎ `options` (see {obj}`Teradata.ReadOptions `) + +## Examples + +Snapshot strategy: + +```python +from onetl.connection import Teradata +from onetl.db import DBReader + +teradata = Teradata(...) 
+ +reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + options=Teradata.ReadOptions( + partitioning_mode="hash", + partitionColumn="id", + numPartitions=10, + ), +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Teradata +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +teradata = Teradata(...) + +reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="teradata_hwm", expression="updated_dt"), + options=Teradata.ReadOptions( + partitioning_mode="hash", + partitionColumn="id", + numPartitions=10, + ), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations + +### Select only required columns + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Teradata to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +### Read data in parallel + +`DBReader` can read data in multiple parallel connections by passing `Teradata.ReadOptions(numPartitions=..., partitionColumn=...)`. + +In the example above, Spark opens 10 parallel connections, and data is evenly distributed between all these connections using expression +`HASHAMP(HASHBUCKET(HASHROW({partition_column}))) MOD {num_partitions}`. +This allows sending each Spark worker only some piece of data, reducing resource consumption. 
+`partition_column` here can be a table column of any type.
+
+It is also possible to use `partitioning_mode="mod"` or `partitioning_mode="range"`, but in this case
+`partition_column` has to be an integer, should not contain `NULL`, and its values have to be uniformly distributed.
+It is also less performant than `partitioning_mode="hash"` due to Teradata `HASHAMP` implementation.
+
+### Do **NOT** use `TYPE=FASTEXPORT`
+
+Teradata supports several [different connection types](https://teradata-docs.s3.amazonaws.com/doc/connectivity/jdbc/reference/current/jdbcug_chapter_2.html#BABFGFAF):
+: - `TYPE=DEFAULT` - perform plain `SELECT` queries
+  - `TYPE=FASTEXPORT` - uses special FastExport protocol for select queries
+
+But `TYPE=FASTEXPORT` uses an exclusive lock on the source table, so it is impossible to use multiple Spark workers for parallel data read.
+This leads to sending all the data to just one Spark worker, which is slow and takes a lot of RAM.
+
+Prefer using `partitioning_mode="hash"` from the example above.
+
+## Options
+
+```{eval-rst}
+.. currentmodule:: onetl.connection.db_connection.teradata.options
+```
+
+```{eval-rst}
+.. autopydantic_model:: TeradataReadOptions
+    :inherited-members: GenericOptions
+    :member-order: bysource
+```
diff --git a/mkdocs/docs/en/connection/db_connection/teradata/sql.md b/mkdocs/docs/en/connection/db_connection/teradata/sql.md
new file mode 100644
index 000000000..ad0919dce
--- /dev/null
+++ b/mkdocs/docs/en/connection/db_connection/teradata/sql.md
@@ -0,0 +1,76 @@
+(teradata-sql)=
+
+# Reading from Teradata using `Teradata.sql`
+
+`Teradata.sql` allows passing a custom SQL query, but does not support incremental strategies.
+
+```{eval-rst}
+.. warning::
+
+    Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside,
+    they can change data in your database.
+```
+
+## Syntax support
+
+Only queries with the following syntax are supported:
+
+- ✅︎ `SELECT ... 
FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples + +```python +from onetl.connection import Teradata + +teradata = Teradata(...) +df = teradata.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR) AS value, + updated_at, + HASHAMP(HASHBUCKET(HASHROW(id))) MOD 10 AS part_column + FROM + database.mytable + WHERE + key = 'something' + """, + options=Teradata.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` + +## Recommendations + +### Select only required columns + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Teradata to Spark. + +### Pay attention to `where` value + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.teradata.options +``` + +```{eval-rst} +.. autopydantic_model:: TeradataSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/db_connection/teradata/write.md b/mkdocs/docs/en/connection/db_connection/teradata/write.md new file mode 100644 index 000000000..6d1f6644f --- /dev/null +++ b/mkdocs/docs/en/connection/db_connection/teradata/write.md @@ -0,0 +1,120 @@ +(teradata-write)= + +# Writing to Teradata using `DBWriter` + +For writing data to Teradata, use {obj}`DBWriter `. + +```{eval-rst} +.. 
warning::
+
+    It is always recommended to create table explicitly using :ref:`Teradata.execute `
+    instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different precision and types than it is expected,
+    causing precision loss or other issues.
+```
+
+## Examples
+
+```python
+from onetl.connection import Teradata
+from onetl.db import DBWriter
+
+teradata = Teradata(
+    ...,
+    extra={"TYPE": "FASTLOAD", "TMODE": "TERA"},
+)
+
+df = ...  # data is here
+
+writer = DBWriter(
+    connection=teradata,
+    target="database.table",
+    options=Teradata.WriteOptions(
+        if_exists="append",
+        # avoid creating SET table, use MULTISET
+        createTableOptions="NO PRIMARY INDEX",
+    ),
+)
+
+writer.run(df.repartition(1))
+```
+
+## Recommendations
+
+### Number of connections
+
+Teradata is not MVCC based, so write operations take an exclusive lock on the entire table.
+So **it is impossible to write data to a Teradata table in multiple parallel connections**, no exceptions.
+
+The only way to write to Teradata without making deadlocks is to write a dataframe with exactly 1 partition.
+
+It can be implemented using `df.repartition(1)`:
+
+```python
+# do NOT use df.coalesce(1) as it can freeze
+writer.run(df.repartition(1))
+```
+
+This moves all the data to just one Spark worker, so it may consume a lot of RAM. It is usually required to increase `spark.executor.memory` to handle this.
+
+Another way is to write all dataframe partitions one-by-one:
+
+```python
+from pyspark.sql.functions import spark_partition_id
+
+# get list of all partitions in the dataframe
+partitions = sorted(df.select(spark_partition_id()).distinct().collect())
+
+for partition in partitions:
+    # get only part of data within this exact partition
+    part_df = df.where(**partition.asDict()).coalesce(1)
+
+    writer.run(part_df)
+```
+
+This requires even data distribution across all partitions to avoid data skew and spikes of RAM consumption.
+ +### Choosing connection type + +Teradata supports several [different connection types](https://teradata-docs.s3.amazonaws.com/doc/connectivity/jdbc/reference/current/jdbcug_chapter_2.html#BABFGFAF): +: - `TYPE=DEFAULT` - perform plain `INSERT` queries + - `TYPE=FASTLOAD` - uses special FastLoad protocol for insert queries + +It is always recommended to use `TYPE=FASTLOAD` because: +: - It provides higher performance + - It properly handles inserting `NULL` values (`TYPE=DEFAULT` raises an exception) + +But it can be used only during write, not read. + +### Choosing transaction mode + +Teradata supports [2 different transaction modes](https://teradata-docs.s3.amazonaws.com/doc/connectivity/jdbc/reference/current/jdbcug_chapter_2.html#TMODESEC): +: - `TMODE=ANSI` + - `TMODE=TERA` + +Choosing one of the modes can alter connector behavior. For example: +: - Inserting data which exceeds table column length, like insert `CHAR(25)` to column with type `CHAR(24)`: + - - `TMODE=ANSI` - raises exception + - - `TMODE=TERA` - truncates input string to 24 symbols + - Creating table using Spark: + - - `TMODE=ANSI` - creates `MULTISET` table + - - `TMODE=TERA` - creates `SET` table with `PRIMARY KEY` is a first column in dataframe. + This can lead to slower insert time, because each row will be checked against a unique index. + Fortunately, this can be disabled by passing custom `createTableOptions`. + +## Options + +Method above accepts {obj}`Teradata.WriteOptions ` + +```{eval-rst} +.. currentmodule:: onetl.connection.db_connection.teradata.options +``` + +```{eval-rst} +.. 
autopydantic_model:: TeradataWriteOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/connection/file_connection/ftp.md b/mkdocs/docs/en/connection/file_connection/ftp.md new file mode 100644 index 000000000..8877f479e --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/ftp.md @@ -0,0 +1,12 @@ +(ftp)= + +# FTP connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.ftp +``` + +```{eval-rst} +.. autoclass:: FTP + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/ftps.md b/mkdocs/docs/en/connection/file_connection/ftps.md new file mode 100644 index 000000000..668be45b5 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/ftps.md @@ -0,0 +1,12 @@ +(ftps)= + +# FTPS connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.ftps +``` + +```{eval-rst} +.. autoclass:: FTPS + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/hdfs/connection.md b/mkdocs/docs/en/connection/file_connection/hdfs/connection.md new file mode 100644 index 000000000..b1cffa679 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/hdfs/connection.md @@ -0,0 +1,12 @@ +(hdfs-connection)= + +# HDFS connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.hdfs.connection +``` + +```{eval-rst} +.. 
autoclass:: HDFS + :members: get_current, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/hdfs/index.md b/mkdocs/docs/en/connection/file_connection/hdfs/index.md new file mode 100644 index 000000000..a4c9fa148 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/hdfs/index.md @@ -0,0 +1,17 @@ +(hdfs)= + +# HDFS + +```{toctree} +:caption: Connection +:maxdepth: 1 + +connection +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +slots +``` diff --git a/mkdocs/docs/en/connection/file_connection/hdfs/slots.md b/mkdocs/docs/en/connection/file_connection/hdfs/slots.md new file mode 100644 index 000000000..88f7e2943 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/hdfs/slots.md @@ -0,0 +1,13 @@ +(hdfs-slots)= + +# HDFS Slots + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.hdfs.slots +``` + +```{eval-rst} +.. autoclass:: HDFSSlots + :members: normalize_cluster_name, normalize_namenode_host, get_known_clusters, get_cluster_namenodes, get_current_cluster, get_webhdfs_port, is_namenode_active + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/file_connection/index.md b/mkdocs/docs/en/connection/file_connection/index.md new file mode 100644 index 000000000..94a1cfe57 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/index.md @@ -0,0 +1,16 @@ +()= + +# File Connections { #file-connections-0 } + +```{toctree} +:caption: File Connections +:maxdepth: 1 + +FTP +FTPS +HDFS +Samba +SFTP +S3 +Webdav +``` diff --git a/mkdocs/docs/en/connection/file_connection/s3.md b/mkdocs/docs/en/connection/file_connection/s3.md new file mode 100644 index 000000000..c69a3ed8b --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/s3.md @@ -0,0 +1,12 @@ +(s3)= + +# S3 connection + +```{eval-rst} +.. 
currentmodule:: onetl.connection.file_connection.s3 +``` + +```{eval-rst} +.. autoclass:: S3 + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/samba.md b/mkdocs/docs/en/connection/file_connection/samba.md new file mode 100644 index 000000000..29f9f3f81 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/samba.md @@ -0,0 +1,12 @@ +(samba)= + +# Samba connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.samba +``` + +```{eval-rst} +.. autoclass:: Samba + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/sftp.md b/mkdocs/docs/en/connection/file_connection/sftp.md new file mode 100644 index 000000000..45c7affa6 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/sftp.md @@ -0,0 +1,12 @@ +(sftp)= + +# SFTP connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.sftp +``` + +```{eval-rst} +.. autoclass:: SFTP + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_connection/webdav.md b/mkdocs/docs/en/connection/file_connection/webdav.md new file mode 100644 index 000000000..5fd5ba006 --- /dev/null +++ b/mkdocs/docs/en/connection/file_connection/webdav.md @@ -0,0 +1,12 @@ +(webdav)= + +# WebDAV connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_connection.webdav +``` + +```{eval-rst} +.. 
autoclass:: WebDAV + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, walk, download_file, upload_file +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/base.md b/mkdocs/docs/en/connection/file_df_connection/base.md new file mode 100644 index 000000000..4b3935bf7 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/base.md @@ -0,0 +1,12 @@ +(base-file-df-connection)= + +# Base interface + +```{eval-rst} +.. currentmodule:: onetl.base.base_file_df_connection +``` + +```{eval-rst} +.. autoclass:: BaseFileDFConnection + :members: check, check_if_format_supported, read_files_as_df, write_df_as_files +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/index.md b/mkdocs/docs/en/connection/file_df_connection/index.md new file mode 100644 index 000000000..59322d077 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/index.md @@ -0,0 +1,15 @@ +# File DataFrame Connections + +* [Spark LocalFS](spark_local_fs.md) +* [Spark HDFS](spark_hdfs/index.md) + * [Prerequisites](spark_hdfs/prerequisites.md) + * [Connection](spark_hdfs/connection.md) + * [Slots](spark_hdfs/slots.md) +* [Spark S3](spark_s3/index.md) + * [Prerequisites](spark_s3/prerequisites.md) + * [Connection](spark_s3/connection.md) + * [Troubleshooting](spark_s3/troubleshooting.md) + +# For developers + +* [Base interface](base.md) diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/connection.md b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/connection.md new file mode 100644 index 000000000..08b8e4c3b --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/connection.md @@ -0,0 +1,12 @@ +(spark-hdfs-connection)= + +# Spark HDFS Connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_df_connection.spark_hdfs.connection +``` + +```{eval-rst} +.. 
autoclass:: SparkHDFS + :members: check, get_current +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/index.md b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/index.md new file mode 100644 index 000000000..77d347d7a --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/index.md @@ -0,0 +1,18 @@ +(spark-hdfs)= + +# Spark HDFS + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +Connection +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +Slots +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/prerequisites.md b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/prerequisites.md new file mode 100644 index 000000000..f946b3a70 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/prerequisites.md @@ -0,0 +1,46 @@ +(spark-hdfs-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Hadoop versions: 2.x, 3.x (only with Hadoop 3.x libraries) +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +## Installing PySpark + +To use SparkHDFS connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Using Kerberos + +Some of Hadoop managed clusters use Kerberos authentication. In this case, you should call [kinit](https://web.mit.edu/kerberos/krb5-1.12/doc/user/user_commands/kinit.html) command +**BEFORE** starting Spark session to generate Kerberos ticket. See {ref}`install-kerberos`. + +Sometimes it is also required to pass keytab file to Spark config, allowing Spark executors to generate own Kerberos tickets: + +```{eval-rst} +.. tabs:: + + .. 
code-tab:: python Spark 3 + + SparkSession.builder + .option("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.kerberos.principal", "user") + .option("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + .. code-tab:: python Spark 2 + + SparkSession.builder + .option("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.yarn.principal", "user") + .option("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() +``` + +See [Spark security documentation](https://spark.apache.org/docs/latest/security.html#kerberos) +for more details. diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/slots.md b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/slots.md new file mode 100644 index 000000000..715578579 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_hdfs/slots.md @@ -0,0 +1,13 @@ +(spark-hdfs-slots)= + +# Spark HDFS Slots + +```{eval-rst} +.. currentmodule:: onetl.connection.file_df_connection.spark_hdfs.slots +``` + +```{eval-rst} +.. autoclass:: SparkHDFSSlots + :members: normalize_cluster_name, normalize_namenode_host, get_known_clusters, get_cluster_namenodes, get_current_cluster, get_ipc_port, is_namenode_active + :member-order: bysource +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_local_fs.md b/mkdocs/docs/en/connection/file_df_connection/spark_local_fs.md new file mode 100644 index 000000000..524d1c676 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_local_fs.md @@ -0,0 +1,12 @@ +(spark-local-fs)= + +# Spark LocalFS + +```{eval-rst} +.. currentmodule:: onetl.connection.file_df_connection.spark_local_fs +``` + +```{eval-rst} +.. 
autoclass:: SparkLocalFS + :members: check +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_s3/connection.md b/mkdocs/docs/en/connection/file_df_connection/spark_s3/connection.md new file mode 100644 index 000000000..82de3999f --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_s3/connection.md @@ -0,0 +1,12 @@ +(spark-s3-connection)= + +# Spark S3 Connection + +```{eval-rst} +.. currentmodule:: onetl.connection.file_df_connection.spark_s3.connection +``` + +```{eval-rst} +.. autoclass:: SparkS3 + :members: check, close, get_packages, get_exclude_packages +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_s3/index.md b/mkdocs/docs/en/connection/file_df_connection/spark_s3/index.md new file mode 100644 index 000000000..1ecf94737 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_s3/index.md @@ -0,0 +1,12 @@ +(spark-s3)= + +# Spark S3 + +```{toctree} +:caption: Connection +:maxdepth: 1 + +prerequisites +Connection +Troubleshooting +``` diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_s3/prerequisites.md b/mkdocs/docs/en/connection/file_df_connection/spark_s3/prerequisites.md new file mode 100644 index 000000000..9135275f3 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_s3/prerequisites.md @@ -0,0 +1,61 @@ +(spark-s3-prerequisites)= + +# Prerequisites + +## Version Compatibility + +- Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) +- Java versions: 8 - 20 + +## Installing PySpark + +To use SparkS3 connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See {ref}`install-spark` installation instruction for more details. + +## Connecting to S3 + +### Bucket access style + +AWS and some other S3 cloud providers allows bucket access using domain style only, e.g. `https://mybucket.s3provider.com`. + +Other implementations, like Minio, by default allows path style access only, e.g. 
`https://s3provider.com/mybucket` +(see [MINIO_DOMAIN](https://min.io/docs/minio/linux/reference/minio-server/minio-server.html#envvar.MINIO_DOMAIN)). + +You should set `path.style.access` to `True` or `False`, to choose the preferred style. + +### Authentication + +Different S3 instances can use different authentication methods, like: +: - `access_key + secret_key` (or username + password) + - `access_key + secret_key + session_token` + +Usually these are just passed to SparkS3 constructor: + +```python +SparkS3( + access_key=..., + secret_key=..., + session_token=..., +) +``` + +But some S3 cloud providers, like AWS, may require custom credential providers. You can pass them like: + +```python +SparkS3( + extra={ + # provider class + "aws.credentials.provider": "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider", + # other options, if needed + "assumed.role.arn": "arn:aws:iam::90066806600238:role/s3-restricted", + }, +) +``` + +See [Hadoop-AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Changing_Authentication_Providers) documentation. + +## Troubleshooting + +See {ref}`spark-s3-troubleshooting`. diff --git a/mkdocs/docs/en/connection/file_df_connection/spark_s3/troubleshooting.md b/mkdocs/docs/en/connection/file_df_connection/spark_s3/troubleshooting.md new file mode 100644 index 000000000..32b225005 --- /dev/null +++ b/mkdocs/docs/en/connection/file_df_connection/spark_s3/troubleshooting.md @@ -0,0 +1,377 @@ +(spark-s3-troubleshooting)= + +# Spark S3 Troubleshooting + +```{eval-rst} +.. note:: + + General guide: :ref:`troubleshooting`. 
+``` + +More details: + +- [Hadoop AWS Troubleshooting Guide](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/troubleshooting_s3a.html) +- [Hadoop AWS Performance Guide](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html) +- [Spark integration with Cloud Infrastructures](https://spark.apache.org/docs/latest/cloud-integration.html) + +## `SparkS3.check()` and other methods hang + +### Details + +S3 may not respond for connection attempts for a long time if it's under heavy load. +To handle this, Hadoop AWS library has retry mechanism. By default it retries 7 times with 500ms interval. + +Hadoop AWS is based on AWS SDK library, which also has retry mechanism. This mechanism is not disabled because it handles different +errors than Hadoop AWS, so they complement each other. Default number of attempts in AWS SDK is 20 with minimal 5s interval, +which is exponentially increasing with each failed attempt. + +It is not a problem if S3 source is not accessible at all, like hostname cannot be resolved, or port is not opened. +These errors are not recoverable, and retry mechanism is not activated. + +But errors like SSL issues, are considered recoverable, and this causing retry of retry over increasing interval. +So user is waiting for [almost 15 minutes](https://issues.apache.org/jira/browse/HADOOP-18839) just to get exception message. + +### How to determine reason + +#### Make logging more verbose + +Change Spark session log level to {ref}`DEBUG ` to print result of each attempt. +Resulting logs will look like this + +```{eval-rst} +.. dropdown:: See log + + .. 
code:: text + + 23/08/03 11:25:10 DEBUG S3AFileSystem: Using S3ABlockOutputStream with buffer = disk; block=67108864; queue limit=4 + 23/08/03 11:25:10 DEBUG S3Guard: Metastore option source [core-default.xml] + 23/08/03 11:25:10 DEBUG S3Guard: Using NullMetadataStore metadata store for s3a filesystem + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3Guard is disabled on this bucket: test-bucket + 23/08/03 11:25:10 DEBUG DirectoryPolicyImpl: Directory markers will be deleted + 23/08/03 11:25:10 DEBUG S3AFileSystem: Directory marker retention policy is DirectoryMarkerRetention{policy='delete'} + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.multipart.purge.age is 86400 + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.bulk.delete.page.size is 250 + 23/08/03 11:25:10 DEBUG FileSystem: Creating FS s3a://test-bucket/fake: duration 0:01.029s + 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter op_is_directory by 1 with final value 1 + 23/08/03 11:25:10 DEBUG S3AFileSystem: Getting path status for s3a://test-bucket/fake (fake); needEmptyDirectory=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3GetFileStatus s3a://test-bucket/fake + 23/08/03 11:25:10 DEBUG S3AFileSystem: LIST List test-bucket:/fake/ delimiter=/ keys=2 requester pays=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: Starting: LIST + 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter object_list_request by 1 with final value 1 + 23/08/03 11:25:10 DEBUG AWSCredentialProviderList: Using credentials from SimpleAWSCredentialsProvider + 23/08/03 11:25:10 DEBUG request: Sending Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 
vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 Canonical Request: '"GET + / + delimiter=%2F&fetch-owner=false&list-type=2&max-keys=2&prefix=fake%2F + amz-sdk-invocation-id:e6d62603-96e4-a80f-10a1-816e0822bc71 + amz-sdk-request:attempt=1;max=21 + amz-sdk-retry:0/0/500 + content-type:application/octet-stream + host:test-bucket.localhost:9000 + user-agent:Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy + x-amz-content-sha256:UNSIGNED-PAYLOAD + x-amz-date:20230803T112510Z + + amz-sdk-invocation-id;amz-sdk-request;amz-sdk-retry;content-type;host;user-agent;x-amz-content-sha256;x-amz-date + UNSIGNED-PAYLOAD" + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 String to Sign: '"AWS4-HMAC-SHA256 + 20230803T112510Z + 20230803/us-east-1/s3/aws4_request + 31a317bb7f6d97248dd0cf03429d701cbb3e29ce889cfbb98ba7a34c57a3bfba" + 23/08/03 11:25:10 DEBUG AWS4Signer: Generating a new signing key as the signing key not available in the cache for the date 1691020800000 + 23/08/03 11:25:10 DEBUG RequestAddCookies: CookieSpec selected: default + 23/08/03 11:25:10 DEBUG RequestAuthCache: Auth cache not set in the context + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection request: [route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection leased: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 1 of 96; total allocated: 1 of 96] + 23/08/03 11:25:10 DEBUG MainClientExec: Opening connection {s}->https://test-bucket.localhost:9000 + 23/08/03 11:25:10 DEBUG DefaultHttpClientConnectionOperator: Connecting to test-bucket.localhost/127.0.0.1:9000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Connecting socket to 
test-bucket.localhost/127.0.0.1:9000 with timeout 5000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled protocols: [TLSv1.2] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled cipher suites:[TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, TLS_RSA_WITH_AES_256_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384, TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, TLS_DHE_DSS_WITH_AES_256_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_DSS_WITH_AES_256_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_DSS_WITH_AES_128_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_128_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_DSS_WITH_AES_128_CBC_SHA, TLS_EMPTY_RENEGOTIATION_INFO_SCSV] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Starting handshake + 23/08/03 11:25:10 DEBUG ClientConnectionManagerFactory: + java.lang.reflect.InvocationTargetException + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76) + at com.amazonaws.http.conn.$Proxy32.connect(Unknown Source) + at 
com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:393) + at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:236) + at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) + at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) + at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) + at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) + at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) + at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5397) + at com.amazonaws.services.s3.AmazonS3Client.listObjectsV2(AmazonS3Client.java:971) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listObjects$11(S3AFileSystem.java:2595) + at 
org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377) + at org.apache.hadoop.fs.s3a.S3AFileSystem.listObjects(S3AFileSystem.java:2586) + at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3832) + at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$isDirectory$35(S3AFileSystem.java:4724) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356) + at org.apache.hadoop.fs.s3a.S3AFileSystem.isDirectory(S3AFileSystem.java:4722) + at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:54) + at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) + at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229) + at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211) + at scala.Option.getOrElse(Option.scala:189) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) + at py4j.Gateway.invoke(Gateway.java:282) + at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + at py4j.commands.CallCommand.execute(CallCommand.java:79) + at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) + at py4j.ClientServerConnection.run(ClientServerConnection.java:106) + at java.lang.Thread.run(Thread.java:748) + Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message + at sun.security.ssl.SSLSocketInputRecord.handleUnknownRecord(SSLSocketInputRecord.java:448) + at sun.security.ssl.SSLSocketInputRecord.decode(SSLSocketInputRecord.java:184) + at sun.security.ssl.SSLTransport.decode(SSLTransport.java:109) + at sun.security.ssl.SSLSocketImpl.decode(SSLSocketImpl.java:1383) + at sun.security.ssl.SSLSocketImpl.readHandshakeRecord(SSLSocketImpl.java:1291) + at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:435) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:436) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:384) + at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142) + at com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:376) + ... 
58 more + 23/08/03 11:25:10 DEBUG DefaultManagedHttpClientConnection: http-outgoing-0: Shutdown connection + 23/08/03 11:25:10 DEBUG MainClientExec: Connection discarded + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection released: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Unable to execute HTTP request: Unsupported or unrecognized SSL message Request will be retried. + 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 +``` + +#### Change number of retries + +You can also change number of retries performed by both libraries using `extra` parameter: + +```python +spark_s3 = SparkS3( + ..., + extra={ + "attempts.maximum": 1, + "retry.limit": 1, + }, +) +``` + +So accessing S3 will fail almost immediately if there is any error. + +### Most common mistakes + +#### No network access + +```text +Caused by: java.net.ConnectException: Connection refused +``` + +Mostly caused by: + +- Trying to access port number which S3 server does not listen +- You're trying to access host which is unreachable from your network (e.g. 
running behind some proxy or VPN) +- There are some firewall restrictions for accessing specific host or port + +#### Using HTTPS protocol for HTTP port + +```text +Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message +``` + +By default, SparkS3 uses HTTPS protocol for connection. +If you change port number, this does not lead to changing protocol: + +```python +spark_s3 = SparkS3(host="s3provider.com", port=8080, ...) +``` + +You should pass protocol explicitly: + +```python +spark_s3 = SparkS3(host="s3provider.com", port=8080, protocol="http", ...) +``` + +#### SSL certificate is self-signed + +```text +sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target +``` + +To connect to HTTPS port with self-signed certificate, you should +[add certificate chain to Java TrustedStore](https://stackoverflow.com/questions/373295/digital-certificate-how-to-import-cer-file-in-to-truststore-file-using). + +Another option is to disable SSL check: + +```python +spark_s3 = SparkS3( + ..., + extra={ + "connection.ssl.enabled": False, + }, +) +``` + +But it is **NOT** recommended. + +#### Accessing S3 without domain-style access support + +```text +Caused by: java.net.UnknownHostException: my-bucket.s3provider.com +``` + +To use path-style access, use option below: + +```python +spark_s3 = SparkS3( + host="s3provider.com", + bucket="my-bucket", + ..., + extra={ + "path.style.access": True, + }, +) +``` + +## Slow or unstable writing to S3 + +Hadoop AWS allows using different writing strategies for different S3 implementations, depending +on the list of features supported by the server. + +These strategies are called [committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committers.html). 
+There are [different types of committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committers.html#Switching_to_an_S3A_Committer): + +- `file` (default) +- `directory` +- `partitioned` +- `magic` + +### `file` committer + +This committer is quite slow and unstable, so it is not recommended to use: + +```text +WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe. +``` + +This is caused by the fact it creates files in the temp directory on remote filesystem, and after all of them are written successfully, +they are moved to target directory on same remote filesystem. + +This is not an issue for HDFS, which supports file move operations and also supports renaming a directory +as an atomic operation with `O(1)` time complexity. + +But S3 supports only file copying, so moving is performed via copy + delete. +Also it does not support atomic directory rename operation. Instead, renaming files with the same prefix has time complexity `O(n)`. + +### `directory` and `partitioned` committers + +These are [staging committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committer_architecture.html), +meaning that they create temp directories on local filesystem, and after all files are written successfully, +they will be uploaded to S3. Local filesystems do support file moving and directory renaming, +so these committers do not have the issues that the `file` committer has. + +But they both require free space on local filesystem, and this may be an issue if the user needs to write a large amount of data. +Also, this can be an issue for container environments, like Kubernetes, where resources should be allocated before starting a container. + +### `magic` committer + +This committer uses multipart upload feature of S3 API, allowing to create multiple files +and after all of them were written successfully finish the transaction. 
Before the transaction is finished, +files will not be accessible by other clients. + +Because it requires neither file moving operations nor atomic directory rename, +the upload process is done in the most efficient way S3 supports. +This [drastically increases writing performance](https://spot.io/blog/improve-apache-spark-performance-with-the-s3-magic-committer/). + +To use this committer, set the [following properties](https://github.com/apache/spark/pull/32518) while creating the Spark session. + +```{eval-rst} +.. tabs:: + + .. code-tab:: py S3 is your main distributed filesystem (Spark on Kubernetes) + + # https://issues.apache.org/jira/browse/SPARK-23977 + # https://spark.apache.org/docs/latest/cloud-integration.html#committing-work-into-cloud-storage-safely-and-fast + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true") + .config("spark.hadoop.fs.s3a.committer.name", "magic") + .config("spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a", "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory") + .config("spark.sql.parquet.output.committer.class", "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter") + .config("spark.sql.sources.commitProtocolClass", "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol") + .getOrCreate() + ) + + .. code-tab:: py HDFS is your main distributed filesystem (Spark on Hadoop) + + # https://community.cloudera.com/t5/Support-Questions/spark-sql-sources-partitionOverwriteMode-dynamic-quot-not/m-p/343483/highlight/true + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true") + .config("spark.hadoop.fs.s3a.committer.name", "magic") + .getOrCreate() + ) +``` + +```{eval-rst} +..
warning:: + + `magic` committer requires the S3 implementation to have strong consistency - the file upload API returns a response only + if the file was written to a sufficient number of cluster nodes, and any cluster node error does not lead to missing or corrupted files. + + Some S3 implementations do have strong consistency + (like `AWS S3 <https://aws.amazon.com/s3/>`_ and + `MinIO <https://min.io/>`_), some do not. Please contact your S3 provider + to get information about S3 implementation consistency. +``` + +```{eval-rst} +.. warning:: + + `magic` committer does not support `if_exists="replace_overlapping_partitions"`. + Either use another `if_exists` value, or use the `partitioned` committer. +``` + +### See also + +- [directory.marker.retention="keep"](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/directory_markers.html) + +## Slow reading from S3 + +Please read the following documentation: + +- [prefetch.enabled](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/prefetching.html) +- [experimental.input.fadvise](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html#Improving_data_input_performance_through_fadvise) +- [Parquet and ORC I/O settings](https://spark.apache.org/docs/latest/cloud-integration.html#parquet-io-settings) + +If you're reading data from row-based formats, like {ref}`csv-file-format`, prefer +[experimental.input.fadvise="sequential" with increased readahead.range](https://issues.apache.org/jira/browse/HADOOP-17789?focusedCommentId=17383559#comment-17383559).
+ +But for other file formats, especially using compression, prefer +[experimental.input.fadvise="normal"](https://issues.apache.org/jira/browse/HADOOP-17789?focusedCommentId=17383743#comment-17383743) diff --git a/mkdocs/docs/en/connection/index.md b/mkdocs/docs/en/connection/index.md new file mode 100644 index 000000000..99bd58d51 --- /dev/null +++ b/mkdocs/docs/en/connection/index.md @@ -0,0 +1,34 @@ +# Connection + + DB Connection + +* [DB Connections](db_connection/index.md) + * [Clickhouse](db_connection/clickhouse/index.md) + * [Greenplum](db_connection/greenplum/index.md) + * [Kafka](db_connection/kafka/index.md) + * [Hive](db_connection/hive/index.md) + * [MongoDB](db_connection/mongodb/index.md) + * [MSSQL](db_connection/mssql/index.md) + * [MySQL](db_connection/mysql/index.md) + * [Oracle](db_connection/oracle/index.md) + * [Postgres](db_connection/postgres/index.md) + * [Teradata](db_connection/teradata/index.md) + + File Connection + +* [File Connections](file_connection/index.md) + * [FTP](file_connection/ftp.md) + * [FTPS](file_connection/ftps.md) + * [HDFS](file_connection/hdfs/index.md) + * [Samba](file_connection/samba.md) + * [SFTP](file_connection/sftp.md) + * [S3](file_connection/s3.md) + * [Webdav](file_connection/webdav.md) + + File DataFrame Connection + +* [File DataFrame Connections](file_df_connection/index.md) + * [Spark LocalFS](file_df_connection/spark_local_fs.md) + * [Spark HDFS](file_df_connection/spark_hdfs/index.md) + * [Spark S3](file_df_connection/spark_s3/index.md) + * [Base interface](file_df_connection/base.md) diff --git a/mkdocs/docs/en/contributing.md b/mkdocs/docs/en/contributing.md new file mode 100644 index 000000000..f1d5ee765 --- /dev/null +++ b/mkdocs/docs/en/contributing.md @@ -0,0 +1,391 @@ +# Contributing Guide + +Welcome! 
There are many ways to contribute, including submitting bug +reports, improving documentation, submitting feature requests, reviewing +new submissions, or contributing code that can be incorporated into the +project. + +## Limitations + +We should keep close to these items during development: + +* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. +* Different users use onETL in different ways - some use only DB connectors, some only files. Connector-specific dependencies should be optional. +* Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc., and using composition. + +## Initial setup for local development + +### Install Git + +Please follow the [instruction](https://docs.github.com/en/get-started/quickstart/set-up-git). + +### Create a fork + +If you are not a member of a development team building onETL, you should create a fork before making any changes. + +Please follow the [instruction](https://docs.github.com/en/get-started/quickstart/fork-a-repo).
+ +### Clone the repo + +Open terminal and run these commands: + +```bash +git clone git@github.com:myuser/onetl.git -b develop + +cd onetl +``` + +### Setup environment + +Create virtualenv and install dependencies: + +```bash +python -m venv venv +source venv/bin/activate +pip install -U wheel +pip install -U pip setuptools +pip install -U \ + -r requirements/core.txt \ + -r requirements/ftp.txt \ + -r requirements/hdfs.txt \ + -r requirements/kerberos.txt \ + -r requirements/s3.txt \ + -r requirements/sftp.txt \ + -r requirements/webdav.txt \ + -r requirements/dev.txt \ + -r requirements/docs.txt \ + -r requirements/tests/base.txt \ + -r requirements/tests/clickhouse.txt \ + -r requirements/tests/kafka.txt \ + -r requirements/tests/mongodb.txt \ + -r requirements/tests/mssql.txt \ + -r requirements/tests/mysql.txt \ + -r requirements/tests/postgres.txt \ + -r requirements/tests/oracle.txt \ + -r requirements/tests/pydantic-2.txt \ + -r requirements/tests/spark-3.5.5.txt + +# TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 +pip install sphinx-plantuml --no-deps +``` + +### Enable pre-commit hooks + +Install pre-commit hooks: + +```bash +pre-commit install --install-hooks +``` + +Test pre-commit hooks run: + +```bash +pre-commit run +``` + +## How to + +### Run tests locally + +#### Using docker-compose + +Build image for running tests: + +```bash +docker-compose build +``` + +Start all containers with dependencies: + +```bash +docker-compose --profile all up -d +``` + +You can run limited set of dependencies: + +```bash +docker-compose --profile mongodb up -d +``` + +Run tests: + +```bash +docker-compose run --rm onetl ./run_tests.sh +``` + +You can pass additional arguments, they will be passed to pytest: + +```bash +docker-compose run --rm onetl ./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +``` + +You can run interactive bash session and use it: + +```bash +docker-compose run --rm onetl bash + +./run_tests.sh -m mongodb -lsx 
-vvvv --log-cli-level=INFO +``` + +See logs of test container: + +```bash +docker-compose logs -f onetl +``` + +Stop all containers and remove created volumes: + +```bash +docker-compose --profile all down -v +``` + +#### Without docker-compose + +#### WARNING +To run HDFS tests locally you should add the following line to your `/etc/hosts` (file path depends on OS): + +```default +# HDFS server returns container hostname as connection address, causing error in DNS resolution +127.0.0.1 hdfs +``` + +#### NOTE +To run Oracle tests you need to install [Oracle instantclient](https://www.oracle.com/database/technologies/instant-client.html), +and pass its path to `ONETL_ORA_CLIENT_PATH` and `LD_LIBRARY_PATH` environment variables, +e.g. `ONETL_ORA_CLIENT_PATH=/path/to/client64/lib`. + +It may also require to add the same path into `LD_LIBRARY_PATH` environment variable + +#### NOTE +To run Greenplum tests, you should: + +* Download [VMware Greenplum connector for Spark][greenplum-prerequisites] +* Either move it to `~/.ivy2/jars/`, or pass file path to `CLASSPATH` +* Set environment variable `ONETL_GP_PACKAGE_VERSION=local`. + +Start all containers with dependencies: + +```bash +docker-compose --profile all up -d +``` + +You can run limited set of dependencies: + +```bash +docker-compose --profile mongodb up -d +``` + +Load environment variables with connection properties: + +```bash +source .env.local +``` + +Run tests: + +```bash +./run_tests.sh +``` + +You can pass additional arguments, they will be passed to pytest: + +```bash +./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +``` + +Stop all containers and remove created volumes: + +```bash +docker-compose --profile all down -v +``` + +### Build documentation + +Build documentation using Sphinx: + +```bash +cd docs +make html +``` + +Then open in browser `docs/_build/index.html`. + +## Review process + +Please create a new GitHub issue for any significant changes and +enhancements that you wish to make. 
Provide the feature you would like +to see, why you need it, and how it will work. Discuss your ideas +transparently and get community feedback before proceeding. + +Significant Changes that you wish to contribute to the project should be +discussed first in a GitHub issue that clearly outlines the changes and +benefits of the feature. + +Small Changes can directly be crafted and submitted to the GitHub +Repository as a Pull Request. + +### Create pull request + +Commit your changes: + +```bash +git commit -m "Commit message" +git push +``` + +Then open Github interface and [create pull request](https://docs.github.com/en/get-started/quickstart/contributing-to-projects#making-a-pull-request). +Please follow guide from PR body template. + +After pull request is created, it get a corresponding number, e.g. 123 (`pr_number`). + +### Write release notes + +`onETL` uses [towncrier](https://pypi.org/project/towncrier/) +for changelog management. + +To submit a change note about your PR, add a text file into the +[docs/changelog/next_release](./next_release) folder. It should contain an +explanation of what applying this PR will change in the way +end-users interact with the project. One sentence is usually +enough but feel free to add as many details as you feel necessary +for the users to understand what it means. + +**Use the past tense** for the text in your fragment because, +combined with others, it will be a part of the “news digest” +telling the readers **what changed** in a specific version of +the library *since the previous version*. + +You should also use +reStructuredText syntax for highlighting code (inline or block), +linking parts of the docs or external sites. +If you wish to sign your change, feel free to add `-- by +:user:`github-username`` at the end (replace `github-username` +with your own!). 
Finally, name your file following the convention that Towncrier +understands: it should start with the number of an issue or a +PR followed by a dot, then add a patch type, like `feature`, +`doc`, `misc` etc., and add `.rst` as a suffix. If you +need to add more than one fragment, you may add an optional +sequence number (delimited with another period) between the type +and the suffix. + +In general the name will follow the `<pr_number>.<category>.rst` pattern, +where the categories are: + +- `feature`: Any new feature +- `bugfix`: A bug fix +- `improvement`: An improvement +- `doc`: A change to the documentation +- `dependency`: Dependency-related changes +- `misc`: Changes internal to the repo like CI, test and build changes + +A pull request may have more than one of these components, for example +a code change may introduce a new feature that deprecates an old +feature, in which case two fragments should be added. It is not +necessary to make a separate documentation fragment for documentation +changes accompanying the relevant code changes. + +#### Examples for adding changelog entries to your Pull Requests + +```rst +Added a `:github:user:` role to Sphinx config -- by :github:user:`someuser` +``` + +```rst +Fixed behavior of `WebDAV` connector -- by :github:user:`someuser` +``` + +```rst +Added support of `timeout` in `S3` connector +-- by :github:user:`someuser`, :github:user:`anotheruser` and :github:user:`otheruser` +``` + +#### How to skip change notes check? + +Just add the `ci:skip-changelog` label to the pull request. + +#### Release Process + +Before making a release from the `develop` branch, follow these steps: + +1. Check out the `develop` branch and update it to the actual state + +```bash +git checkout develop +git pull -p +``` + +1. Backup `NEXT_RELEASE.rst` + +```bash +cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst" +``` + +1.
Build the Release notes with Towncrier + +```bash +VERSION=$(cat onetl/VERSION) +towncrier build "--version=${VERSION}" --yes +``` + +1. Change file with changelog to release version number + +```bash +mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst" +``` + +1. Remove content above the version number heading in the `${VERSION}.rst` file + +```bash +awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst" +``` + +1. Update Changelog Index + +```bash +awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst +``` + +1. Restore `NEXT_RELEASE.rst` file from backup + +```bash +mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst" +``` + +1. Commit and push changes to `develop` branch + +```bash +git add . +git commit -m "Prepare for release ${VERSION}" +git push +``` + +1. Merge `develop` branch to `master`, **WITHOUT** squashing + +```bash +git checkout master +git pull +git merge develop +git push +``` + +1. Add git tag to the latest commit in `master` branch + +```bash +git tag "$VERSION" +git push origin "$VERSION" +``` + +1. Update version in `develop` branch **after release**: + +```bash +git checkout develop + +NEXT_VERSION=$(echo "$VERSION" | awk -F. '/[0-9]+\./{$NF++;print}' OFS=.) +echo "$NEXT_VERSION" > onetl/VERSION + +git add . +git commit -m "Bump version" +git push +``` diff --git a/mkdocs/docs/en/db_/index.md b/mkdocs/docs/en/db_/index.md new file mode 100644 index 000000000..99ef0e297 --- /dev/null +++ b/mkdocs/docs/en/db_/index.md @@ -0,0 +1,9 @@ +# DB + +```{toctree} +:caption: DB classes +:maxdepth: 1 + +db_reader +db_writer +``` diff --git a/mkdocs/docs/en/db_/reader.md b/mkdocs/docs/en/db_/reader.md new file mode 100644 index 000000000..7060b9ba0 --- /dev/null +++ b/mkdocs/docs/en/db_/reader.md @@ -0,0 +1,20 @@ +# DB Reader + + +```{eval-rst} +.. 
currentmodule:: onetl.db.db_reader.db_reader +``` + +```{eval-rst} +.. autosummary:: + + DBReader + DBReader.run + DBReader.has_data + DBReader.raise_if_no_data +``` + +```{eval-rst} +.. autoclass:: DBReader + :members: run, has_data, raise_if_no_data +``` diff --git a/mkdocs/docs/en/db_/writer.md b/mkdocs/docs/en/db_/writer.md new file mode 100644 index 000000000..6296364de --- /dev/null +++ b/mkdocs/docs/en/db_/writer.md @@ -0,0 +1,17 @@ +# DB Writer + +```{eval-rst} +.. currentmodule:: onetl.db.db_writer.db_writer +``` + +```{eval-rst} +.. autosummary:: + + DBWriter + DBWriter.run +``` + +```{eval-rst} +.. autoclass:: DBWriter + :members: run +``` diff --git a/mkdocs/docs/en/file/file_downloader/file_downloader.md b/mkdocs/docs/en/file/file_downloader/file_downloader.md new file mode 100644 index 000000000..46fa1b406 --- /dev/null +++ b/mkdocs/docs/en/file/file_downloader/file_downloader.md @@ -0,0 +1,21 @@ +(file-downloader)= + +# File Downloader + +```{eval-rst} +.. currentmodule:: onetl.file.file_downloader.file_downloader +``` + +```{eval-rst} +.. autosummary:: + + FileDownloader + FileDownloader.run + FileDownloader.view_files +``` + +```{eval-rst} +.. autoclass:: FileDownloader + :members: run, view_files + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file/file_downloader/index.md b/mkdocs/docs/en/file/file_downloader/index.md new file mode 100644 index 000000000..01b36be4c --- /dev/null +++ b/mkdocs/docs/en/file/file_downloader/index.md @@ -0,0 +1,10 @@ +# File Downloader { #file-downloader-0 } + +```{toctree} +:caption: File Downloader +:maxdepth: 1 + +file_downloader +options +result +``` diff --git a/mkdocs/docs/en/file/file_downloader/options.md b/mkdocs/docs/en/file/file_downloader/options.md new file mode 100644 index 000000000..2189236b1 --- /dev/null +++ b/mkdocs/docs/en/file/file_downloader/options.md @@ -0,0 +1,14 @@ +(file-downloader-options)= + +# File Downloader Options + +```{eval-rst} +.. 
currentmodule:: onetl.file.file_downloader.options +``` + +```{eval-rst} +.. autopydantic_model:: FileDownloaderOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/file/file_downloader/result.md b/mkdocs/docs/en/file/file_downloader/result.md new file mode 100644 index 000000000..cd8b0181c --- /dev/null +++ b/mkdocs/docs/en/file/file_downloader/result.md @@ -0,0 +1,10 @@ +# File Downloader Result { #file-downloader-result } + +```{eval-rst} +.. currentmodule:: onetl.file.file_downloader.result +``` + +```{eval-rst} +.. autoclass:: DownloadResult + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json +``` diff --git a/mkdocs/docs/en/file/file_filters/base.md b/mkdocs/docs/en/file/file_filters/base.md new file mode 100644 index 000000000..375bb9a9c --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/base.md @@ -0,0 +1,19 @@ +(base-file-filter)= + +# Base interface + +```{eval-rst} +.. currentmodule:: onetl.base.base_file_filter +``` + +```{eval-rst} +.. autosummary:: + + BaseFileFilter + BaseFileFilter.match +``` + +```{eval-rst} +.. autoclass:: BaseFileFilter + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/exclude_dir.md b/mkdocs/docs/en/file/file_filters/exclude_dir.md new file mode 100644 index 000000000..6b9aaf80e --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/exclude_dir.md @@ -0,0 +1,12 @@ +(exclude-dir-filter)= + +# ExcludeDir + +```{eval-rst} +.. currentmodule:: onetl.file.filter.exclude_dir +``` + +```{eval-rst} +.. 
autoclass:: ExcludeDir + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/file_filter.md b/mkdocs/docs/en/file/file_filters/file_filter.md new file mode 100644 index 000000000..6ef16e7b1 --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/file_filter.md @@ -0,0 +1,12 @@ +(file-filter)= + +# File Filter (legacy) + +```{eval-rst} +.. currentmodule:: onetl.core.file_filter.file_filter +``` + +```{eval-rst} +.. autoclass:: FileFilter + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/file_mtime_filter.md b/mkdocs/docs/en/file/file_filters/file_mtime_filter.md new file mode 100644 index 000000000..4414d9bac --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/file_mtime_filter.md @@ -0,0 +1,12 @@ +(file-modificatiom-time)= + +# FileModifiedTime + +```{eval-rst} +.. currentmodule:: onetl.file.filter.file_mtime +``` + +```{eval-rst} +.. autoclass:: FileModifiedTime + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/file_size_filter.md b/mkdocs/docs/en/file/file_filters/file_size_filter.md new file mode 100644 index 000000000..a8eed5e22 --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/file_size_filter.md @@ -0,0 +1,12 @@ +(file-size-range)= + +# FileSizeRange + +```{eval-rst} +.. currentmodule:: onetl.file.filter.file_size +``` + +```{eval-rst} +.. autoclass:: FileSizeRange + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/glob.md b/mkdocs/docs/en/file/file_filters/glob.md new file mode 100644 index 000000000..9234b7d3c --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/glob.md @@ -0,0 +1,12 @@ +(glob-filter)= + +# Glob + +```{eval-rst} +.. currentmodule:: onetl.file.filter.glob +``` + +```{eval-rst} +.. 
autoclass:: Glob + :members: match +``` diff --git a/mkdocs/docs/en/file/file_filters/index.md b/mkdocs/docs/en/file/file_filters/index.md new file mode 100644 index 000000000..d6b12f0dd --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/index.md @@ -0,0 +1,29 @@ +(file-filters)= + +# File Filters + +```{toctree} +:caption: File filters +:maxdepth: 1 + +glob +regexp +exclude_dir +file_size_filter +file_mtime_filter +``` + +```{toctree} +:caption: Legacy +:maxdepth: 1 + +file_filter +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +base +match_all_filters +``` diff --git a/mkdocs/docs/en/file/file_filters/match_all_filters.md b/mkdocs/docs/en/file/file_filters/match_all_filters.md new file mode 100644 index 000000000..26818a46d --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/match_all_filters.md @@ -0,0 +1,11 @@ +(match-all-filters)= + +# match_all_filters + +```{eval-rst} +.. currentmodule:: onetl.file.filter.match_all_filters +``` + +```{eval-rst} +.. autofunction:: match_all_filters +``` diff --git a/mkdocs/docs/en/file/file_filters/regexp.md b/mkdocs/docs/en/file/file_filters/regexp.md new file mode 100644 index 000000000..e62fe9345 --- /dev/null +++ b/mkdocs/docs/en/file/file_filters/regexp.md @@ -0,0 +1,12 @@ +(regexp-filter)= + +# Regexp + +```{eval-rst} +.. currentmodule:: onetl.file.filter.regexp +``` + +```{eval-rst} +.. autoclass:: Regexp + :members: match +``` diff --git a/mkdocs/docs/en/file/file_limits/base.md b/mkdocs/docs/en/file/file_limits/base.md new file mode 100644 index 000000000..7990c28a9 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/base.md @@ -0,0 +1,21 @@ +(base-limit)= + +# Base interface + +```{eval-rst} +.. currentmodule:: onetl.base.base_file_limit +``` + +```{eval-rst} +.. autosummary:: + + BaseFileLimit + BaseFileLimit.reset + BaseFileLimit.stops_at + BaseFileLimit.is_reached +``` + +```{eval-rst} +.. 
autoclass:: BaseFileLimit + :members: reset, stops_at, is_reached +``` diff --git a/mkdocs/docs/en/file/file_limits/file_limit.md b/mkdocs/docs/en/file/file_limits/file_limit.md new file mode 100644 index 000000000..5034bf0f5 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/file_limit.md @@ -0,0 +1,12 @@ +(file-limit)= + +# File Limit (legacy) + +```{eval-rst} +.. currentmodule:: onetl.core.file_limit.file_limit +``` + +```{eval-rst} +.. autoclass:: FileLimit + :members: reset, stops_at, is_reached +``` diff --git a/mkdocs/docs/en/file/file_limits/index.md b/mkdocs/docs/en/file/file_limits/index.md new file mode 100644 index 000000000..52d226b3e --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/index.md @@ -0,0 +1,28 @@ +(file-limits)= + +# File Limits + +```{toctree} +:caption: File limits +:maxdepth: 1 + +max_files_count +total_files_size +``` + +```{toctree} +:caption: Legacy +:maxdepth: 1 + +file_limit +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +base +limits_stop_at +limits_reached +reset_limits +``` diff --git a/mkdocs/docs/en/file/file_limits/limits_reached.md b/mkdocs/docs/en/file/file_limits/limits_reached.md new file mode 100644 index 000000000..2d9a837b9 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/limits_reached.md @@ -0,0 +1,11 @@ +(limits-reached)= + +# limits_reached + +```{eval-rst} +.. currentmodule:: onetl.file.limit.limits_reached +``` + +```{eval-rst} +.. autofunction:: limits_reached +``` diff --git a/mkdocs/docs/en/file/file_limits/limits_stop_at.md b/mkdocs/docs/en/file/file_limits/limits_stop_at.md new file mode 100644 index 000000000..a53aba6d7 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/limits_stop_at.md @@ -0,0 +1,11 @@ +(limits-stop-at)= + +# limits_stop_at + +```{eval-rst} +.. currentmodule:: onetl.file.limit.limits_stop_at +``` + +```{eval-rst} +.. 
autofunction:: limits_stop_at +``` diff --git a/mkdocs/docs/en/file/file_limits/max_files_count.md b/mkdocs/docs/en/file/file_limits/max_files_count.md new file mode 100644 index 000000000..b82896718 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/max_files_count.md @@ -0,0 +1,12 @@ +(max-files-count)= + +# MaxFilesCount + +```{eval-rst} +.. currentmodule:: onetl.file.limit.max_files_count +``` + +```{eval-rst} +.. autoclass:: MaxFilesCount + :members: reset, stops_at, is_reached +``` diff --git a/mkdocs/docs/en/file/file_limits/reset_limits.md b/mkdocs/docs/en/file/file_limits/reset_limits.md new file mode 100644 index 000000000..a55769041 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/reset_limits.md @@ -0,0 +1,11 @@ +(reset-limits)= + +# reset_limits + +```{eval-rst} +.. currentmodule:: onetl.file.limit.reset_limits +``` + +```{eval-rst} +.. autofunction:: reset_limits +``` diff --git a/mkdocs/docs/en/file/file_limits/total_files_size.md b/mkdocs/docs/en/file/file_limits/total_files_size.md new file mode 100644 index 000000000..bc06c4f76 --- /dev/null +++ b/mkdocs/docs/en/file/file_limits/total_files_size.md @@ -0,0 +1,12 @@ +(total-files-size-limit)= + +# TotalFilesSize + +```{eval-rst} +.. currentmodule:: onetl.file.limit.total_files_size +``` + +```{eval-rst} +.. autoclass:: TotalFilesSize + :members: reset, stops_at, is_reached +``` diff --git a/mkdocs/docs/en/file/file_mover/file_mover.md b/mkdocs/docs/en/file/file_mover/file_mover.md new file mode 100644 index 000000000..39e2cb1b7 --- /dev/null +++ b/mkdocs/docs/en/file/file_mover/file_mover.md @@ -0,0 +1,21 @@ +(file-mover)= + +# File Mover + +```{eval-rst} +.. currentmodule:: onetl.file.file_mover.file_mover +``` + +```{eval-rst} +.. autosummary:: + + FileMover + FileMover.run + FileMover.view_files +``` + +```{eval-rst} +.. 
autoclass:: FileMover + :members: run, view_files + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file/file_mover/index.md b/mkdocs/docs/en/file/file_mover/index.md new file mode 100644 index 000000000..194adebc9 --- /dev/null +++ b/mkdocs/docs/en/file/file_mover/index.md @@ -0,0 +1,10 @@ +# File Mover { #file-mover-0 } + +```{toctree} +:caption: File Mover +:maxdepth: 1 + +file_mover +options +result +``` diff --git a/mkdocs/docs/en/file/file_mover/options.md b/mkdocs/docs/en/file/file_mover/options.md new file mode 100644 index 000000000..57d42d500 --- /dev/null +++ b/mkdocs/docs/en/file/file_mover/options.md @@ -0,0 +1,14 @@ +(file-mover-options)= + +# File Mover Options + +```{eval-rst} +.. currentmodule:: onetl.file.file_mover.options +``` + +```{eval-rst} +.. autopydantic_model:: FileMoverOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/file/file_mover/result.md b/mkdocs/docs/en/file/file_mover/result.md new file mode 100644 index 000000000..3e5698270 --- /dev/null +++ b/mkdocs/docs/en/file/file_mover/result.md @@ -0,0 +1,10 @@ +# File Mover Result { #file-mover-result } + +```{eval-rst} +.. currentmodule:: onetl.file.file_mover.result +``` + +```{eval-rst} +.. autoclass:: MoveResult + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json +``` diff --git a/mkdocs/docs/en/file/file_uploader/file_uploader.md b/mkdocs/docs/en/file/file_uploader/file_uploader.md new file mode 100644 index 000000000..c43d07141 --- /dev/null +++ b/mkdocs/docs/en/file/file_uploader/file_uploader.md @@ -0,0 +1,19 @@ +# File Uploader { #file-uploader-0 } + +```{eval-rst} +.. 
currentmodule:: onetl.file.file_uploader.file_uploader +``` + +```{eval-rst} +.. autosummary:: + + FileUploader + FileUploader.run + FileUploader.view_files +``` + +```{eval-rst} +.. autoclass:: FileUploader + :members: run, view_files + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file/file_uploader/index.md b/mkdocs/docs/en/file/file_uploader/index.md new file mode 100644 index 000000000..ada740442 --- /dev/null +++ b/mkdocs/docs/en/file/file_uploader/index.md @@ -0,0 +1,12 @@ +(file-uploader-root)= + +# File Uploader + +```{toctree} +:caption: File Uploader +:maxdepth: 1 + +file_uploader +options +result +``` diff --git a/mkdocs/docs/en/file/file_uploader/options.md b/mkdocs/docs/en/file/file_uploader/options.md new file mode 100644 index 000000000..8b987e1d8 --- /dev/null +++ b/mkdocs/docs/en/file/file_uploader/options.md @@ -0,0 +1,14 @@ +(file-uploader-options)= + +# File Uploader Options + +```{eval-rst} +.. currentmodule:: onetl.file.file_uploader.options +``` + +```{eval-rst} +.. autopydantic_model:: FileUploaderOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false +``` diff --git a/mkdocs/docs/en/file/file_uploader/result.md b/mkdocs/docs/en/file/file_uploader/result.md new file mode 100644 index 000000000..0e5233898 --- /dev/null +++ b/mkdocs/docs/en/file/file_uploader/result.md @@ -0,0 +1,10 @@ +# File Uploader Result { #file-uploader-result } + +```{eval-rst} +.. currentmodule:: onetl.file.file_uploader.result +``` + +```{eval-rst} +.. 
autoclass:: UploadResult + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json +``` diff --git a/mkdocs/docs/en/file/index.md b/mkdocs/docs/en/file/index.md new file mode 100644 index 000000000..49365dce5 --- /dev/null +++ b/mkdocs/docs/en/file/index.md @@ -0,0 +1,12 @@ +(file-root)= + +```{toctree} +:caption: File classes +:maxdepth: 1 + +file_downloader/index +file_uploader/index +file_mover/index +file_filters/index +file_limits/index +``` diff --git a/mkdocs/docs/en/file_df/file_df_reader/file_df_reader.md b/mkdocs/docs/en/file_df/file_df_reader/file_df_reader.md new file mode 100644 index 000000000..713916f77 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_reader/file_df_reader.md @@ -0,0 +1,13 @@ +(file-df-reader)= + +# FileDF Reader + +```{eval-rst} +.. currentmodule:: onetl.file.file_df_reader.file_df_reader +``` + +```{eval-rst} +.. autoclass:: FileDFReader + :members: run + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_df_reader/index.md b/mkdocs/docs/en/file_df/file_df_reader/index.md new file mode 100644 index 000000000..58ceadf22 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_reader/index.md @@ -0,0 +1,11 @@ +(file-df-reader-root)= + +# FileDF Reader { #filedf-reader-0} + +```{toctree} +:caption: FileDF Reader +:maxdepth: 1 + +file_df_reader +options +``` diff --git a/mkdocs/docs/en/file_df/file_df_reader/options.md b/mkdocs/docs/en/file_df/file_df_reader/options.md new file mode 100644 index 000000000..4318d288a --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_reader/options.md @@ -0,0 +1,13 @@ +(file-df-reader-options)= + +# Options + +```{eval-rst} +.. currentmodule:: onetl.file.file_df_reader.options +``` + +```{eval-rst} +.. 
autoclass:: FileDFReaderOptions + :members: recursive + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_df_writer/file_df_writer.md b/mkdocs/docs/en/file_df/file_df_writer/file_df_writer.md new file mode 100644 index 000000000..b600f045d --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_writer/file_df_writer.md @@ -0,0 +1,11 @@ +# FileDF Writer { #filedf-writer-0 } + +```{eval-rst} +.. currentmodule:: onetl.file.file_df_writer.file_df_writer +``` + +```{eval-rst} +.. autoclass:: FileDFWriter + :members: run + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_df_writer/index.md b/mkdocs/docs/en/file_df/file_df_writer/index.md new file mode 100644 index 000000000..473031415 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_writer/index.md @@ -0,0 +1,11 @@ +(file-df-writer-root)= + +# FileDF Writer + +```{toctree} +:caption: FileDF Writer +:maxdepth: 1 + +file_df_writer +options +``` diff --git a/mkdocs/docs/en/file_df/file_df_writer/options.md b/mkdocs/docs/en/file_df/file_df_writer/options.md new file mode 100644 index 000000000..7afee67c7 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_df_writer/options.md @@ -0,0 +1,13 @@ +(file-df-writer-options)= + +# Options + +```{eval-rst} +.. currentmodule:: onetl.file.file_df_writer.options +``` + +```{eval-rst} +.. autoclass:: FileDFWriterOptions + :members: if_exists, partition_by + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/avro.md b/mkdocs/docs/en/file_df/file_formats/avro.md new file mode 100644 index 000000000..528e3810a --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/avro.md @@ -0,0 +1,13 @@ +(avro-file-format)= + +# Avro + +```{eval-rst} +.. currentmodule:: onetl.file.format.avro +``` + +```{eval-rst} +.. 
autoclass:: Avro + :members: get_packages, parse_column, serialize_column, schema_dict, schema_url, recordName,recordNamespace,compression,mode,datetimeRebaseMode,positionalFieldMatching,enableStableIdentifiersForUnionType + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/base.md b/mkdocs/docs/en/file_df/file_formats/base.md new file mode 100644 index 000000000..91a175871 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/base.md @@ -0,0 +1,19 @@ +(base-file-format)= + +# Base interface + +```{eval-rst} +.. currentmodule:: onetl.base.base_file_format +``` + +```{eval-rst} +.. autoclass:: BaseReadableFileFormat + :members: check_if_supported, apply_to_reader + :member-order: bysource +``` + +```{eval-rst} +.. autoclass:: BaseWritableFileFormat + :members: check_if_supported, apply_to_writer + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/csv.md b/mkdocs/docs/en/file_df/file_formats/csv.md new file mode 100644 index 000000000..30098ea66 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/csv.md @@ -0,0 +1,13 @@ +(csv-file-format)= + +# CSV + +```{eval-rst} +.. currentmodule:: onetl.file.format.csv +``` + +```{eval-rst} +.. autoclass:: CSV + :members: __init__, parse_column, serialize_column, charToEscapeQuoteEscaping,columnNameOfCorruptRecord,comment,compression,dateFormat,delimiter,emptyValue,enforceSchema,escapeQuotes,header,ignoreLeadingWhiteSpace,ignoreTrailingWhiteSpace,inferSchema,locale,maxCharsPerColumn,mode,multiLine,nanValue,negativeInf,nullValue,positiveInf,preferDate,quote,quoteAll,samplingRatio,timestampFormat,timestampNTZFormat,unescapedQuoteHandling + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/excel.md b/mkdocs/docs/en/file_df/file_formats/excel.md new file mode 100644 index 000000000..98e96003e --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/excel.md @@ -0,0 +1,13 @@ +(excel-file-format)= + +# Excel + +```{eval-rst} +.. 
currentmodule:: onetl.file.format.excel +``` + +```{eval-rst} +.. autoclass:: Excel + :members: get_packages,header,dataAddress,treatEmptyValuesAsNulls,setErrorCellsToFallbackValues,usePlainNumberFormat,inferSchema,timestampFormat,maxRowsInMemory,maxByteArraySize,tempFileThreshold,excerptSize,workbookPassword + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/index.md b/mkdocs/docs/en/file_df/file_formats/index.md new file mode 100644 index 000000000..155cb7121 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/index.md @@ -0,0 +1,24 @@ +(file-formats)= + +# File Formats + +```{toctree} +:caption: File formats +:maxdepth: 1 + +avro +csv +excel +json +jsonline +orc +parquet +xml +``` + +```{toctree} +:caption: For developers +:maxdepth: 1 + +base +``` diff --git a/mkdocs/docs/en/file_df/file_formats/json.md b/mkdocs/docs/en/file_df/file_formats/json.md new file mode 100644 index 000000000..976855782 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/json.md @@ -0,0 +1,13 @@ +(json-file-format)= + +# JSON + +```{eval-rst} +.. currentmodule:: onetl.file.format.json +``` + +```{eval-rst} +.. autoclass:: JSON + :members: __init__, parse_column, serialize_column, allowBackslashEscapingAnyCharacter,allowComments,allowNonNumericNumbers,allowNumericLeadingZeros,allowSingleQuotes,allowUnquotedControlChars,allowUnquotedFieldNames,columnNameOfCorruptRecord,dateFormat,dropFieldIfAllNull,encoding,lineSep,locale,mode,prefersDecimal,primitivesAsString,samplingRatio,timestampFormat,timestampNTZFormat,timezone + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/jsonline.md b/mkdocs/docs/en/file_df/file_formats/jsonline.md new file mode 100644 index 000000000..6a3794de7 --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/jsonline.md @@ -0,0 +1,13 @@ +(jsonline-file-format)= + +# JSONLine + +```{eval-rst} +.. currentmodule:: onetl.file.format.jsonline +``` + +```{eval-rst} +.. 
autoclass:: JSONLine + :members: __init__, allowBackslashEscapingAnyCharacter,allowComments,allowNonNumericNumbers,allowNumericLeadingZeros,allowSingleQuotes,allowUnquotedControlChars,allowUnquotedFieldNames,columnNameOfCorruptRecord,compression,dateFormat,dropFieldIfAllNull,encoding,ignoreNullFields,lineSep,locale,mode,prefersDecimal,primitivesAsString,samplingRatio,timestampFormat,timestampNTZFormat,timezone + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/orc.md b/mkdocs/docs/en/file_df/file_formats/orc.md new file mode 100644 index 000000000..be830b88f --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/orc.md @@ -0,0 +1,13 @@ +(orc-file-format)= + +# ORC + +```{eval-rst} +.. currentmodule:: onetl.file.format.orc +``` + +```{eval-rst} +.. autoclass:: ORC + :members: __init__, mergeSchema,compression + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/parquet.md b/mkdocs/docs/en/file_df/file_formats/parquet.md new file mode 100644 index 000000000..2d9b91bbc --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/parquet.md @@ -0,0 +1,13 @@ +(parquet-file-format)= + +# Parquet + +```{eval-rst} +.. currentmodule:: onetl.file.format.parquet +``` + +```{eval-rst} +.. autoclass:: Parquet + :members: __init__, mergeSchema,compression + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/file_formats/xml.md b/mkdocs/docs/en/file_df/file_formats/xml.md new file mode 100644 index 000000000..92b127bce --- /dev/null +++ b/mkdocs/docs/en/file_df/file_formats/xml.md @@ -0,0 +1,13 @@ +(xml-file-format)= + +# XML + +```{eval-rst} +.. currentmodule:: onetl.file.format.xml +``` + +```{eval-rst} +.. 
autoclass:: XML + :members: get_packages, parse_column,arrayElementName,attributePrefix,charset,columnNameOfCorruptRecord,compression,dateFormat,declaration,excludeAttribute,ignoreNamespace,ignoreSurroundingSpaces,inferSchema,mode,nullValue,rootTag,row_tag,rowValidationXSDPath,samplingRatio,timestampFormat,valueTag,wildcardColName + :member-order: bysource +``` diff --git a/mkdocs/docs/en/file_df/index.md b/mkdocs/docs/en/file_df/index.md new file mode 100644 index 000000000..a955dcca5 --- /dev/null +++ b/mkdocs/docs/en/file_df/index.md @@ -0,0 +1,10 @@ +(file-df-root)= + +```{toctree} +:caption: File DataFrame classes +:maxdepth: 1 + +file_df_reader/index +file_df_writer/index +file_formats/index +``` diff --git a/mkdocs/docs/en/hooks/design.md b/mkdocs/docs/en/hooks/design.md new file mode 100644 index 000000000..963d61d39 --- /dev/null +++ b/mkdocs/docs/en/hooks/design.md @@ -0,0 +1,694 @@ +(hooks-design)= + +# High level design + +## What are hooks? + +Hook mechanism is a part of onETL which allows to inject some additional behavior into +existing methods of (almost) any class. + +### Features + +Hooks mechanism allows to: + +- Inspect and validate input arguments and output results of method call +- Access, modify or replace method call result (but NOT input arguments) +- Wrap method calls with a context manager and catch raised exceptions + +Hooks can be placed into {ref}`plugins`, allowing to modify onETL behavior by installing some additional package. + +### Limitations + +- Hooks can be bound to methods of a class only (not functions). +- Only methods decorated with {ref}`slot-decorator` implement hooks mechanism. These class and methods are marked as {{ support_hooks }}. +- Hooks can be bound to public methods only. 
+ +## Terms + +- {ref}`slot-decorator` - method of a class with a special decorator +- `Callback` - function which implements some additional logic which modifies slot behavior +- {ref}`hook-decorator` - wrapper around callback which stores hook state, priority and some useful methods +- `Hooks mechanism` - calling `Slot()` will call all enabled hooks which are bound to the slot. Implemented by {ref}`support-hooks-decorator`. + +## How to implement hooks? + +### TL;DR + +```python +from onetl.hooks import support_hooks, slot, hook + + +@support_hooks # enabling hook mechanism for the class +class MyClass: + def __init__(self, data): + self.data = data + + # this is slot + @slot + def method(self, arg): + pass + + +@MyClass.method.bind # bound hook to the slot +@hook # this is hook +def callback(obj, arg): # this is callback + print(obj.data, arg) + + +obj = MyClass(1) +obj.method(2) # will call callback(obj, 1) + +# prints "1 2" +``` + +#### Define a slot + +- Create a class with a method: + +```python +class MyClass: + def __init__(self, data): + self.data = data + + def method(self, arg): + return self.data, arg +``` + +- Add {ref}`slot-decorator` to the method: + +```python +from onetl.hooks import support_hooks, slot, hook + + +class MyClass: + @slot + def method(self, arg): + return self.data, arg +``` + +If method has other decorators like `@classmethod` or `@staticmethod`, `@slot` should be placed on the top: + +```python +from onetl.hooks import support_hooks, slot, hook + + +class MyClass: + @slot + @classmethod + def class_method(cls, arg): + return cls, arg + + @slot + @staticmethod + def static_method(arg): + return arg +``` + +- Add {ref}`support-hooks-decorator` to the class: + +```python +from onetl.hooks import support_hooks, slot, hook + + +@support_hooks +class MyClass: + @slot + def method(self, arg): + return self.data, arg +``` + +Slot is created. 
+ +#### Define a callback + +Define some function (a.k.a callback): + +```python +def callback(self, arg): + print(self.data, arg) +``` + +It should have signature *compatible* with `MyClass.method`. *Compatible* does not mean *exactly the same* - +for example, you can rename positional arguments: + +```python +def callback(obj, arg): + print(obj.data, arg) +``` + +Use `*args` and `**kwargs` to omit arguments you don't care about: + +```python +def callback(obj, *args, **kwargs): + print(obj.data, args, kwargs) +``` + +There is also an argument `method_name` which has a special meaning - +the method name which the callback is bound to is passed into this argument: + +```python +def callback(obj, *args, method_name: str, **kwargs): + print(obj.data, args, method_name, kwargs) +``` + +```{eval-rst} +.. note:: + + `method_name` should always be a keyword argument, **NOT** positional. +``` + +```{eval-rst} +.. warning:: + + If callback signature is not compatible with slot signature, an exception will be raised, + but **ONLY** while slot is called. +``` + +#### Define a hook + +Add {ref}`hook-decorator` to create a hook from your callback: + +```python +@hook +def callback(obj, arg): + print(obj.data, arg) +``` + +You can pass more options to the `@hook` decorator, like state or priority. +See decorator documentation for more details. + +#### Bind hook to the slot + +Use `Slot.bind` method to bind hook to the slot: + +```python +@MyClass.method.bind +@hook +def callback(obj, arg): + print(obj, arg) +``` + +You can bind more than one hook to the same slot, and bind same hook to multiple slots: + +```python +@MyClass.method1.bind +@MyClass.method2.bind +@hook +def callback1(obj, arg): + "Will be called by both MyClass.method1 and MyClass.method2" + + +@MyClass.method1.bind +@hook +def callback2(obj, arg): + "Will be called by MyClass.method1 too" +``` + +## How hooks are called? 
+ +### General + +Just call the method decorated by `@slot` to trigger the hook: + +```python +obj = MyClass(1) +obj.method(2) # will call callback(obj, 2) + +# prints "1 2" +``` + +There are some special callback types that have a slightly different behavior. + +### Context managers + +`@hook` decorator can be placed on a context manager class: + +```python +@hook +class ContextManager: + def __init__(self, obj, arg): + self.obj = obj + self.arg = arg + + def __enter__(self): + # do something on enter + print(self.obj.data, self.arg) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # do something on exit + return False +``` + +Context manager is entered while calling the `Slot()`, and exited when the call is finished. + +If present, method `process_result` has a special meaning - +it can receive `MyClass.method` call result, and also modify/replace it: + +```python +@hook +class ContextManager: + def __init__(self, obj, arg): + self.obj = obj + self.arg = arg + + def __enter__(self): + # do something on enter + print(self.obj.data, self.arg) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # do something on exit + return False + + def process_result(self, result): + # do something with method call result + return modified(result) +``` + +See examples below for more information. + +### Generator function + +`@hook` decorator can be placed on a generator function: + +```python +@hook +def callback(obj, arg): + print(obj.data, arg) + # this is called before original method body + + yield # method is called here + + # this is called after original method body +``` + +It is converted to a context manager, in the same manner as +[contextlib.contextmanager](https://docs.python.org/3/library/contextlib.html#contextlib.contextmanager). 
+ +Generator body can be wrapped with `try..except..finally` to catch exceptions: + +```python +@hook +def callback(obj, arg): + print(obj.data, arg) + + try: + # this is called before original method body + + yield # method is called here + except Exception as e: + process_exception(e) + finally: + # this is called after original method body + finalizer() +``` + +There is also a special syntax which allows generator to access and modify/replace method call result: + +``` +@hook +def callback(obj, arg): + original_result = yield # method is called here + + new_result = do_something(original_result) + + yield new_result # modify/replace the result +``` + +### Calling hooks in details + +- The callback will be called with the same arguments as the original method. + + - If slot is a regular method: + + ```python + callback_result = callback(self, *args, **kwargs) + ``` + + Here `self` is a class instance (`obj`). + + - If slot is a class method: + + ```python + callback_result = callback(cls, *args, **kwargs) + ``` + + Here `cls` is the class itself (`MyClass`). + + - If slot is a static method: + + ```python + callback_result = callback(*args, **kwargs) + ``` + + Neither object nor class are passed to the callback in this case. + +- If `callback_result` is a context manager, enter the context. Context manager can catch all the exceptions raised. + + > If there are multiple hooks bound to the slot, every context manager will be entered. + +- Then call the original method wrapped by `@slot`: + + ```python + original_result = method(*args, **kwargs) + ``` + +- Process `original_result`: + + - If `callback_result` object has method `process_result`, or is a generator wrapped with `@hook`, call it: + + ```python + new_result = callback_result.process_result(original_result) + ``` + + - Otherwise set `new_result = callback_result`. 
+ + - If there are multiple hooks bound to the method, pass `new_result` through the chain: + + ```python + new_result = callback1_result.process_result(original_result) + new_result = callback2_result.process_result(new_result or original_result) + new_result = callback3_result.process_result(new_result or original_result) + ``` + +- Finally return: + + ```python + return new_result or original_result + ``` + + All `None` values are ignored on every step above. + +- Exit all the context managers entered during the slot call. + +### Hooks priority + +Hooks are executed in the following order: + +1. Parent class slot + {obj}`FIRST ` +2. Inherited class slot + {obj}`FIRST ` +3. Parent class slot + {obj}`NORMAL ` +4. Inherited class slot + {obj}`NORMAL ` +5. Parent class slot + {obj}`LAST ` +6. Inherited class slot + {obj}`LAST ` + +Hooks with the same priority and inheritance will be executed in the same order they were registered (`Slot.bind` call). + +```{eval-rst} +.. note:: + + Calls of `super()` inside inherited class methods do not trigger hooks call. + Hooks are triggered only if method is called explicitly. + + This allows wrapping the entire slot call with a hook without influencing its internal logic. + +``` + +### Hook types + +Here are several examples of using hooks. These types are not exceptional, they can be mixed - for example, +hook can both modify method result and catch exceptions. + +#### Before hook + +Can be used for inspecting or validating input args of the original function: + +```python +@hook +def before1(obj, arg): + print(obj, arg) + # original method is called after exiting this function + + +@hook +def before2(obj, arg): + if arg == 1: + raise ValueError("arg=1 is not allowed") + return None # return None is the same as no return statement +``` + +Executed before calling the original method wrapped by `@slot`. +If hook raises an exception, method will not be called at all. 
+ +#### After hook + +Can be used for performing some actions after original method was successfully executed: + +```python +@hook +def after1(obj, arg): + yield # original method is called here + print(obj, arg) + + +@hook +def after2(obj, arg): + yield None # yielding None is the same as empty yield + if arg == 1: + raise ValueError("arg=1 is not allowed") +``` + +If original method raises an exception, the block of code after `yield` will not be called. + +#### Context hook + +Can be used for catching and handling some exceptions, or to determine that there was no exception during slot call: + +```{eval-rst} +.. tabs:: + + .. code-tab:: py Generator syntax + + # This is just the same as using @contextlib.contextmanager + + @hook + def context_generator(obj, arg): + try: + yield # original method is called here + print(obj, arg) # <-- this line will not be called if method raised an exception + except SomeException as e: + magic(e) + finally: + finalizer() + + .. code-tab:: py Context manager syntax + + @hook + class ContextManager: + def __init__(self, obj, args): + self.obj = obj + self.args = args + + def __enter__(self): + return self + + # original method is called between __enter__ and __exit__ + + def __exit__(self, exc_type, exc_value, traceback): + result = False + if exc_type is not None and isinstance(exc_value, SomeException): + magic(exc_value) + result = True # suppress exception + else: + print(self.obj, self.arg) + finalizer() + return result +``` + +```{eval-rst} +.. note:: + + Contexts are exited in the reverse order of the hook calls. + So if some hook raised an exception, it will be passed into the previous hook, not the next one. + + It is recommended to specify the proper priority for the hook, e.g. :obj:`FIRST ` +``` + +#### Replacing result hook + +Replaces the output result of the original method. + +Can be used for delegating some implementation details for third-party extensions. +See {ref}`hive` and {ref}`hdfs` as an example. 
+ +```python +@hook +def replace1(obj, arg): + result = arg + 10 # any non-None return result + + # original method call result is ignored, output will always be arg + 10 + return result + + +@hook +def replace2(obj, arg): + yield arg + 10 # same as above +``` + +```{eval-rst} +.. note:: + + If there are multiple hooks bound to the same slot, the result of last hook will be used. + It is recommended to specify the proper priority for the hook, e.g. :obj:`LAST ` + +``` + +#### Accessing result hook + +Can access output result of the original method and inspect or validate it: + +```{eval-rst} +.. tabs:: + + .. code-tab:: py Generator syntax + + @hook + def access_result(obj, arg): + result = yield # original method is called here, and result can be used in the hook + print(result) + yield # does not modify result + + .. code-tab:: py Context manager syntax + + @hook + class ModifiesResult: + def __init__(self, obj, args): + self.obj = obj + self.args = args + + def __enter__(self): + return self + + # original method is called between __enter__ and __exit__ + # result is passed into process_result method of context manager, if present + + def process_result(self, result): + print(result) # result can be used in the hook + return None # does not modify result. same as no return statement in the method + + def __exit__(self, exc_type, exc_value, traceback): + return False + +``` + +#### Modifying result hook + +Can access output result of the original method, and return the modified one: + +```{eval-rst} +.. tabs:: + + .. code-tab:: py Generator syntax + + @hook + def modifies_result(obj, arg): + result = yield # original method is called here, and result can be used in the hook + yield result + 10 # modify output result. None values are ignored + + .. 
code-tab:: py Context manager syntax + + @hook + class ModifiesResult: + def __init__(self, obj, args): + self.obj = obj + self.args = args + + def __enter__(self): + return self + + # original method is called between __enter__ and __exit__ + # result is passed into process_result method of context manager, if present + + def process_result(self, result): + print(result) # result can be used in the hook + return result + 10 # modify output result. None values are ignored + + def __exit__(self, exc_type, exc_value, traceback): + return False +``` + +```{eval-rst} +.. note:: + + If there are multiple hooks bound to the same slot, the result of last hook will be used. + It is recommended to specify the proper priority for the hook, e.g. :obj:`LAST ` + +``` + +## How to enable/disable hooks? + +You can enable/disable/temporary disable hooks on 4 different levels: + +- Manage global hooks state (level 1): + + > - {obj}`onetl.hooks.hooks_state.stop_all_hooks` + > - {obj}`onetl.hooks.hooks_state.resume_all_hooks` + > - {obj}`onetl.hooks.hooks_state.skip_all_hooks` + +- Manage all hooks bound to a specific class (level 2): + + > - {obj}`onetl.hooks.support_hooks.suspend_hooks` + > - {obj}`onetl.hooks.support_hooks.resume_hooks` + > - {obj}`onetl.hooks.support_hooks.skip_hooks` + +- Manage all hooks bound to a specific slot (level 3): + + > - {obj}`onetl.hooks.slot.Slot.suspend_hooks` + > - {obj}`onetl.hooks.slot.Slot.resume_hooks` + > - {obj}`onetl.hooks.slot.Slot.skip_hooks` + +- Manage state of a specific hook (level 4): + + > - {obj}`onetl.hooks.hook.Hook.enable` + > - {obj}`onetl.hooks.hook.Hook.disable` + +More details in the documentation above. + +```{eval-rst} +.. note:: + + All of these levels are independent. + + Calling `stop` on the level 1 has higher priority than level 2, and so on. + But calling `resume` on the level 1 does not automatically resume hooks stopped in the level 2, + they should be resumed explicitly. 
+ +``` + +## How to see logs of the hook mechanism? + +Hooks registration emits logs with `DEBUG` level: + +```python +from onetl.logs import setup_logging + +setup_logging() +``` + +```text +DEBUG |onETL| Registered hook 'mymodule.callback1' for 'MyClass.method' (enabled=True, priority=HookPriority.NORMAL) +DEBUG |onETL| Registered hook 'mymodule.callback2' for 'MyClass.method' (enabled=True, priority=HookPriority.NORMAL) +DEBUG |onETL| Registered hook 'mymodule.callback3' for 'MyClass.method' (enabled=False, priority=HookPriority.NORMAL) +``` + +But most of logs are emitted with even lower level `NOTICE`, to make output less verbose: + +```python +from onetl.logs import NOTICE, setup_logging + +setup_logging(level=NOTICE) +``` + +``` +NOTICE |Hooks| 2 hooks registered for 'MyClass.method' +NOTICE |Hooks| Calling hook 'mymodule.callback1' (1/2) +NOTICE |Hooks| Hook is finished with returning non-None result +NOTICE |Hooks| Calling hook 'mymodule.callback2' (2/2) +NOTICE |Hooks| This is a context manager, entering ... +NOTICE |Hooks| Calling original method 'MyClass.method' +NOTICE |Hooks| Method call is finished +NOTICE |Hooks| Method call result (*NOT* None) will be replaced with result of hook 'mymodule.callback1' +NOTICE |Hooks| Passing result to 'process_result' method of context manager 'mymodule.callback2' +NOTICE |Hooks| Method call result (*NOT* None) is modified by hook! +``` diff --git a/mkdocs/docs/en/hooks/global_state.md b/mkdocs/docs/en/hooks/global_state.md new file mode 100644 index 000000000..f4fd50a43 --- /dev/null +++ b/mkdocs/docs/en/hooks/global_state.md @@ -0,0 +1,27 @@ +(hooks-global-state)= + +# Hooks global state + +```{eval-rst} +.. currentmodule:: onetl.hooks.hooks_state +``` + +```{eval-rst} +.. autosummary:: + + skip_all_hooks + stop_all_hooks + resume_all_hooks +``` + +```{eval-rst} +.. autofunction:: skip_all_hooks +``` + +```{eval-rst} +.. autofunction:: stop_all_hooks +``` + +```{eval-rst} +.. 
autofunction:: resume_all_hooks +``` diff --git a/mkdocs/docs/en/hooks/hook.md b/mkdocs/docs/en/hooks/hook.md new file mode 100644 index 000000000..8dc54cf56 --- /dev/null +++ b/mkdocs/docs/en/hooks/hook.md @@ -0,0 +1,34 @@ +(hook-decorator)= + +# `@hook` decorator + +```{eval-rst} +.. currentmodule:: onetl.hooks.hook +``` + +```{eval-rst} +.. autosummary:: + + hook + HookPriority + Hook + Hook.enable + Hook.disable + Hook.skip +``` + +```{eval-rst} +.. autodecorator:: hook +``` + +```{eval-rst} +.. autoclass:: HookPriority + :members: FIRST, NORMAL, LAST + :member-order: bysource +``` + +```{eval-rst} +.. autoclass:: Hook + :members: enable, disable, skip + :member-order: bysource +``` diff --git a/mkdocs/docs/en/hooks/index.md b/mkdocs/docs/en/hooks/index.md new file mode 100644 index 000000000..6d8218027 --- /dev/null +++ b/mkdocs/docs/en/hooks/index.md @@ -0,0 +1,15 @@ +# Hooks + +:octicons-versions-16: **version added 0.6.0** + + +```{toctree} +:caption: Hooks +:maxdepth: 1 + +High level design +@hook decorator +@slot decorator +@support_hooks decorator +Hooks global state +``` diff --git a/mkdocs/docs/en/hooks/slot.md b/mkdocs/docs/en/hooks/slot.md new file mode 100644 index 000000000..ac3c8d74b --- /dev/null +++ b/mkdocs/docs/en/hooks/slot.md @@ -0,0 +1,26 @@ +(slot-decorator)= + +# `@slot` decorator + +```{eval-rst} +.. currentmodule:: onetl.hooks.slot +``` + +```{eval-rst} +.. autosummary:: + + slot + Slot + Slot.bind + Slot.skip_hooks + Slot.suspend_hooks + Slot.resume_hooks +``` + +```{eval-rst} +.. autodecorator:: slot +``` + +```{eval-rst} +.. autoprotocol:: Slot +``` diff --git a/mkdocs/docs/en/hooks/support_hooks.md b/mkdocs/docs/en/hooks/support_hooks.md new file mode 100644 index 000000000..a2fd9fb60 --- /dev/null +++ b/mkdocs/docs/en/hooks/support_hooks.md @@ -0,0 +1,32 @@ +(support-hooks-decorator)= + +# `@support_hooks` decorator + +```{eval-rst} +.. currentmodule:: onetl.hooks.support_hooks +``` + +```{eval-rst} +.. 
autosummary:: + + support_hooks + skip_hooks + suspend_hooks + resume_hooks +``` + +```{eval-rst} +.. autodecorator:: support_hooks +``` + +```{eval-rst} +.. autofunction:: skip_hooks +``` + +```{eval-rst} +.. autofunction:: suspend_hooks +``` + +```{eval-rst} +.. autofunction:: resume_hooks +``` diff --git a/mkdocs/docs/en/hwm_store/index.md b/mkdocs/docs/en/hwm_store/index.md new file mode 100644 index 000000000..cc9630c5d --- /dev/null +++ b/mkdocs/docs/en/hwm_store/index.md @@ -0,0 +1,14 @@ +# HWM + +Since onETL v0.10.0, the `HWMStore` and `HWM` classes have been moved to a separate library {etl-entities}`etl-entities <>`. + +The only class was left intact is {ref}`yaml-hwm-store`, **which is default** in onETL. + +Other known implementation is [HorizonHWMStore](https://horizon-hwm-store.readthedocs.io/). + +```{toctree} +:hidden: true +:maxdepth: 2 + +yaml_hwm_store +``` diff --git a/mkdocs/docs/en/hwm_store/yaml_hwm_store.md b/mkdocs/docs/en/hwm_store/yaml_hwm_store.md new file mode 100644 index 000000000..63beb672e --- /dev/null +++ b/mkdocs/docs/en/hwm_store/yaml_hwm_store.md @@ -0,0 +1,12 @@ +(yaml-hwm-store)= + +# YAMLHWMStore + +```{eval-rst} +.. currentmodule:: onetl.hwm.store.yaml_hwm_store +``` + +```{eval-rst} +.. 
autoclass:: YAMLHWMStore + :members: get_hwm, set_hwm, __enter__ +``` diff --git a/mkdocs/docs/en/index.md b/mkdocs/docs/en/index.md new file mode 100644 index 000000000..aea080ea2 --- /dev/null +++ b/mkdocs/docs/en/index.md @@ -0,0 +1,18 @@ +# onETL + +{{ repo_status_badge }} +{{ pypi_release_bage }} +{{ pypi_license_bage }} +{{ pypi_pyversion_bage }} +{{ pypi_downloads_bage }} + +{{ docs_status_badge }} +{{ ci_status_badge }} +{{ precommit_badge }} + + +{{ onetl_logo_wide }} + +----8<---- +../mddocs/docs/en/snippet_0.md +----8<---- diff --git a/mkdocs/docs/en/install/files.md b/mkdocs/docs/en/install/files.md new file mode 100644 index 000000000..7a9867f53 --- /dev/null +++ b/mkdocs/docs/en/install/files.md @@ -0,0 +1,9 @@ +(install-files)= + +# File connections + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _files-install: + :end-before: With Kerberos support +``` diff --git a/mkdocs/docs/en/install/full.md b/mkdocs/docs/en/install/full.md new file mode 100644 index 000000000..f6d1cc107 --- /dev/null +++ b/mkdocs/docs/en/install/full.md @@ -0,0 +1,9 @@ +(install-full)= + +# Full bundle + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _full-bundle: + :end-before: .. _quick-start: +``` diff --git a/mkdocs/docs/en/install/index.md b/mkdocs/docs/en/install/index.md new file mode 100644 index 000000000..d71a13de9 --- /dev/null +++ b/mkdocs/docs/en/install/index.md @@ -0,0 +1,22 @@ +(install)= + +# How to install + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. 
_minimal-install: + :end-before: With DB and FileDF connections +``` + +## Installation in details + +```{toctree} +:caption: How to install +:maxdepth: 1 + +self +spark +files +kerberos +full +``` diff --git a/mkdocs/docs/en/install/kerberos.md b/mkdocs/docs/en/install/kerberos.md new file mode 100644 index 000000000..7c772b94a --- /dev/null +++ b/mkdocs/docs/en/install/kerberos.md @@ -0,0 +1,9 @@ +(install-kerberos)= + +# Kerberos support + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _kerberos-install: + :end-before: Full bundle +``` diff --git a/mkdocs/docs/en/install/spark.md b/mkdocs/docs/en/install/spark.md new file mode 100644 index 000000000..aa092ea24 --- /dev/null +++ b/mkdocs/docs/en/install/spark.md @@ -0,0 +1,328 @@ +(install-spark)= + +# Spark + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _spark-install: + :end-before: .. _java-install: +``` + +## Installing Java + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _java-install: + :end-before: .. _pyspark-install: +``` + +## Installing PySpark + +```{eval-rst} +.. include:: ../../README.rst + :start-after: .. _pyspark-install: + :end-before: With File connections +``` + +(java-packages)= + +## Injecting Java packages + +Some DB and FileDF connection classes require specific packages to be inserted to `CLASSPATH` of Spark session, +like JDBC drivers. + +This is usually done by setting up `spark.jars.packages` option while creating Spark session: + +```python +# here is a list of packages to be downloaded: +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +Spark automatically resolves package and all its dependencies, download them and inject to Spark session +(both driver and all executors). 
+ +This requires internet access, because package metadata and `.jar` files are fetched from [Maven Repository](https://mvnrepository.com/). + +But sometimes it is required to: + +- Install package without direct internet access (isolated network) +- Install package which is not available in Maven + +There are several ways to do that. + +### Using `spark.jars` + +The most simple solution, but this requires to store raw `.jar` files somewhere on filesystem or web server. + +- Download `package.jar` files (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- (For `spark.submit.deployMode=cluster`) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See [official documentation](https://spark.apache.org/docs/latest/submitting-applications.html#advanced-dependency-management) for more details. +- Create Spark session with passing `.jar` absolute file path to `spark.jars` Spark config option: + +```{eval-rst} +.. tabs:: + + .. code-tab:: py for spark.submit.deployMode=client (default) + + jar_files = ["/path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + + .. code-tab:: py for spark.submit.deployMode=cluster + + # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar + jar_files = ["hdfs:///path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) +``` + +### Using `spark.jars.repositories` + +```{eval-rst} +.. note:: + + In this case Spark still will try to fetch packages from the internet, so if you don't have internet access, + Spark session will be created with significant delay because of all attempts to fetch packages. 
+``` + +Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. + +- Setup private Maven repository in [JFrog Artifactory](https://jfrog.com/artifactory/) or [Sonatype Nexus](https://www.sonatype.com/products/sonatype-nexus-repository). +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Upload `package.jar` file to private repository (with same `groupId` and `artifactoryId` as in source package in Maven). +- Pass repo URL to `spark.jars.repositories` Spark config option. +- Create Spark session with passing Package name to `spark.jars.packages` Spark config option: + +```python +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Using `spark.jars.ivySettings` + +Same as above, but can be used even if there is no network access to public repos like Maven. + +- Setup private Maven repository in [JFrog Artifactory](https://jfrog.com/artifactory/) or [Sonatype Nexus](https://www.sonatype.com/products/sonatype-nexus-repository). +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Upload `package.jar` file to [private repository](https://help.sonatype.com/repomanager3/nexus-repository-administration/repository-management#RepositoryManagement-HostedRepository) (with same `groupId` and `artifactoryId` as in source package in Maven). +- Create `ivysettings.xml` file (see below). +- Add here a resolver with repository URL (and credentials, if required). +- Pass `ivysettings.xml` absolute path to `spark.jars.ivySettings` Spark config option. 
+- Create Spark session with passing package name to `spark.jars.packages` Spark config option: + +```{eval-rst} +.. tabs:: + + .. code-tab:: xml ivysettings-all-packages-uploaded-to-nexus.xml + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-in-maven.xml + + + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-nexus-with-auth-required.xml + + + + + + + + + + + + + + + + + + + + + +``` + +```{code-block} python +:caption: script.py + +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Place `.jar` file to `-/.ivy2/jars/` + +Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- Move it to `-/.ivy2/jars/` folder. +- Create Spark session with passing package name to `spark.jars.packages` Spark config option: + +```python +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Place `.jar` file to Spark jars folder + +```{eval-rst} +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. 
+ + Usually this is used only with either: + * `spark.master=local` (driver and executors are running on the same host), + * `spark.master=k8s://...` (`.jar` files are added to image or to volume mounted to all pods). +``` + +Can be used to embed `.jar` files to a default Spark classpath. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- Move it to `$SPARK_HOME/jars/` folder, e.g. `^/.local/lib/python3.7/site-packages/pyspark/jars/` or `/opt/spark/3.2.3/jars/`. +- Create Spark session **WITHOUT** passing Package name to `spark.jars.packages` + +```python +# no need to set spark.jars.packages or any other spark.jars.* option +# all jars already present in CLASSPATH, and loaded automatically + +spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate() +``` + +### Manually adding `.jar` files to `CLASSPATH` + +```{eval-rst} +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. + + Usually this is used only with either: + * `spark.master=local` (driver and executors are running on the same host), + * `spark.master=k8s://...` (`.jar` files are added to image or to volume mounted to all pods). +``` + +Can be used to embed `.jar` files to a default Java classpath. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Set environment variable `CLASSPATH` to `/path/to/package.jar`. 
You can set multiple file paths, separated by the OS path delimiter (`;` on Windows, `:` on Linux).
+ 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Got Spark field: StructField('updated_at', TimestampType(), True) + 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Detected HWM type: 'ColumnDateTimeHWM' + 2024-04-12 10:12:11,643 [INFO ] MainThread: |IncrementalStrategy| Fetching HWM from HorizonHWMStore: + 2024-04-12 10:12:11,643 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb' + 2024-04-12 10:12:12,181 [INFO ] MainThread: |IncrementalStrategy| Fetched HWM: + 2024-04-12 10:12:12,182 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 10:12:12,182 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,182 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,182 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,184 [INFO ] MainThread: value = datetime.datetime(2024, 4, 11, 18, 10, 2, 120000), + 2024-04-12 10:12:12,184 [INFO ] MainThread: ) + 2024-04-12 10:12:12,184 [INFO ] MainThread: |MSSQL| -> |Spark| Reading DataFrame from source using parameters: + 2024-04-12 10:12:12,185 [INFO ] MainThread: source = 'source_schema.table' + 2024-04-12 10:12:12,185 [INFO ] MainThread: columns = [ + 2024-04-12 10:12:12,185 [INFO ] MainThread: 'id', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'new_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'old_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'updated_at', + 2024-04-12 10:12:12,186 [INFO ] MainThread: ] + 2024-04-12 10:12:12,187 [INFO ] MainThread: where = "field = 'some'" + 2024-04-12 10:12:12,187 [INFO ] MainThread: hwm = AutoDetectHWM( + 2024-04-12 10:12:12,187 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,187 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,187 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,188 [INFO ] MainThread: ) + 
2024-04-12 10:12:12,188 [INFO ] MainThread: options = { + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'fetchsize': 100000, + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'numPartitions': 1, + 2024-04-12 10:12:12,189 [INFO ] MainThread: 'partitioningMode': 'range', + 2024-04-12 10:12:12,189 [INFO ] MainThread: } + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Checking connection availability... + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Using connection parameters: + 2024-04-12 10:12:12,190 [INFO ] MainThread: user = 'db_user' + 2024-04-12 10:12:12,190 [INFO ] MainThread: password = SecretStr('**********') + 2024-04-12 10:12:12,190 [INFO ] MainThread: host = 'mssql.host' + 2024-04-12 10:12:12,190 [INFO ] MainThread: port = 1433 + 2024-04-12 10:12:12,191 [INFO ] MainThread: database = 'somedb' + 2024-04-12 10:12:12,191 [INFO ] MainThread: extra = {'applicationIntent': 'ReadOnly', 'trustServerCertificate': 'true'} + 2024-04-12 10:12:12,191 [INFO ] MainThread: jdbc_url = 'jdbc:sqlserver:/mssql.host:1433' + 2024-04-12 10:12:12,579 [INFO ] MainThread: |MSSQL| Connection is available. 
+ 2024-04-12 10:12:12,581 [INFO ] MainThread: |MSSQL| Executing SQL query (on driver): + 2024-04-12 10:12:12,581 [INFO ] MainThread: SELECT + 2024-04-12 10:12:12,581 [INFO ] MainThread: MIN(updated_at) AS "min", + 2024-04-12 10:12:12,582 [INFO ] MainThread: MAX(updated_at) AS "max" + 2024-04-12 10:12:12,582 [INFO ] MainThread: FROM + 2024-04-12 10:12:12,582 [INFO ] MainThread: source_schema.table + 2024-04-12 10:12:12,582 [INFO ] MainThread: WHERE + 2024-04-12 10:12:12,582 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:12:12,583 [INFO ] MainThread: AND + 2024-04-12 10:12:12,583 [INFO ] MainThread: (updated_at >= CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,537 [INFO ] MainThread: |MSSQL| Received values: + 2024-04-12 10:16:22,538 [INFO ] MainThread: MIN(updated_at) = datetime.datetime(2024, 4, 11, 21, 10, 7, 397000) + 2024-04-12 10:16:22,538 [INFO ] MainThread: MAX(updated_at) = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000) + 2024-04-12 10:16:22,540 [INFO ] MainThread: |MSSQL| Executing SQL query (on executor): + 2024-04-12 10:16:22,540 [INFO ] MainThread: SELECT + 2024-04-12 10:16:22,540 [INFO ] MainThread: id, + 2024-04-12 10:16:22,541 [INFO ] MainThread: new_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: old_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: updated_at + 2024-04-12 10:16:22,541 [INFO ] MainThread: FROM + 2024-04-12 10:16:22,541 [INFO ] MainThread: source_schema.table + 2024-04-12 10:16:22,542 [INFO ] MainThread: WHERE + 2024-04-12 10:16:22,542 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at > CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at <= CAST('2024-04-12T13:12:02.123000' AS datetime2)) + 2024-04-12 10:16:22,892 [INFO ] MainThread: |Spark| DataFrame successfully created from SQL statement + 
2024-04-12 10:16:22,892 [INFO ] MainThread: ------------------------------------ DBReader.run() ends ------------------------------------ + 2024-04-12 10:40:42,409 [INFO ] MainThread: =================================== DBWriter.run() starts =================================== + 2024-04-12 10:40:42,409 [INFO ] MainThread: |Spark| -> |Hive| Writing DataFrame to target using parameters: + 2024-04-12 10:40:42,410 [INFO ] MainThread: target = 'target_source_schema.table' + 2024-04-12 10:40:42,410 [INFO ] MainThread: options = { + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'mode': 'append', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'format': 'orc', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'partitionBy': 'part_dt', + 2024-04-12 10:40:42,410 [INFO ] MainThread: } + 2024-04-12 10:40:42,411 [INFO ] MainThread: df_schema: + 2024-04-12 10:40:42,412 [INFO ] MainThread: root + 2024-04-12 10:40:42,412 [INFO ] MainThread: |-- id: integer (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- new_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- old_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- updated_at: timestamp (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- part_dt: date (nullable = true) + 2024-04-12 10:40:42,414 [INFO ] MainThread: + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Checking connection availability... + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Using connection parameters: + 2024-04-12 10:40:42,421 [INFO ] MainThread: cluster = 'dwh' + 2024-04-12 10:40:42,475 [INFO ] MainThread: |Hive| Connection is available. + 2024-04-12 10:40:42,476 [INFO ] MainThread: |Hive| Fetching schema of table 'target_source_schema.table' ... + 2024-04-12 10:40:43,518 [INFO ] MainThread: |Hive| Schema fetched. 
+ 2024-04-12 10:40:43,521 [INFO ] MainThread: |Hive| Table 'target_source_schema.table' already exists + 2024-04-12 10:40:43,521 [WARNING ] MainThread: |Hive| User-specified options {'partitionBy': 'part_dt'} are ignored while inserting into existing table. Using only table parameters from Hive metastore + 2024-04-12 10:40:43,782 [INFO ] MainThread: |Hive| Inserting data into existing table 'target_source_schema.table' ... + 2024-04-12 11:06:07,396 [INFO ] MainThread: |Hive| Data is successfully inserted into table 'target_source_schema.table'. + 2024-04-12 11:06:07,397 [INFO ] MainThread: ------------------------------------ DBWriter.run() ends ------------------------------------ + 2024-04-12 11:06:07,397 [INFO ] MainThread: |onETL| Exiting IncrementalStrategy + 2024-04-12 11:06:07,397 [INFO ] MainThread: |IncrementalStrategy| Saving HWM to 'HorizonHWMStore': + 2024-04-12 11:06:07,397 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 11:06:07,397 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 11:06:07,397 [INFO ] MainThread: entity = 'source_source_schema.table', + 2024-04-12 11:06:07,397 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 11:06:07,397 [INFO ] MainThread: value = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000), + 2024-04-12 11:06:07,397 [INFO ] MainThread: ) + 2024-04-12 11:06:07,495 [INFO ] MainThread: |IncrementalStrategy| HWM has been saved + ``` + +Each step performed by onETL is extensively logged, which should help with debugging. + +You can make logs even more verbose by changing level to `DEBUG`: + +```python +from onetl.log import setup_logging + +setup_logging(level="DEBUG", enable_clients=True) + +# rest of code +... +``` + +This also changes log level for all underlying Python libraries, e.g. showing each HTTP request being made, and so on. 
+ +::: onetl.log + options: + members: + - setup_logging + - setup_clients_logging + - set_default_logging_format diff --git a/mkdocs/docs/en/plugins.md b/mkdocs/docs/en/plugins.md new file mode 100644 index 000000000..8f92ecd6f --- /dev/null +++ b/mkdocs/docs/en/plugins.md @@ -0,0 +1,144 @@ +# Plugins + +:octicons-versions-16: **version added 0.6.0** + + +## What are plugins? + +### Terms + +- `Plugin` - some Python package which implements some extra functionality for onETL, like [hooks][hooks] +- `Plugin autoimport` - onETL behavior which allows to automatically import this package if it contains proper metadata (`entry_points`) + +### Features + +Plugins mechanism allows to: + +- Automatically register [hooks][hooks] which can alter onETL behavior +- Automatically register new classes, like HWM type, HWM stores and so on + +### Limitations + +Unlike other projects (like *Airflow 1.x*), plugins does not inject imported classes or functions to `onetl.*` namespace. +Users should import classes from the plugin package **explicitly** to avoid name collisions. + +## How to implement plugin? + +Create a Python package `some-plugin` with a file `some_plugin/setup.py`: + +```python +# some_plugin/setup.py +from setuptools import setup + +setup( + # if you want to import something from onETL, add it to requirements list + install_requires=["onetl"], + entry_points={ + # this key enables plugins autoimport functionality + "onetl.plugins": [ + "some-plugin-name=some_plugin.module", # automatically import all module content + "some-plugin-class=some_plugin.module.internals:MyClass", # import a specific class + "some-plugin-function=some_plugin.module.internals:my_function", # import a specific function + ], + }, +) +``` + +See [setuptools documentation for entry_points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) + +## How plugins are imported? 
+ +- User should install a package implementing the plugin: + +```bash +pip install some-package +``` + +- Then user should import something from `onetl` module or its submodules: + +```python +import onetl +from onetl.connection import SomeConnection + +# and so on +``` + +- This import automatically executes something like: + +```python +import some_plugin.module +from some_plugin.module.internals import MyClass +from some_plugin.module.internals import my_function +``` + +If specific module/class/function uses some registration capabilities of onETL, +like [`@hook` decorator][hook-decorator], it will be executed during this import. + +## How to enable/disable plugins? + +:octicons-versions-16: **version added 0.7.0** + +### Disable/enable all plugins + +By default plugins are enabled. + +To disabled them, you can set environment variable `ONETL_PLUGINS_ENABLED` to `false` BEFORE +importing onETL. This will disable all plugins autoimport. + +But user is still be able to explicitly import `some_plugin.module`, executing +all decorators and registration capabilities of onETL. + +### Disable a specific plugin (blacklist) + +If some plugin is failing during import, you can disable it by setting up environment variable +`ONETL_PLUGINS_BLACKLIST=some-failing-plugin`. Multiple plugin names could be passed with `,` as delimiter. + +Again, this environment variable should be set BEFORE importing onETL. + +### Disable all plugins except a specific one (whitelist) + +You can also disable all plugins except a specific one by setting up environment variable +`ONETL_PLUGINS_WHITELIST=some-not-failing-plugin`. Multiple plugin names could be passed with `,` as delimiter. + +Again, this environment variable should be set BEFORE importing onETL. + +If both whitelist and blacklist environment variables are set, blacklist has a higher priority. + +## How to see logs of the plugins mechanism? 
+ +Plugins registration emits logs with `INFO` level: + +```python +import logging + +logging.basicConfig(level=logging.INFO) +``` + +```text +INFO |onETL| Found 2 plugins +INFO |onETL| Loading plugin 'my-plugin' +INFO |onETL| Skipping plugin 'failing' because it is in a blacklist +``` + +More detailed logs are emitted with `DEBUG` level, to make output less verbose: + +```python +import logging + +logging.basicConfig(level=logging.DEBUG) +``` + +```text +DEBUG |onETL| Searching for plugins with group 'onetl.plugins' +DEBUG |Plugins| Plugins whitelist: [] +DEBUG |Plugins| Plugins blacklist: ['failing-plugin'] +INFO |Plugins| Found 2 plugins +INFO |onETL| Loading plugin (1/2): +DEBUG name: 'my-plugin' +DEBUG package: 'my-package' +DEBUG version: '0.1.0' +DEBUG importing: 'my_package.my_module:MyClass' +DEBUG |onETL| Successfully loaded plugin 'my-plugin' +DEBUG source: '/usr/lib/python3.11/site-packages/my_package/my_module/my_class.py' +INFO |onETL| Skipping plugin 'failing' because it is in a blacklist +``` diff --git a/mkdocs/docs/en/quickstart.md b/mkdocs/docs/en/quickstart.md new file mode 100644 index 000000000..17bd8cfc8 --- /dev/null +++ b/mkdocs/docs/en/quickstart.md @@ -0,0 +1,540 @@ +# onETL + +{{ repo_status_badge }} +{{ pypi_release_bage }} +{{ pypi_license_bage }} +{{ pypi_pyversion_bage }} +{{ pypi_downloads_bage }} + +{{ docs_status_badge }} +{{ ci_status_badge }} +{{ precommit_badge }} + + +{{ onetl_logo_wide }} + +----8<---- +../mddocs/docs/en/snippet_0.md +----8<---- + + + +## Documentation + +See at [ReadTheDocs](https://onetl.readthedocs.io/en/latest/) + +## How to install + + + +### Minimal installation + + + +Base `onetl` package contains: + +- `DBReader`, `DBWriter` and related classes +- `FileDownloader`, `FileUploader`, `FileMover` and related classes, like file filters & limits +- `FileDFReader`, `FileDFWriter` and related classes, like file formats +- Read Strategies & HWM classes +- Plugins support + +It can be installed via: + +```bash 
+pip install onetl +``` + +!!! warning + + This method does NOT include any connections. + + This method is recommended for use in third-party libraries which require for `onetl` to be installed, + but do not use its connection classes. + + +### With DB and FileDF connections + + + +All DB connection classes (`Clickhouse`, `Greenplum`, `Hive` and others) +and all FileDF connection classes (`SparkHDFS`, `SparkLocalFS`, `SparkS3`) +require Spark to be installed. + + + +Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: + +```bash +yum install java-1.8.0-openjdk-devel # CentOS 7 | Spark 2 +dnf install java-11-openjdk-devel # CentOS 8 | Spark 3 +apt-get install openjdk-11-jdk # Debian-based | Spark 3 +``` + + + +#### Compatibility matrix + +| Spark | Python | Java | Scala | +| --------------------------------------------------------- | ---------- | ---------- | ----- | +| [2.3.x](https://spark.apache.org/docs/2.3.1/#downloading) | 3.7 only | 8 only | 2.11 | +| [2.4.x](https://spark.apache.org/docs/2.4.8/#downloading) | 3.7 only | 8 only | 2.11 | +| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | +| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | + + + +Then you should install PySpark via passing `spark` to `extras`: + +```bash +pip install onetl[spark] # install latest PySpark +``` + +or install PySpark explicitly: + +```bash +pip install onetl pyspark==3.5.5 # install a specific PySpark version +``` + +or inject PySpark to `sys.path` in some other way BEFORE creating a class instance. 
+**Otherwise connection object cannot be created.** + +### With File connections + + + +All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on) requires specific Python clients to be installed. + +Each client can be installed explicitly by passing connector name (in lowercase) to `extras`: + +```bash +pip install onetl[ftp] # specific connector +pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors +``` + +To install all file connectors at once you can pass `files` to `extras`: + +```bash +pip install onetl[files] +``` + +**Otherwise class import will fail.** + +### With Kerberos support + + + +Most of Hadoop instances set up with Kerberos support, +so some connections require additional setup to work properly. + +- `HDFS` + Uses [requests-kerberos](https://pypi.org/project/requests-kerberos/) and + [GSSApi](https://pypi.org/project/gssapi/) for authentication. + It also uses `kinit` executable to generate Kerberos ticket. +- `Hive` and `SparkHDFS` + require Kerberos ticket to exist before creating Spark session. + +So you need to install OS packages with: + +- `krb5` libs +- Headers for `krb5` +- `gcc` or other compiler for C sources + +The exact installation instruction depends on your OS, here are some examples: + +```bash +apt install libkrb5-dev krb5-user gcc # Debian-based +dnf install krb5-devel krb5-libs krb5-workstation gcc # CentOS, OracleLinux +``` + +Also you should pass `kerberos` to `extras` to install required Python packages: + +```bash +pip install onetl[kerberos] +``` + +### Full bundle + + + +To install all connectors and dependencies, you can pass `all` into `extras`: + +```bash +pip install onetl[all] + +# this is just the same as +pip install onetl[spark,files,kerberos] +``` + +!!! warning + + This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. 
+ + + + +## Quick start + +### MSSQL → Hive + +Read data from MSSQL, transform & write to Hive. + +```bash +# install onETL and PySpark +pip install onetl[spark] +``` + +```python +# Import pyspark to initialize the SparkSession +from pyspark.sql import SparkSession + +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import MSSQL, Hive + +# Import onETL classes to read & write data +from onetl.db import DBReader, DBWriter + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize new SparkSession with MSSQL driver loaded +maven_packages = MSSQL.get_packages() +spark = ( + SparkSession.builder.appName("spark_app_onetl_demo") + .config("spark.jars.packages", ",".join(maven_packages)) + .enableHiveSupport() # for Hive + .getOrCreate() +) + +# Initialize MSSQL connection and check if database is accessible +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, + # These options are passed to MSSQL JDBC Driver: + extra={"applicationIntent": "ReadOnly"}, +).check() + +# >>> INFO:|MSSQL| Connection is available + +# Initialize DBReader +reader = DBReader( + connection=mssql, + source="dbo.demo_table", + columns=["on", "etl"], + # Set some MSSQL read options: + options=MSSQL.ReadOptions(fetchsize=10000), +) + +# checks that there is data in the table, otherwise raises exception +reader.raise_if_no_data() + +# Read data to DataFrame +df = reader.run() +df.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) + +# Apply any PySpark transformations +from pyspark.sql.functions import lit + +df_to_write = df.withColumn("engine", lit("onetl")) 
+df_to_write.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) +# |-- engine: string (nullable = false) + +# Initialize Hive connection +hive = Hive(cluster="rnd-dwh", spark=spark) + +# Initialize DBWriter +db_writer = DBWriter( + connection=hive, + target="dl_sb.demo_table", + # Set some Hive write options: + options=Hive.WriteOptions(if_exists="replace_entire_table"), +) + +# Write data from DataFrame to Hive +db_writer.run(df_to_write) + +# Success! +``` + +### SFTP → HDFS + +Download files from SFTP & upload them to HDFS. + +```bash +# install onETL with SFTP and HDFS clients, and Kerberos support +pip install onetl[hdfs,sftp,kerberos] +``` + +```python +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import SFTP, HDFS + +# Import onETL classes to download & upload files +from onetl.file import FileDownloader, FileUploader + +# import filter & limit classes +from onetl.file.filter import Glob, ExcludeDir +from onetl.file.limit import MaxFilesCount + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize SFTP connection and check it +sftp = SFTP( + host="sftp.test.com", + user="someuser", + password="somepassword", +).check() + +# >>> INFO:|SFTP| Connection is available + +# Initialize downloader +file_downloader = FileDownloader( + connection=sftp, + source_path="/remote/tests/Report", # path on SFTP + local_path="/local/onetl/Report", # local fs path + filters=[ + # download only files matching the glob + Glob("*.csv"), + # exclude files from this directory + ExcludeDir("/remote/tests/Report/exclude_dir/"), + ], + limits=[ + # download max 1000 files per run + MaxFilesCount(1000), + ], + 
options=FileDownloader.Options( + # delete files from SFTP after successful download + delete_source=True, + # mark file as failed if it already exist in local_path + if_exists="error", + ), +) + +# Download files to local filesystem +download_result = downloader.run() + +# Method run returns a DownloadResult object, +# which contains collection of downloaded files, divided to 4 categories +download_result + +# DownloadResult( +# successful=[ +# LocalPath('/local/onetl/Report/file_1.json'), +# LocalPath('/local/onetl/Report/file_2.json'), +# ], +# failed=[FailedRemoteFile('/remote/onetl/Report/file_3.json')], +# ignored=[RemoteFile('/remote/onetl/Report/file_4.json')], +# missing=[], +# ) + +# Raise exception if there are failed files, or there were no files in the remote filesystem +download_result.raise_if_failed() or download_result.raise_if_empty() + +# Do any kind of magic with files: rename files, remove header for csv files, ... +renamed_files = my_rename_function(download_result.success) + +# function removed "_" from file names +# [ +# LocalPath('/home/onetl/Report/file1.json'), +# LocalPath('/home/onetl/Report/file2.json'), +# ] + +# Initialize HDFS connection +hdfs = HDFS( + host="my.name.node", + user="someuser", + password="somepassword", # or keytab +) + +# Initialize uploader +file_uploader = FileUploader( + connection=hdfs, + target_path="/user/onetl/Report/", # hdfs path +) + +# Upload files from local fs to HDFS +upload_result = file_uploader.run(renamed_files) + +# Method run returns a UploadResult object, +# which contains collection of uploaded files, divided to 4 categories +upload_result + +# UploadResult( +# successful=[RemoteFile('/user/onetl/Report/file1.json')], +# failed=[FailedLocalFile('/local/onetl/Report/file2.json')], +# ignored=[], +# missing=[], +# ) + +# Raise exception if there are failed files, or there were no files in the local filesystem, or some input file is missing +upload_result.raise_if_failed() or 
upload_result.raise_if_empty() or upload_result.raise_if_missing() + +# Success! +``` + +### S3 → Postgres + +Read files directly from S3 path, convert them to dataframe, transform it and then write to a database. + +```bash +# install onETL and PySpark +pip install onetl[spark] +``` + +```python +# Import pyspark to initialize the SparkSession +from pyspark.sql import SparkSession + +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import Postgres, SparkS3 + +# Import onETL classes to read files +from onetl.file import FileDFReader +from onetl.file.format import CSV + +# Import onETL classes to write data +from onetl.db import DBWriter + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded +maven_packages = SparkS3.get_packages(spark_version="3.5.5") | Postgres.get_packages() +exclude_packages = SparkS3.get_exclude_packages() +spark = ( + SparkSession.builder.appName("spark_app_onetl_demo") + .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(exclude_packages)) + .getOrCreate() +) + +# Initialize S3 connection and check it +spark_s3 = SparkS3( + host="s3.test.com", + protocol="https", + bucket="my-bucket", + access_key="somekey", + secret_key="somesecret", + # Access bucket as s3.test.com/my-bucket + extra={"path.style.access": True}, + spark=spark, +).check() + +# >>> INFO:|SparkS3| Connection is available + +# Describe file format and parsing options +csv = CSV( + delimiter=";", + header=True, + encoding="utf-8", +) + +# Describe DataFrame schema of files +from pyspark.sql.types import ( + DateType, + DoubleType, + IntegerType, + StringType, + StructField, + StructType, + TimestampType, +) + +df_schema = StructType( + [ + StructField("id", IntegerType()), + StructField("phone_number", StringType()), + 
StructField("region", StringType()), + StructField("birth_date", DateType()), + StructField("registered_at", TimestampType()), + StructField("account_balance", DoubleType()), + ], +) + +# Initialize file df reader +reader = FileDFReader( + connection=spark_s3, + source_path="/remote/tests/Report", # path on S3 there *.csv files are located + format=csv, # file format with specific parsing options + df_schema=df_schema, # columns & types +) + +# Read files directly from S3 as Spark DataFrame +df = reader.run() + +# Check that DataFrame schema is same as expected +df.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) + +# Apply any PySpark transformations +from pyspark.sql.functions import lit + +df_to_write = df.withColumn("engine", lit("onetl")) +df_to_write.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) +# |-- engine: string (nullable = false) + +# Initialize Postgres connection +postgres = Postgres( + host="192.169.11.23", + user="onetl", + password="somepassword", + database="mydb", + spark=spark, +) + +# Initialize DBWriter +db_writer = DBWriter( + connection=postgres, + # write to specific table + target="public.my_table", + # with some writing options + options=Postgres.WriteOptions(if_exists="append"), +) + +# Write DataFrame to Postgres table +db_writer.run(df_to_write) + +# Success! 
+```
diff --git a/mkdocs/docs/en/security.md b/mkdocs/docs/en/security.md
new file mode 100644
index 000000000..3048c040b
--- /dev/null
+++ b/mkdocs/docs/en/security.md
@@ -0,0 +1,25 @@
+# Security
+
+## Supported Python versions
+
+3.7 or above
+
+## Product development security recommendations
+
+1. Update dependencies to the latest stable version
+2. Build SBOM for the project
+3. Perform SAST (Static Application Security Testing) where possible
+
+## Product development security requirements
+
+1. No binaries in repository
+2. No passwords, keys, access tokens in source code
+3. No “Critical” and/or “High” vulnerabilities in contributed source code
+
+## Vulnerability reports
+
+Please use the email [onetools@mts.ru](mailto:onetools@mts.ru) for reporting security issues or anything that can have security consequences.
+
+Please avoid any public disclosure (including registering issues) at least until it is fixed.
+
+Thank you in advance for understanding.
diff --git a/mkdocs/docs/en/snippet_0.md b/mkdocs/docs/en/snippet_0.md
new file mode 100644
index 000000000..f91e2c15a
--- /dev/null
+++ b/mkdocs/docs/en/snippet_0.md
@@ -0,0 +1,44 @@
+## What is onETL?
+
+Python ETL/ELT library powered by [Apache Spark](https://spark.apache.org/) & other open-source tools.
+
+## Goals
+
+- Provide unified classes to extract data from (**E**) & load data to (**L**) various stores.
+- Provide [Spark DataFrame API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) for performing transformations (**T**) in terms of *ETL*.
+- Provide direct access to the database, allowing you to execute SQL queries, as well as DDL, DML, and call functions/procedures. This can be used for building up *ELT* pipelines.
+- Support different [read strategies][strategy] for incremental and batch data fetching.
+- Provide [hooks][hooks] & [plugins][plugins] mechanism for altering behavior of internal classes.
+ +## Non-goals + +- onETL is not a Spark replacement. It just provides additional functionality that Spark does not have, and improves UX for end users. +- onETL is not a framework, as it does not have requirements to project structure, naming, the way of running ETL/ELT processes, configuration, etc. All of that should be implemented in some other tool. +- onETL is deliberately developed without any integration with scheduling software like Apache Airflow. All integrations should be implemented as separated tools. +- Only batch operations, no streaming. For streaming prefer [Apache Flink](https://flink.apache.org/). + +## Requirements + +- **Python** 3.7 - 3.13 +- PySpark 2.3.x - 3.5.x (depends on used connector) +- Java 8+ (required by Spark, see below) +- Kerberos libs & GCC (required by `Hive`, `HDFS` and `SparkHDFS` connectors) + +## Supported storages + + +| Type | Storage | Powered by | +|--------------------|--------------|-------------------------------------------------------------------------------------------------------------------------| +| Database {: rowspan=5} | Clickhouse
MSSQL
MySQL
Postgres
Oracle
Teradata |

Apache Spark [JDBC Data Source](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html) | +| Hive | Apache Spark [Hive integration](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html) | +| Kafka | Apache Spark [Kafka integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) | +| Greenplum | VMware [Greenplum Spark connector](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/index.html) | +| MongoDB | [MongoDB Spark connector](https://www.mongodb.com/docs/spark-connector/current) | +| File {: rowspan=6} | HDFS | [HDFS Python client](https://pypi.org/project/hdfs/) | +| S3 | [minio-py client](https://pypi.org/project/minio/) | +| SFTP | [Paramiko library](https://pypi.org/project/paramiko/) | +| FTP
FTPS | [FTPUtil library](https://pypi.org/project/ftputil/) | +| WebDAV | [WebdavClient3 library](https://pypi.org/project/webdavclient3/) | +| Samba | [pysmb library](https://pypi.org/project/pysmb/) | +| Files as DataFrame {: rowspan=2} | SparkLocalFS
SparkHDFS | Apache Spark [File Data Source](https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html) | +| SparkS3 | [Hadoop AWS](https://hadoop.apache.org/docs/current3/hadoop-aws/tools/hadoop-aws/index.html) library | diff --git a/mkdocs/docs/en/strategy/incremental_batch_strategy.md b/mkdocs/docs/en/strategy/incremental_batch_strategy.md new file mode 100644 index 000000000..8884066a2 --- /dev/null +++ b/mkdocs/docs/en/strategy/incremental_batch_strategy.md @@ -0,0 +1,12 @@ +(incremental-batch-strategy)= + +# Incremental Batch Strategy + +```{eval-rst} +.. currentmodule:: onetl.strategy.incremental_strategy +``` + +```{eval-rst} +.. autoclass:: IncrementalBatchStrategy + :members: __init__ +``` diff --git a/mkdocs/docs/en/strategy/incremental_strategy.md b/mkdocs/docs/en/strategy/incremental_strategy.md new file mode 100644 index 000000000..35c4fff36 --- /dev/null +++ b/mkdocs/docs/en/strategy/incremental_strategy.md @@ -0,0 +1,12 @@ +(incremental-strategy)= + +# Incremental Strategy + +```{eval-rst} +.. currentmodule:: onetl.strategy.incremental_strategy +``` + +```{eval-rst} +.. autoclass:: IncrementalStrategy + :members: __init__ +``` diff --git a/mkdocs/docs/en/strategy/index.md b/mkdocs/docs/en/strategy/index.md new file mode 100644 index 000000000..827b13d14 --- /dev/null +++ b/mkdocs/docs/en/strategy/index.md @@ -0,0 +1,19 @@ +# Read Strategies { #strategy data-toc-label='strategy' } + +```{toctree} +:caption: Read Strategies +:hidden: true +:maxdepth: 3 + +snapshot_strategy +incremental_strategy +snapshot_batch_strategy +incremental_batch_strategy +``` + +onETL have several builtin strategies for reading data: + +1. {doc}`snapshot_strategy` +2. {doc}`incremental_strategy` +3. {doc}`snapshot_batch_strategy` +4. 
{doc}`incremental_batch_strategy`
diff --git a/mkdocs/docs/en/strategy/snapshot_batch_strategy.md b/mkdocs/docs/en/strategy/snapshot_batch_strategy.md
new file mode 100644
index 000000000..dcff0670d
--- /dev/null
+++ b/mkdocs/docs/en/strategy/snapshot_batch_strategy.md
@@ -0,0 +1,12 @@
+(snapshot-batch-strategy)=
+
+# Snapshot Batch Strategy
+
+```{eval-rst}
+.. currentmodule:: onetl.strategy.snapshot_strategy
+```
+
+```{eval-rst}
+.. autoclass:: SnapshotBatchStrategy
+    :members: __init__
+```
diff --git a/mkdocs/docs/en/strategy/snapshot_strategy.md b/mkdocs/docs/en/strategy/snapshot_strategy.md
new file mode 100644
index 000000000..e5d973831
--- /dev/null
+++ b/mkdocs/docs/en/strategy/snapshot_strategy.md
@@ -0,0 +1,12 @@
+(snapshot-strategy)=
+
+# Snapshot Strategy
+
+```{eval-rst}
+.. currentmodule:: onetl.strategy.snapshot_strategy
+```
+
+```{eval-rst}
+.. autoclass:: SnapshotStrategy
+    :members: __init__
+```
diff --git a/mkdocs/docs/en/troubleshooting/index.md b/mkdocs/docs/en/troubleshooting/index.md
new file mode 100644
index 000000000..de93ac11e
--- /dev/null
+++ b/mkdocs/docs/en/troubleshooting/index.md
@@ -0,0 +1,25 @@
+(troubleshooting)=
+
+# Troubleshooting
+
+In case of error please follow instructions below:
+
+- Read the logs or exception messages you are facing.
+  : - If Python logs are not verbose enough, {ref}`increase the log level `.
+    - If Spark logs are not verbose enough, {ref}`increase the log level `.
+- Read documentation related to a class or method you are using.
+- [Google](https://google.com) the error message, and carefully read the search result:
+  : - [StackOverflow](https://stackoverflow.com/) answers.
+    - [Spark](https://spark.apache.org/docs/latest/) documentation.
+    - Documentation of database or filesystem you are connecting to.
+    - Documentation of underlying connector.
+- Search for known [issues](https://github.com/MobileTeleSystems/onetl/issues), or create a new one.
+- Always use the most recent versions of onETL, PySpark and connector packages, {ref}`compatible with your environment `.
+
+```{toctree}
+:caption: Troubleshooting
+:hidden: true
+:maxdepth: 3
+
+spark
+```
diff --git a/mkdocs/docs/en/troubleshooting/spark.md b/mkdocs/docs/en/troubleshooting/spark.md
new file mode 100644
index 000000000..a71d8b732
--- /dev/null
+++ b/mkdocs/docs/en/troubleshooting/spark.md
@@ -0,0 +1,75 @@
+(troubleshooting-spark)=
+
+# Spark Troubleshooting
+
+## Restarting Spark session
+
+Sometimes it is required to stop current Spark session and start a new one, e.g. to add some .jar packages, or change session config.
+But PySpark not only starts Spark session, but also starts Java virtual machine (JVM) process in the background,
+which keeps running. So calling `sparkSession.stop()` [does not shut down the JVM](https://issues.apache.org/jira/browse/SPARK-47740),
+and this can cause some issues.
+
+Also apart from JVM properties, stopping Spark session does not clear Spark context, which is a global object. So new
+Spark sessions are created using the same context object, and thus using the same Spark config options.
+
+To properly stop Spark session, it is **required** to:
+- Stop Spark session by calling `sparkSession.stop()`.
+- **STOP PYTHON INTERPRETER**, e.g. by calling `sys.exit()`.
+- Start new Python interpreter.
+- Start new Spark session with config options you need.
+
+Skipping some of these steps can lead to issues with creating a new Spark session.
+
+## Driver log level
+
+Default logging level for Spark session is `WARN`. 
To show more verbose logs, use:
+
+```python
+spark.sparkContext.setLogLevel("INFO")
+```
+
+or increase verbosity even more:
+
+```python
+spark.sparkContext.setLogLevel("DEBUG")
+```
+
+After getting all information you need, you can restore the previous log level:
+
+```python
+spark.sparkContext.setLogLevel("WARN")
+```
+
+## Executors log level
+
+`sparkContext.setLogLevel` changes only log level of Spark session on Spark **driver**.
+To make Spark executor logs more verbose, perform the following steps:
+
+- Create `log4j.properties` file with content like this:
+
+  ```jproperties
+  log4j.rootCategory=DEBUG, console
+
+  log4j.appender.console=org.apache.log4j.ConsoleAppender
+  log4j.appender.console.target=System.err
+  log4j.appender.console.layout=org.apache.log4j.PatternLayout
+  log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+  ```
+
+- Stop existing Spark session and create a new one with the following options:
+
+  ```python
+  from pyspark.sql import SparkSession
+
+  spark = (
+      SparkSession.builder.config("spark.files", "file:log4j.properties").config(
+          "spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties"
+      )
+      # you can apply the same logging settings to Spark driver, by uncommenting the line below
+      # .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties")
+      .getOrCreate()
+  )
+  ```
+
+Each Spark executor will receive a copy of `log4j.properties` file during start, and load it to change its own log level.
+Same approach can be used for Spark driver as well, to investigate issues when Spark session cannot properly start.