Commit 0b71383

dlt: Hello, World!

1 parent 2f22f42 commit 0b71383

6 files changed, +218 −0 lines changed

framework/dlt/.dlt/config.toml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# Put your main configuration values here.
#add_dlt_id = false
#add_dlt_load_id = false

[runtime]

# The system log level of dlt.
log_level = "DEBUG"

# Use the `dlthub_telemetry` setting to enable/disable anonymous
# usage data reporting, see https://dlthub.com/docs/reference/telemetry.
dlthub_telemetry = false
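The same runtime settings can also be supplied through dlt's layered configuration instead of `config.toml`. A minimal sketch, assuming dlt's environment-variable provider with upper-cased keys and double-underscore section separators (the exact variable names are an assumption based on that convention):

```python
import os

# Assumption: environment variables take precedence over .dlt/config.toml
# in dlt's configuration resolution; sections are joined with "__".
os.environ["RUNTIME__LOG_LEVEL"] = "WARNING"
os.environ["RUNTIME__DLTHUB_TELEMETRY"] = "false"
```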

framework/dlt/.dlt/secrets.toml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
[destination.cratedb.credentials]
host = "localhost"
port = 5432
username = "crate"
password = ""

[destination.sqlalchemy.credentials]
drivername = "crate"
host = "localhost"
port = 4200
username = "crate"
password = ""
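With these credentials in place, a pipeline can refer to the destination by name and let dlt resolve the connection details from `.dlt/secrets.toml`. A minimal sketch (pipeline, table, and sample data are illustrative, not part of the commit):

```python
import dlt

# Assumption: naming the destination makes dlt look up
# [destination.cratedb.credentials] from .dlt/secrets.toml.
pipeline = dlt.pipeline(
    pipeline_name="from_secrets",
    destination="cratedb",
    dataset_name="doc",
)
load_info = pipeline.run(
    data=[{"id": 1, "name": "hello"}, {"id": 2, "name": "world"}],
    table_name="greetings",
)
print(load_info)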

framework/dlt/.gitignore

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# ignore secrets, virtual environments and typical python compilation artifacts
# remark: secrets.toml is deliberately kept tracked here, in order to provide out-of-the-box settings for localhost
# secrets.toml
# ignore basic python artifacts
.env
**/__pycache__/
**/*.py[cod]
**/*$py.class
# ignore duckdb
*.duckdb
*.wal

framework/dlt/basic.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
# mypy: disable-error-code="no-untyped-def,arg-type"
"""The Intro Pipeline Template contains the example from the docs intro page"""

from typing import Optional
import pandas as pd
import sqlalchemy as sa

import dlt
from dlt.sources.helpers import requests


CRATEDB_ADDRESS = "postgresql://crate:@localhost:5432/"


def load_api_data() -> None:
    """Load data from the chess api, for more complex examples use our rest_api source"""

    # Create a dlt pipeline that will load
    # chess player data to the CrateDB destination
    pipeline = dlt.pipeline(
        pipeline_name="from_api",
        destination=dlt.destinations.cratedb(CRATEDB_ADDRESS),
        dataset_name="doc",
    )

    # Grab some player data from Chess.com API
    data = []
    for player in ["magnuscarlsen", "rpragchess"]:
        response = requests.get(f"https://api.chess.com/pub/player/{player}")
        response.raise_for_status()
        data.append(response.json())

    # Extract, normalize, and load the data
    load_info = pipeline.run(
        data=data,
        table_name="chess_players",
    )
    print(load_info)  # noqa: T201


def load_pandas_data() -> None:
    """Load data from a public csv via pandas"""

    owid_disasters_csv = (
        "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/"
        "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/"
        "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv"
    )
    df = pd.read_csv(owid_disasters_csv)

    pipeline = dlt.pipeline(
        pipeline_name="from_csv",
        destination=dlt.destinations.cratedb(CRATEDB_ADDRESS),
        dataset_name="doc",
    )
    load_info = pipeline.run(
        data=df,
        table_name="natural_disasters",
    )

    print(load_info)  # noqa: T201


def load_sql_data() -> None:
    """Load data from a sql database with sqlalchemy, for more complex examples use our sql_database source"""

    # Use any SQL database supported by SQLAlchemy, below we use a public
    # MySQL instance to get data.
    # NOTE: you'll need to install pymysql with `pip install pymysql`
    # NOTE: loading data from public mysql instance may take several seconds
    engine = sa.create_engine(
"mysql+pymysql://[email protected]:4497/Rfam"
    )

    with engine.connect() as conn:
        # Select genome table, stream data in batches of 100 elements
        query = "SELECT * FROM genome LIMIT 1000"
        rows = conn.execution_options(yield_per=100).exec_driver_sql(query)

        pipeline = dlt.pipeline(
            pipeline_name="from_database",
            destination=dlt.destinations.cratedb(CRATEDB_ADDRESS),
            dataset_name="doc",
        )

        # Convert the rows into dictionaries on the fly with a map function
        load_info = pipeline.run(
            data=map(lambda row: dict(row._mapping), rows),
            table_name="genome",
        )

    print(load_info)  # noqa: T201


@dlt.resource(write_disposition="replace")
def github_api_resource(api_secret_key: Optional[str] = dlt.secrets.value):
    from dlt.sources.helpers.rest_client import paginate
    from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
    from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

    url = "https://api.github.com/repos/dlt-hub/dlt/issues"

    # Github allows both authenticated and non-authenticated requests (with low rate limits)
    auth = BearerTokenAuth(api_secret_key) if api_secret_key else None
    for page in paginate(
        url,
        auth=auth,
        paginator=HeaderLinkPaginator(),
        params={"state": "open", "per_page": "100"},
    ):
        yield page


@dlt.source
def github_api_source(api_secret_key: Optional[str] = dlt.secrets.value):
    return github_api_resource(api_secret_key=api_secret_key)


def load_data_from_source():
    pipeline = dlt.pipeline(
        pipeline_name="github_api_pipeline",
        destination=dlt.destinations.cratedb(CRATEDB_ADDRESS),
        dataset_name="doc",
    )
    load_info = pipeline.run(
        data=github_api_source(),
        table_name="github_api_data",
    )
    print(load_info)  # noqa: T201


if __name__ == "__main__":
    load_api_data()
    load_pandas_data()
    load_sql_data()
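Note that `load_data_from_source()` is defined but not invoked by the `__main__` block above. A minimal sketch for running it separately; because `api_secret_key` is typed `Optional`, an unset token should simply fall back to unauthenticated requests, and the programmatic secrets path shown in the comment is an assumption, not something defined in this commit:

```python
from basic import load_data_from_source

# Assumption: without a configured `api_secret_key` the resource makes
# unauthenticated GitHub requests (subject to low rate limits).
# To authenticate, a token could be supplied before running, e.g. (path assumed):
#   import dlt
#   dlt.secrets["sources.basic.github_api_resource.api_secret_key"] = "<github token>"
load_data_from_source()
```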

framework/dlt/pokemon.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
"""data load tool (dlt) — the open-source Python library for data loading

How to create a data loading pipeline with dlt and CrateDB in 3 seconds:

0. Configure the `cratedb` destination in `.dlt/secrets.toml`.
```toml
[destination.cratedb.credentials]
host = "localhost"
port = 5432
username = "crate"
password = ""
```

1. Write a pipeline script
>>> import dlt
>>> from dlt.sources.helpers import requests
>>> dlt.run(
...     data=requests.get("https://pokeapi.co/api/v2/pokemon/").json()["results"],
...     destination="cratedb",
...     dataset_name="doc",
...     table_name="pokemon")

2. Run your pipeline script
> $ python pokemon.py

3. See and query your data with the autogenerated Streamlit app
> $ dlt pipeline dlt_pokemon show

Or start with our pipeline template with sample PokeAPI (pokeapi.co) data loaded to BigQuery

> $ dlt init pokemon bigquery

For more detailed info, see https://dlthub.com/docs/intro
"""


import dlt
from dlt.sources.helpers import requests
dlt.run(
    data=requests.get("https://pokeapi.co/api/v2/pokemon/").json()["results"],
    destination="cratedb",
    dataset_name="doc",
    table_name="pokemon")
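After the script has run, the loaded rows can also be inspected directly in CrateDB. A minimal sketch, assuming CrateDB's HTTP endpoint on localhost:4200, the `doc` dataset/schema configured above, and the default PokeAPI column names `name` and `url`:

```python
import sqlalchemy as sa

# Assumption: the CrateDB SQLAlchemy dialect is installed and the loaded
# table landed as doc.pokemon with columns "name" and "url".
engine = sa.create_engine("crate://crate@localhost:4200")
with engine.connect() as conn:
    for row in conn.execute(sa.text("SELECT name, url FROM doc.pokemon LIMIT 5")):
        print(row)
```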

framework/dlt/requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# Development
dlt[cratedb] @ git+https://github.com/crate-workbench/dlt@cratedb

# Production
# dlt[cratedb]>=1.12.0
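A quick way to check that the development install from the fork actually provides the `cratedb` destination. A sketch; the attribute lookup simply mirrors how `basic.py` above uses it:

```python
import dlt

# Assumption: the crate-workbench fork exposes a `cratedb` destination factory,
# as used by basic.py in this commit.
print(dlt.__version__)
print(dlt.destinations.cratedb)
```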
