diff --git a/.ruby-version b/.ruby-version index a4dd9dba..47b322c9 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.7.4 +3.4.1 diff --git a/Gemfile b/Gemfile index 3ff3ae06..b7aab331 100644 --- a/Gemfile +++ b/Gemfile @@ -28,4 +28,12 @@ gem "webrick" gem 'jekyll-redirect-from' -gem 'jekyll-sitemap' \ No newline at end of file +gem 'jekyll-sitemap' + +gem 'csv' + +gem 'logger' + +gem 'base64' + +gem 'bigdecimal' diff --git a/Gemfile.lock b/Gemfile.lock index 4caa4632..95a4f588 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,20 +1,33 @@ GEM remote: https://rubygems.org/ specs: - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) + base64 (0.2.0) + bigdecimal (3.1.9) colorator (1.1.0) - concurrent-ruby (1.1.9) + concurrent-ruby (1.3.5) + csv (3.3.2) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) eventmachine (1.2.7) - ffi (1.15.5) + ffi (1.17.1) + ffi (1.17.1-aarch64-linux-gnu) + ffi (1.17.1-aarch64-linux-musl) + ffi (1.17.1-arm-linux-gnu) + ffi (1.17.1-arm-linux-musl) + ffi (1.17.1-arm64-darwin) + ffi (1.17.1-x86-linux-gnu) + ffi (1.17.1-x86-linux-musl) + ffi (1.17.1-x86_64-darwin) + ffi (1.17.1-x86_64-linux-gnu) + ffi (1.17.1-x86_64-linux-musl) forwardable-extended (2.6.0) http_parser.rb (0.8.0) - i18n (1.8.11) + i18n (1.14.7) concurrent-ruby (~> 1.0) - jekyll (4.2.1) + jekyll (4.2.2) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) @@ -29,67 +42,80 @@ GEM rouge (~> 3.0) safe_yaml (~> 1.0) terminal-table (~> 2.0) - jekyll-feed (0.16.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) - jekyll-last-modified-at (1.3.0) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-last-modified-at (1.3.2) jekyll (>= 3.7, < 5.0) - posix-spawn (~> 0.3.9) jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) - jekyll-sass-converter (2.1.0) + jekyll-sass-converter (2.2.0) sassc (> 2.0.1, < 3.0) - jekyll-seo-tag (2.7.1) + jekyll-seo-tag (2.8.0) jekyll (>= 3.8, < 5.0) jekyll-sitemap (1.4.0) jekyll (>= 3.7, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - just-the-docs (0.3.3) + just-the-docs (0.10.1) jekyll (>= 3.8.5) - jekyll-seo-tag (~> 2.0) - rake (>= 12.3.1, < 13.1.0) - kramdown (2.3.1) - rexml + jekyll-include-cache + jekyll-seo-tag (>= 2.0) + rake (>= 12.3.1) + kramdown (2.5.1) + rexml (>= 3.3.9) kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.4) - listen (3.7.0) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) + logger (1.6.6) mercenary (0.4.0) pathutil (0.16.2) forwardable-extended (~> 2.6) - posix-spawn (0.3.15) - public_suffix (4.0.6) - rake (13.0.6) - rb-fsevent (0.11.0) - rb-inotify (0.10.1) + public_suffix (6.0.1) + rake (13.2.1) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.6) - strscan - rouge (3.27.0) + rexml (3.4.1) + rouge (3.30.0) safe_yaml (1.0.5) sassc (2.4.0) ffi (~> 1.9) - strscan (3.1.0) terminal-table (2.0.0) unicode-display_width (~> 1.1, >= 1.1.1) unicode-display_width (1.8.0) - webrick (1.8.2) + webrick (1.9.1) PLATFORMS + aarch64-linux-gnu + aarch64-linux-musl + arm-linux-gnu + arm-linux-musl + arm64-darwin ruby - x86_64-darwin-16 + x86-linux-gnu + x86-linux-musl + x86_64-darwin + x86_64-linux-gnu + x86_64-linux-musl DEPENDENCIES + base64 + bigdecimal + csv jekyll (~> 4.2.0) jekyll-feed (~> 0.12) jekyll-last-modified-at jekyll-redirect-from jekyll-sitemap just-the-docs + logger wdm (~> 0.1.1) webrick BUNDLED WITH - 2.2.27 + 2.6.4 diff --git a/_config.yml b/_config.yml index 4e14fa9f..f9adf93f 100644 --- 
a/_config.yml +++ b/_config.yml @@ -21,6 +21,10 @@ plugins: - jekyll-last-modified-at - jekyll-sitemap +just_the_docs: + collections: + navigation: true + logo: /assets/images/soda-logo.svg color_scheme: soda diff --git a/_data/nav.yml b/_data/nav.yml index 7c1a250d..dd7ef9d5 100644 --- a/_data/nav.yml +++ b/_data/nav.yml @@ -1,324 +1,108 @@ + - title: Home page: index.html -- title: Take a sip of Soda - page: soda/quick-start-sip.md - -- title: Get started - page: soda/get-started-roadmap.md +- title: Soda overview + page: soda/overview.md + +- title: Data testing + page: data-testing/data-testing.md subcategories: - - subtitle: Get started roadmap - page: soda/get-started-roadmap.md - - subtitle: Choose a flavor of Soda - page: soda/setup-guide.md - - subtitle: Install Soda Library - page: soda-library/install.md - - subtitle: Set up a Soda-hosted agent - page: soda-agent/managed-agent.md - - subtitle: Deploy a Soda Agent - page: soda-agent/deploy.md - - subtitle: Soda Agent extras - page: soda-agent/extras.md - - subtitle: Invoke Soda Library - page: soda-library/programmatic.md - - subtitle: Upgrade, redeploy, uninstall - page: soda/upgrade.md - -- title: Use case guides - page: soda/use-case-guides.md - subcategories: - - subtitle: Use case guides - page: soda/use-case-guides.md - - subtitle: Test data in Airflow - page: soda/quick-start-prod.md - - subtitle: Test data in ADF - page: soda/quick-start-adf.md - - subtitle: Test data in Dagster - page: soda/quick-start-dagster.md - - subtitle: Test data in Databricks - page: soda/quick-start-databricks-pipeline.md - - subtitle: Test before data migration - page: soda/quick-start-migration.md - - subtitle: Test data during development - page: soda/quick-start-dev.md - - subtitle: Self-serve Soda - page: soda/quick-start-end-user.md - - subtitle: Automate anomaly detection - page: soda/quick-start-automate.md - - subtitle: "How to: Build a Sigma dashboard" - page: api-docs/reporting-api-to-overview-dashboards.md - - subtitle: "How to: Build a Grafana dashboard" - page: api-docs/public-api-to-grafana.md - - subtitle: "How to: Invoke Soda in Databricks" - page: soda/quick-start-databricks.md - - subtitle: "How to: Add a Secrets Manager" - page: soda/quick-start-secrets.md - - subtitle: "How to: Generate API Keys" - page: soda-cloud/api-keys.md - - subtitle: "How to: Manage sensitive data" - page: soda/sensitive-data.md - - subtitle: "How to: Reroute failed rows" - page: soda/route-failed-rows.md - - subtitle: "How to: Double-onboard a data source" - page: soda-cloud/double-onboard-datasource.md - -- title: Write SodaCL checks - page: soda-cl/soda-cl-overview.md + - subtitle: Introduction + page: data-testing/introduction.md + subcategories: + - subtitle: What is a data contract + page: data-testing/what-is-data-contract.md + - subtitle: Data contract operating model + page: data-testing/data-contract-operating-model.md + - subtitle: Verify a first contract + page: data-testing/verify-first-contract.md + - subtitle: Get started with data testing + page: get-started/get-started-data-testing.md + - subtitle: Verify a contract in your pipeline + page: data-testing/verify-contract-in-pipeline.md + subcategories: + - subtitle: With the library + page: data-testing/with-library.md + - subtitle: With the agent + page: data-testing/with-agent.md + - subtitle: Send results to Soda Cloud + page: data-testing/send-results-to-soda-cloud.md + - subtitle: Verify a contract on a time schedule + page: data-testing/verify-contract-on-time-schedule.md + - 
subtitle: Write data contracts + page: data-testing/write-data-contracts.md + subcategories: + - subtitle: Create a data contract + page: data-testing/create-data-contract.md + - subtitle: Check missing values + page: data-testing/check-missing-values.md + - subtitle: Check validity + page: data-testing/check-validity.md + - subtitle: Configure notifications + page: data-testing/configure-notifications.md + - subtitle: Failed rows + page: data-testing/failed-rows.md + +- title: Observability + page: observability/observability.md subcategories: - - subtitle: Write SodaCL checks - page: soda-cl/soda-cl-overview.md - - subtitle: SodaCL tutorial - page: soda/quick-start-sodacl.md - - subtitle: Write checks with Ask AI - page: soda-cloud/ask-ai.md - - subtitle: Adopt check suggestions - page: soda-library/check-suggestions.md - - subtitle: Add automated monitoring - page: soda-cl/automated-monitoring.md - - subtitle: Profile data - page: soda-cl/profile.md - - subtitle: Sample data - page: soda-cl/sample-datasets.md - - subtitle: Compare data using SodaCL - page: soda-cl/compare.md - - subtitle: Custom check examples - page: soda-cl/custom-check-examples.md + - subtitle: Quickstart + page: observability/quickstart.md + - subtitle: Observability Guide + page: get-started/get-started-observability.md + subcategories: + - subtitle: Prerequisites + page: observability/prerequisites.md + - subtitle: Metric Monitors + page: observability/metric-monitors.md + subcategories: + - subtitle: Adjust sensitivity + page: observability/adjust-sensitivity.md + - subtitle: Define exclusion values + page: observability/define-exclusion-values.md + - subtitle: Give feedback to improve detection + page: observability/give-feedback.md + - subtitle: Manage incidents + page: observability/manage-incidents.md + - subtitle: Set up alerts + page: observability/set-up-alerts.md + - subtitle: Update settings + page: observability/update-settings.md -- title: Run scans and view results - page: soda-library/run-a-scan.md - subcategories: - - subtitle: Run a scan and view results - page: soda-library/run-a-scan.md - - subtitle: Manage failed rows samples - page: soda-cl/failed-row-samples.md - - subtitle: Manage scheduled scans - page: soda-cloud/scan-mgmt.md - - subtitle: Configure orchestrated scans - page: soda-library/orchestrate-scans.md - - subtitle: Python API reference - page: soda-library/python_api.md -- title: Organize, alert, investigate - page: soda-cloud/collaborate.md - subcategories: - - subtitle: Organize, alert, investigate - page: soda-cloud/collaborate.md - - subtitle: Activate anomaly dashboards - page: soda-cloud/anomaly-dashboard.md - - subtitle: Add check attributes - page: soda-cl/check-attributes.md - - subtitle: Set notification rules - page: soda-cloud/notif-rules.md - - subtitle: Organize datasets - page: soda-cloud/organize-datasets.md - - subtitle: Create and track incidents - page: soda-cloud/incidents.md - - subtitle: Manage global roles and user groups - page: soda-cloud/roles-global.md - - subtitle: Manage dataset roles - page: soda-cloud/roles-dataset.md -- title: Integrate Soda - page: soda/integrate-alation.md +- title: Soda Cloud + page: soda-administration/soda-cloud.md subcategories: - - subtitle: Integrate with Alation - page: soda/integrate-alation.md - - subtitle: Integrate with Atlan + - subtitle: Managing authorization + page: soda-administration/managing-authorization.md + subcategories: + - subtitle: SSO + page: soda-administration/sso.md + - subtitle: User groups + page: 
soda-administration/user-groupd.md + - subtitle: Permissions + page: soda-administration/permissions.md + - subtitle: Integrate with Soda + page: soda-administration/integrate-with-soda.md + subcategories: + - subtitle: Atlan page: soda/integrate-atlan.md - - subtitle: Integrate with dbt - page: soda/integrate-dbt.md - - subtitle: Integrate with GitHub - page: soda/integrate-github.md - - subtitle: Integrate with Jira - page: soda/integrate-jira.md - - subtitle: Integrate with Metaphor - page: soda/integrate-metaphor.md - - subtitle: Integrate with MS Teams - page: soda/integrate-msteams.md - - subtitle: Integrate with Purview - page: soda/integrate-purview.md - - subtitle: Integrate with ServiceNow - page: soda/integrate-servicenow.md - - subtitle: Integrate with Slack - page: soda/integrate-slack.md - - subtitle: Integrate with SSO - page: soda-cloud/sso.md - - subtitle: Integrate webhooks - page: soda/integrate-webhooks.md - -- title: SodaCL reference - page: soda-cl/metrics-and-checks.md - subcategories: - - subtitle: Metrics and checks - page: soda-cl/metrics-and-checks.md - - subtitle: Optional check configurations - page: soda-cl/optional-config.md - - subtitle: Anomaly detection checks - page: soda-cl/anomaly-detection.md - - subtitle: Anomaly score checks (Deprecated) - page: soda-cl/anomaly-score.md - - subtitle: Check template - page: soda-cl/check-template.md - - subtitle: Cross checks - page: soda-cl/cross-row-checks.md - - subtitle: Distribution checks - page: soda-cl/distribution.md - - subtitle: Failed rows checks - page: soda-cl/failed-rows-checks.md - - subtitle: Filters and variables - page: soda-cl/filters.md - - subtitle: For each - page: soda-cl/for-each.md - - subtitle: Freshness checks - page: soda-cl/freshness.md - - subtitle: Group by - page: soda-cl/group-by.md - - subtitle: Group evolution - page: soda-cl/group-evolution.md - - subtitle: Missing metrics - page: soda-cl/missing-metrics.md - - subtitle: Numeric metrics - page: soda-cl/numeric-metrics.md - - subtitle: Reconciliation checks - page: soda-cl/recon.md - - subtitle: Reference checks - page: soda-cl/reference.md - - subtitle: Schema checks - page: soda-cl/schema.md - - subtitle: User-defined checks - page: soda-cl/user-defined.md - - subtitle: Validity metrics - page: soda-cl/validity-metrics.md - - subtitle: Troubleshoot SodaCL - page: soda-cl/troubleshoot.md - -- title: Data source reference - page: soda/connect-athena.md - subcategories: - - subtitle: Connect to Athena - page: soda/connect-athena.md - - subtitle: Connect to BigQuery - page: soda/connect-bigquery.md - - subtitle: Connect to ClickHouse - page: soda/connect-clickhouse.md - - subtitle: Connect to Dask and Pandas - page: soda/connect-dask.md - - subtitle: Connect to Databricks - page: soda/connect-databricks.md - - subtitle: Connect to Denodo - page: soda/connect-denodo.md - - subtitle: Connect to Dremio - page: soda/connect-dremio.md - - subtitle: Connect to DuckDB - page: soda/connect-duckdb.md - - subtitle: Connect to Google CloudSQL - page: soda/connect-cloudsql.md - - subtitle: Connect to IBM DB2 - page: soda/connect-db2.md - - subtitle: Connect to a local file - page: soda/connect-file.md - - subtitle: Connect to MotherDuck - page: soda/connect-motherduck.md - - subtitle: Connect to MS SQL Server - page: soda/connect-mssql.md - - subtitle: Connect to MySQL - page: soda/connect-mysql.md - - subtitle: Connect to OracleDB - page: soda/connect-oracle.md - - subtitle: Connect to PostgreSQL - page: soda/connect-postgres.md - - subtitle: 
Connect to Presto - page: soda/connect-presto.md - - subtitle: Connect to Redshift - page: soda/connect-redshift.md - - subtitle: Connect to Snowflake - page: soda/connect-snowflake.md - - subtitle: Connect to Spark - page: soda/connect-spark.md - - subtitle: Connect to Synapse - page: soda/connect-synapse.md - - subtitle: Connect to Trino - page: soda/connect-trino.md - - subtitle: Connect to Vertica - page: soda/connect-vertica.md - - subtitle: Troubleshoot connections - page: soda/connect-troubleshoot.md - -- title: Soda Cloud API - page: api-docs/public-cloud-api-v1.md - subcategories: - - subtitle: Soda Cloud API v1 - page: api-docs/public-cloud-api-v1.md - - subtitle: GET checks into CSV - page: api-docs/api2csv-example.md - -- title: Soda Cloud Reporting API - page: api-docs/reporting-api-v1.md - subcategories: - - subtitle: Reporting API v1 - page: api-docs/reporting-api-v1.md - - subtitle: Migration guide - page: api-docs/reporting-api-v1-migration-guide.md + - subtitle: Collibra + page: soda-administration/integrate-colibra.md -- title: Create a data contract - page: soda/data-contracts.md - subcategories: - - subtitle: Set up data contracts - page: soda/data-contracts.md - - subtitle: Write a data contract - page: soda/data-contracts-write.md - - subtitle: Verify a data contract - page: soda/data-contracts-verify.md - - subtitle: Data contract check reference - page: soda/data-contracts-checks.md - -- title: Learning resources - page: soda/glossary.md - subcategories: - - subtitle: Glossary - page: soda/glossary.md - - subtitle: Soda product overview - page: soda/product-overview.md - - subtitle: How Soda works - page: soda-library/how-library-works.md - - subtitle: Soda Agent basic concepts - page: soda-agent/basics.md - - subtitle: Soda architecture - page: soda-cloud/soda-cloud-architecture.md - - subtitle: Active checks and datasets - page: soda/active-check.md - - subtitle: Data security and privacy - page: soda/data-privacy.md - - subtitle: Soda Library usage statistics - page: soda-library/usage-stats.md - - subtitle: What's new in docs? 
- page: soda/new-documentation.md - - subtitle: Support - page: soda/support.md - - subtitle: Soda Community Code of Conduct - page: soda/community-coc.md - - subtitle: Soda SQL is now Soda Core - page: soda/deprecated.md +- title: References + page: refs/references.md + subcategories: + - subtitle: Data source connection reference + page: refs/data-source-connection-ref.md + - subtitle: Soda contract reference + page: refs/soda-contract-ref.md + - subtitle: Soda Python library API reference + page: soda-library/python_api.md + - subtitle: Soda CLI reference + page: refs/soda-python-library-api-ref.md + - subtitle: Soda Cloud REST API reference + page: api-docs/public-cloud-api-v1.md -- title: Soda Core - page: soda-core/overview-main.md - -- title: Release notes - page: release-notes/all.md - subcategories: - - subtitle: Soda products - page: release-notes/all.md - - subtitle: Release states - page: release-notes/states.md - - subtitle: Soda Library - page: release-notes/soda-library.md - - subtitle: Soda Agent - page: release-notes/soda-agent.md - - subtitle: Soda Cloud - page: release-notes/soda-cloud.md - - subtitle: Soda Cloud API - page: release-notes/soda-cloud-api.md - - subtitle: Soda Cloud Reporting API - page: release-notes/reporting-api.md - - subtitle: Soda Core - page: release-notes/soda-core.md diff --git a/_includes/agent-basics.md b/_includes/agent-basics.md new file mode 100644 index 00000000..7f00b6c4 --- /dev/null +++ b/_includes/agent-basics.md @@ -0,0 +1,13 @@ +> **Soda Agent Basics** +>
+> A Soda Agent is a tool that empowers Soda Cloud users to securely access data sources to scan for data quality. +> There are two types of Soda Agents: +> 1. **Soda-hosted Agent:** This is an out-of-the-box, ready-to-use agent that Soda provides and manages for you. It's the quickest way to get started with Soda as it requires no installation or deployment. It supports connections to specific data sources like BigQuery, Databricks SQL, MS SQL Server, MySQL, PostgreSQL, Redshift, and Snowflake. [Soda-hosted agent (missing)](#) +> 2. **Self-hosted Agent:** This is a version of the agent that you deploy in your own Kubernetes cluster within your cloud environment (like AWS, Azure, or Google Cloud). It gives you more control and supports a wider range of data sources. [Self-hosted agent (missing)](#) +> +> A Soda Agent is essentially Soda Library (the core scanning technology) packaged as a containerized application that runs in Kubernetes. It acts as the bridge between your data sources and Soda Cloud, allowing users to: +> - Connect to data sources securely +> - Run scans to check data quality +> - Create and manage no-code checks directly in the Soda Cloud interface +> +> The agent only sends metadata (not your actual data) to Soda Cloud, keeping your data secure within your environment. Soda [Agent basic concepts (missing)](#) \ No newline at end of file diff --git a/_includes/banner-upgrade-cloud.md b/_includes/banner-upgrade-cloud.md new file mode 100644 index 00000000..e53e5827 --- /dev/null +++ b/_includes/banner-upgrade-cloud.md @@ -0,0 +1,4 @@ +
+ × + This feature is only supported in Soda Cloud. +
\ No newline at end of file diff --git a/_includes/connect-datasource.md b/_includes/connect-datasource.md new file mode 100644 index 00000000..aedf8786 --- /dev/null +++ b/_includes/connect-datasource.md @@ -0,0 +1,17 @@ +1. In Soda Cloud, go to **your avatar** > **Data Sources**. +2. Click **New Data Source**, then follow the guided steps to create the connection. +Use the table below to understand what each field means and how to complete it: +3. Complete the connection configuration. These settings are specific to each data source (PostgreSQL, MySQL, Snowflake, etc) and usually include connection details such as host, port, credentials, and database name. + +#### New Data Source Attributes + +| Field or Label | Guidance | +| ----------------------- | ---------- | +| Data Source Label | Provide a unique identifier for the data source. Soda Cloud uses the label you provide to define the immutable name of the data source against which it runs the Default Scan.| +| Agent | Select the Soda-hosted agent, or the name of a Soda Agent that you have previously set up in your secure environment. This identifies the Soda Agent to which Soda Cloud must connect in order to run its scan. | +| Check Schedule | Provide the scan frequency details Soda Cloud uses to execute scans according to your needs. If you wish, you can define the schedule as a cron expression. | +| Starting At (UTC) | Select the time of day to run the scan. The default value is midnight. | +| Custom Cron Expression | (Optional) Write your own cron expression to define the schedule Soda Cloud uses to run scans. | +| Column Profiling Scan Schedule | Specify the time of day at which Soda runs the Automation scan.| +| Automation Scan Schedule | Specify the time of day at which Soda runs the daily anomaly dashboard scan.| +| Partition column suggestion - Optional | Add any amount of partition column suggestions. If a suggested column name fully matches a column discovered during metric monitoring or profiling, that column will be used as the partition column. The order of the suggested columns matters, as they will be checked sequentially from top to bottom until a match is found. If no match is found, heuristics will be applied to determine the partition column. You can change the partition column at any time in the dataset settings.| \ No newline at end of file diff --git a/_includes/metrics-monitoring.md b/_includes/metrics-monitoring.md new file mode 100644 index 00000000..c89b8665 --- /dev/null +++ b/_includes/metrics-monitoring.md @@ -0,0 +1 @@ +## What is metrics monitoring? \ No newline at end of file diff --git a/_includes/row-level-monitoring.md b/_includes/row-level-monitoring.md new file mode 100644 index 00000000..7289a763 --- /dev/null +++ b/_includes/row-level-monitoring.md @@ -0,0 +1 @@ +## What is row level monitoring? \ No newline at end of file diff --git a/_includes/soda-agent.md b/_includes/soda-agent.md new file mode 100644 index 00000000..917123eb --- /dev/null +++ b/_includes/soda-agent.md @@ -0,0 +1 @@ +## What is Soda Agent? \ No newline at end of file diff --git a/_includes/what-are-metrics-monitors.md b/_includes/what-are-metrics-monitors.md new file mode 100644 index 00000000..cf59e2de --- /dev/null +++ b/_includes/what-are-metrics-monitors.md @@ -0,0 +1,23 @@ +**Metrics monitors** are the foundation of data observability in Soda. Soda automatically collects dataset level metrics and tracks how those evolve over time. 
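+
+To make the idea concrete, the sketch below shows in plain Python what tracking a single dataset-level metric over time and flagging an out-of-range value looks like. It is only an illustration with invented numbers, not Soda's detection algorithm, which is described below.
+
+```python
+from statistics import mean, stdev
+
+# Illustrative only: daily row counts for one dataset, oldest to newest.
+history = [10_120, 10_340, 10_290, 10_410, 10_380, 10_450, 10_300]
+latest = 6_900  # today's measurement
+
+# Expected range derived from recent history (z = 3, as in the default sensitivity).
+mu, sigma = mean(history), stdev(history)
+lower, upper = mu - 3 * sigma, mu + 3 * sigma
+
+if not (lower <= latest <= upper):
+    print(f"Anomaly: row count {latest} is outside the expected range [{lower:.0f}, {upper:.0f}]")
+```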
+ +![with-library](/assets/images/metric-monitors-dashboard.png){:height="350"} + +Soda then uses a [proprietary anomaly detection algorithm]({% link observability/metric-monitors.md %}#what-makes-sodas-anomaly-detection-the-most-accurate-and-fastest) to identify when metrics deviate from expected patterns. These deviations are surfaced in the **Metric Monitors** tab for each dataset. + +![with-library](/assets/images/metric-monitors-row-count.png){:height="350"} + +You can use metric monitoring to: +- Spot problems without writing checks +- Establish baselines for normal behavior +- Use opt-in alerts to notify data owners when something unusual happens +- Provide insight to business users without requiring code + +## Key capabilities +- **Built for scale:** +Soda collects dataset-level metrics efficiently by accessing database metadata and leveraging metadata history when available. It calculates all metrics using optimized methods to reduce the computational load on your database and deliver fast results. + +- **Get instant insights:** +Soda supports native backfilling and backtesting. You can calculate historical data quality metrics and apply anomaly detection algorithms retroactively. This builds a more complete picture of past data quality and helps surface new issues. + +- **Reduce false alerts:** +Soda’s proprietary algorithm is **70% more accurate** at detecting anomalies in data quality metrics than external frameworks such as Facebook Prophet. \ No newline at end of file diff --git a/_includes/what-is-observability.md b/_includes/what-is-observability.md new file mode 100644 index 00000000..68bbaa15 --- /dev/null +++ b/_includes/what-is-observability.md @@ -0,0 +1,12 @@ +**Data observability** is the practice of continuously monitoring your datasets for unexpected changes & anomalies to uncover data quality issues. It involves collecting and analyzing metrics about your datasets to understand their health over time. 
+ +Instead of writing checks manually for each dataset, data observability monitors automatically detect anomalies in: +- Row counts +- Insertion times +- Schema changes + +**Data Observability helps you:** +- Detect incidents faster +- Scale coverage across more data +- Reduce time spent on manual testing +- Empower more team members to spot and act on issues \ No newline at end of file diff --git a/assets/images/add-notification-rule-bell.png b/assets/images/add-notification-rule-bell.png new file mode 100644 index 00000000..4bb48ca9 Binary files /dev/null and b/assets/images/add-notification-rule-bell.png differ diff --git a/assets/images/add-notification-rule-popup.png b/assets/images/add-notification-rule-popup.png new file mode 100644 index 00000000..a39626c8 Binary files /dev/null and b/assets/images/add-notification-rule-popup.png differ diff --git a/assets/images/connected-succesfully.png b/assets/images/connected-succesfully.png new file mode 100644 index 00000000..9b0990d5 Binary files /dev/null and b/assets/images/connected-succesfully.png differ diff --git a/assets/images/dataset-settings.png b/assets/images/dataset-settings.png index daa2bcbf..83b20142 100644 Binary files a/assets/images/dataset-settings.png and b/assets/images/dataset-settings.png differ diff --git a/assets/images/datasource-anomaly.png b/assets/images/datasource-anomaly.png new file mode 100644 index 00000000..a59234d2 Binary files /dev/null and b/assets/images/datasource-anomaly.png differ diff --git a/assets/images/datasource-discover.png b/assets/images/datasource-discover.png new file mode 100644 index 00000000..eda36283 Binary files /dev/null and b/assets/images/datasource-discover.png differ diff --git a/assets/images/datasource-discovery-profiling-anomaly.png b/assets/images/datasource-discovery-profiling-anomaly.png new file mode 100644 index 00000000..dd6e5215 Binary files /dev/null and b/assets/images/datasource-discovery-profiling-anomaly.png differ diff --git a/assets/images/datasource-owner.png b/assets/images/datasource-owner.png new file mode 100644 index 00000000..ef5f6f2d Binary files /dev/null and b/assets/images/datasource-owner.png differ diff --git a/assets/images/datasource-profile.png b/assets/images/datasource-profile.png new file mode 100644 index 00000000..638a4451 Binary files /dev/null and b/assets/images/datasource-profile.png differ diff --git a/assets/images/datasource-save-run.png b/assets/images/datasource-save-run.png new file mode 100644 index 00000000..2e64b0ef Binary files /dev/null and b/assets/images/datasource-save-run.png differ diff --git a/assets/images/exclusion-values.png b/assets/images/exclusion-values.png new file mode 100644 index 00000000..290d5afa Binary files /dev/null and b/assets/images/exclusion-values.png differ diff --git a/assets/images/flag-measurements.png b/assets/images/flag-measurements.png new file mode 100644 index 00000000..5a303f86 Binary files /dev/null and b/assets/images/flag-measurements.png differ diff --git a/assets/images/historical-metric-collection.png b/assets/images/historical-metric-collection.png new file mode 100644 index 00000000..f565494f Binary files /dev/null and b/assets/images/historical-metric-collection.png differ diff --git a/assets/images/metric-monitors-dashboard.png b/assets/images/metric-monitors-dashboard.png new file mode 100644 index 00000000..2a6b3ee7 Binary files /dev/null and b/assets/images/metric-monitors-dashboard.png differ diff --git a/assets/images/metric-monitors-row-count.png
b/assets/images/metric-monitors-row-count.png new file mode 100644 index 00000000..caa3ffbc Binary files /dev/null and b/assets/images/metric-monitors-row-count.png differ diff --git a/assets/images/sensitivity-slider.png b/assets/images/sensitivity-slider.png new file mode 100644 index 00000000..2891b4f0 Binary files /dev/null and b/assets/images/sensitivity-slider.png differ diff --git a/assets/images/test-connection.png b/assets/images/test-connection.png new file mode 100644 index 00000000..ee91cb45 Binary files /dev/null and b/assets/images/test-connection.png differ diff --git a/assets/images/track-incident.png b/assets/images/track-incident.png new file mode 100644 index 00000000..a026a055 Binary files /dev/null and b/assets/images/track-incident.png differ diff --git a/data-testing/check-missing-values.md b/data-testing/check-missing-values.md new file mode 100644 index 00000000..f0e752ba --- /dev/null +++ b/data-testing/check-missing-values.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Check missing values +parent: Write data contracts +grand_parent: Data testing +nav_order: 452 +--- +# Check missing values \ No newline at end of file diff --git a/data-testing/check-validity.md b/data-testing/check-validity.md new file mode 100644 index 00000000..82de7854 --- /dev/null +++ b/data-testing/check-validity.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Check validity +parent: Write data contracts +grand_parent: Data testing +nav_order: 453 +--- +# Check validity \ No newline at end of file diff --git a/data-testing/configure-notifications.md b/data-testing/configure-notifications.md new file mode 100644 index 00000000..72de9ddb --- /dev/null +++ b/data-testing/configure-notifications.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Configure notifications +parent: Data testing +nav_order: 460 +--- +# Configure notifications \ No newline at end of file diff --git a/data-testing/create-data-contract.md b/data-testing/create-data-contract.md new file mode 100644 index 00000000..2c88dd3c --- /dev/null +++ b/data-testing/create-data-contract.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Create a data contract +parent: Write data contracts +grand_parent: Data testing +nav_order: 451 +--- +# Create a data contract \ No newline at end of file diff --git a/data-testing/data-contract-operating-model.md b/data-testing/data-contract-operating-model.md new file mode 100644 index 00000000..b630afab --- /dev/null +++ b/data-testing/data-contract-operating-model.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Data contract operating model +parent: Introduction +grand_parent: Data testing +nav_order: 412 +--- +# Data contract operating model \ No newline at end of file diff --git a/data-testing/data-testing.md b/data-testing/data-testing.md new file mode 100644 index 00000000..8516f76d --- /dev/null +++ b/data-testing/data-testing.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Data testing +nav_order: 400 + +--- +# Introduction \ No newline at end of file diff --git a/data-testing/failed-rows.md b/data-testing/failed-rows.md new file mode 100644 index 00000000..574a0286 --- /dev/null +++ b/data-testing/failed-rows.md @@ -0,0 +1,6 @@ +--- +layout: default +title: Failed rows +parent: Data testing +--- +# Failed rows \ No newline at end of file diff --git a/data-testing/introduction.md b/data-testing/introduction.md new file mode 100644 index 00000000..d196520d --- /dev/null +++ b/data-testing/introduction.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Introduction +parent: Data testing 
+nav_order: 410 +--- +# Introduction \ No newline at end of file diff --git a/data-testing/send-results-to-soda-cloud.md b/data-testing/send-results-to-soda-cloud.md new file mode 100644 index 00000000..9f77546f --- /dev/null +++ b/data-testing/send-results-to-soda-cloud.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Send results to Soda Cloud +parent: Data testing +nav_order: 430 +--- +# Send results to Soda Cloud \ No newline at end of file diff --git a/data-testing/verify-contract-in-pipeline.md b/data-testing/verify-contract-in-pipeline.md new file mode 100644 index 00000000..10850c2a --- /dev/null +++ b/data-testing/verify-contract-in-pipeline.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Verify a contract in your pipeline +parent: Data testing +nav_order: 420 +--- +# Verify a contract in your pipeline \ No newline at end of file diff --git a/data-testing/verify-contract-on-time-schedule.md b/data-testing/verify-contract-on-time-schedule.md new file mode 100644 index 00000000..126b8df0 --- /dev/null +++ b/data-testing/verify-contract-on-time-schedule.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Verify a contract on a time schedule +parent: Data testing +nav_order: 440 +--- +# Verify a contract on a time schedule \ No newline at end of file diff --git a/data-testing/verify-first-contract.md b/data-testing/verify-first-contract.md new file mode 100644 index 00000000..40da22a4 --- /dev/null +++ b/data-testing/verify-first-contract.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Verify a first contract +parent: Introduction +grand_parent: Data testing +nav_order: 413 +--- +# Verify a first contract \ No newline at end of file diff --git a/data-testing/what-is-data-contract.md b/data-testing/what-is-data-contract.md new file mode 100644 index 00000000..ef4126d0 --- /dev/null +++ b/data-testing/what-is-data-contract.md @@ -0,0 +1,8 @@ +--- +layout: default +title: What is a data contract +parent: Introduction +grand_parent: Data testing +nav_order: 411 +--- +# What is a data contract \ No newline at end of file diff --git a/data-testing/with-agent.md b/data-testing/with-agent.md new file mode 100644 index 00000000..86557fc3 --- /dev/null +++ b/data-testing/with-agent.md @@ -0,0 +1,8 @@ +--- +layout: default +title: With the agent +parent: Verify a contract in your pipeline +grand_parent: Data testing +nav_order: 422 +--- +# Verify a contract in your pipeline with the agent \ No newline at end of file diff --git a/data-testing/with-library.md b/data-testing/with-library.md new file mode 100644 index 00000000..1285d80e --- /dev/null +++ b/data-testing/with-library.md @@ -0,0 +1,8 @@ +--- +layout: default +title: With the library +parent: Verify a contract in your pipeline +grand_parent: Data testing +nav_order: 421 +--- +# Verify a contract in your pipeline with the library \ No newline at end of file diff --git a/data-testing/write-data-contracts.md b/data-testing/write-data-contracts.md new file mode 100644 index 00000000..f27079ab --- /dev/null +++ b/data-testing/write-data-contracts.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Write data contracts +parent: Data testing +nav_order: 450 +--- +# Write data contracts \ No newline at end of file diff --git a/get-started/get-started-data-testing.md b/get-started/get-started-data-testing.md new file mode 100644 index 00000000..e23810f8 --- /dev/null +++ b/get-started/get-started-data-testing.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Getting started with data testing +nav_order: 415 +parent: Data testing +--- + +# 
Getting started with data testing \ No newline at end of file diff --git a/get-started/get-started.md b/get-started/get-started.md new file mode 100644 index 00000000..e69de29b diff --git a/index.html b/index.html index 8d0e5af9..7b3791a2 100644 --- a/index.html +++ b/index.html @@ -1,6 +1,7 @@ --- layout: default title: Home +nav_order: 100 description: Learn how to use Soda to test data quality. Use Soda Library with SodaCL to write checks for data quality and run scans from the command-line. permalink: / --- diff --git a/observability/adjust-sensitivity.md b/observability/adjust-sensitivity.md new file mode 100644 index 00000000..eb4be644 --- /dev/null +++ b/observability/adjust-sensitivity.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Adjust sensitivity +description: Adjust sensitivity +parent: Metric Monitors +nav_order: 560 +--- + +# Adjust sensitivity + +Use the sensitivity slider to control how strictly the algorithm detects anomalies. A higher setting widens the expected range, making the algorithm less likely to flag anomalies. A lower setting narrows the range, making it more likely to flag small variations as anomalies. + +![with-library](/assets/images/sensitivity-slider.png){:height="350" width="350"} + +By default, the sensitivity is set to 3, meaning values are considered anomalous if they fall outside three standard deviations from the predicted value. The lower and upper bounds of the expected range are calculated as: + +```python +lower = point_forecast - z * sigma +upper = point_forecast + z * sigma +``` + +Click **Apply sensitivity** to update the setting for future scans. This does not affect past results. + +## What's Next? + +- [Explore how to define exclusion value rules]({% link observability/define-exclusion-values.md %}) \ No newline at end of file diff --git a/observability/define-exclusion-values.md b/observability/define-exclusion-values.md new file mode 100644 index 00000000..ee4243ca --- /dev/null +++ b/observability/define-exclusion-values.md @@ -0,0 +1,18 @@ +--- +layout: default +title: Set exclusion values +description: Set exclusion values +parent: Metric Monitors +nav_order: 561 +--- +# Set exclusion values + +Exclude specific values or value ranges from being flagged as anomalies. Use this to filter out known edge cases that should not trigger alerts. + +![with-library](/assets/images/exclusion-values.png){:height="350" width="350"} + +Click **+ Add exclusion**, select whether to exclude a single value or a range, and enter the corresponding value or limits. Click **Set Exclusion Values** to apply the changes. These exclusions only apply to future scans. + +## What's Next? + +- [Explore how to give feedback to the model to improve detection]({% link observability/give-feedback.md %}) \ No newline at end of file diff --git a/observability/give-feedback.md b/observability/give-feedback.md new file mode 100644 index 00000000..0fea1b13 --- /dev/null +++ b/observability/give-feedback.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Give feedback to improve detection +description: Give feedback to improve detection +parent: Metric Monitors +nav_order: 562 +--- + +# Give feedback to improve detection + +Review and correct anomaly results by flagging individual measurements as either expected or anomaly. This feedback helps the model treat similar future values more intelligently. 
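+
+As a purely hypothetical sketch of why such feedback matters (this is not Soda's model, and the values are invented), a detector that derives its expected range from past measurements can skip the points you confirmed as anomalies and keep the points you flagged as expected, so the range reflects what you consider normal:
+
+```python
+from statistics import mean, stdev
+
+# Hypothetical feedback store: each measurement with an optional user flag.
+measurements = [
+    {"value": 10_300, "flag": None},
+    {"value": 14_800, "flag": "expected"},  # spike the user marked as normal
+    {"value": 10_450, "flag": None},
+    {"value": 2_100, "flag": "anomaly"},    # confirmed true anomaly
+]
+
+# Confirmed anomalies are left out of the baseline so they do not widen it;
+# values flagged as expected stay in, so similar values are tolerated next time.
+baseline = [m["value"] for m in measurements if m["flag"] != "anomaly"]
+mu, sigma = mean(baseline), stdev(baseline)
+print(f"Expected range for the next measurement: [{mu - 3 * sigma:.0f}, {mu + 3 * sigma:.0f}]")
+```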
+ +![with-library](/assets/images/flag-measurements.png){:height="350" width="350"} + +Use this to: + +- Mark a detected anomaly as a false positive by clicking Flag as expected +- Confirm an anomaly is a true positive by clicking Flag as anomaly +- Mark a measurement as a false negative by clicking Flag as anomaly +- Confirm a measurement is a true negative by clicking Flag as expected + +You can also link the measurement to an existing incident or create a new one. This helps track related anomalies and streamline investigations. To learn how to create and manage incidents check out the [Manage incidents guide]({% link observability/manage-incidents.md %}) + +## What's Next? + +- [Explore how to manage incidents for a streamlined resolution.]({% link observability/manage-incidents.md %}) \ No newline at end of file diff --git a/observability/manage-incidents.md b/observability/manage-incidents.md new file mode 100644 index 00000000..9e422a38 --- /dev/null +++ b/observability/manage-incidents.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Manage incidents +description: Manage incidents +parent: Metric Monitors +nav_order: 563 +--- + +# Manage incidents + +When an anomaly is detected or a measurement does not look correct, you can create an incident in Soda Cloud to track your team's investigation and resolution of a data quality issue. + +![with-library](/assets/images/flag-measurements.png){:height="350" width="350"} + +1. Click on the measurement you wish to investigate, then select **Create Incident**. + +2. Provide a **Title**, **Severity**, and **Description** of your new incident, then save. + + +After creating the incident, you can track and update its status in the Incidents tab as your team works toward a resolution. + +![with-library](/assets/images/track-incident.png){:height="350"} + + +## What's Next? + +- [Set up alerts to be notified when an anomaly is detected]({% link observability/set-up-alerts.md %}) \ No newline at end of file diff --git a/observability/metric-monitors.md b/observability/metric-monitors.md new file mode 100644 index 00000000..e6abfb81 --- /dev/null +++ b/observability/metric-monitors.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Metric Monitors +description: Plot elements and how to understand the results +parent: Observability Guide +nav_order: 550 +--- + +# Metric Monitors + +{% include what-are-metrics-monitors.md %} + +Below is a table of the Metric Monitors that Soda supports: + +| Metric name | Based on | How it's calculated | +| ---------------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------- | +| **Total Row Count** | metadata | The total number of rows in the dataset at scan time obtained from metadata. | +| **Total Row Count Change** | metadata | The total number of rows at the previous scan time deducted from the current total row count obtained from metadata at scan time. | +| **Last Insertion Time**  | metadata | The time of last insert at the scan time obtained from metadata and deducted from the scan time. | +| **Schema Changes** | metadata | The number of changes in the dataset schema at the scan time compared to the previous scan. | +| **Partition Row Count** | data | The number of rows inserted in the last partition. | +| **Most Recent Timestamp** | data | The most recent timestamp in the time partition column at scan, deducted from scan time. 
| + +> For **Schema Changes**, the expected result is always to have no schema changes, regardless of whether there have been frequent schema changes in the past or not. + +## How does partitioning and profiling work? +When you set up the anomaly dashboard, Soda begins by partitioning your data. To maximize efficiency, Soda does not profile the entire dataset; instead, it partitions your data to profile only a representative sample. + +Profiling involves extracting metrics such as the mean, minimum, and maximum values in a column, and counting the number of missing values. + +Here's how Soda partitions your data for profiling: + +- **With a `TIME` type column:** + Soda identifies a column containing `TIME` type data and partitions the dataset to include only the last 30 days of data. + +- **Without a `TIME` type column:** + - If the dataset contains fewer than one million rows, Soda profiles the entire dataset. + - If the dataset contains more than one million rows, Soda randomly selects a sample of one million rows for profiling. + + +## What makes Soda's Anomaly Detection the most accurate and fastest? + +1. All components of the system have been developed from the ground up and ensembled internally, without relying on third-party frameworks such as Facebook Prophet, which tend to impose rigid modeling assumptions and lack interpretability. + +2. A key differentiator of our algorithm is the full transparency and control we have over the modeling stack. This enables us to rigorously evaluate, explain, and improve model behavior—crucial for high-stakes use cases like data quality monitoring. + +3. The algorithm has been benchmarked against Prophet and demonstrated a 70% improvement in detecting anomalous data quality metrics. This performance gain is essential in production environments where false positives and missed anomalies can significantly erode trust and create operational inefficiencies. + +4. The algorithm begins by characterizing the time series based on complexity and the presence of seasonality. A routing mechanism then dynamically selects the optimal modeling path. One of the core modeling strategies involves adaptive exponential smoothing, which allows for robust trend and seasonality capture. + +5. The model supports both automatic learning of new patterns and user-in-the-loop feedback for continuous refinement. + +## What's Next? + +- [Explore how to adjust the sensitivity of the algorithm]({% link observability/adjust-sensitivity.md %}) +- [Explore how to define exclusion value rules]({% link observability/define-exclusion-values.md %}) +- [Explore how to give feedback to the model to improve detection]({% link observability/give-feedback.md %}) \ No newline at end of file diff --git a/observability/observability-guide.md b/observability/observability-guide.md new file mode 100644 index 00000000..7ca2d2de --- /dev/null +++ b/observability/observability-guide.md @@ -0,0 +1,9 @@ +--- +layout: default +title: Observability Guide +description: +parent: Data Observability +nav_order: 515 +--- + +# Observability Guide \ No newline at end of file diff --git a/observability/observability.md b/observability/observability.md new file mode 100644 index 00000000..99db1b62 --- /dev/null +++ b/observability/observability.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Data Observability +description: What is observability? 
+nav_order: 500 +--- + + +  +  + +# Data Observability + +*Last modified on {% last_modified_at %}* + +{% include banner-upgrade-cloud.md %} + +Use observability to monitor data quality at scale across all your datasets. +Observability helps you catch unexpected issues without needing to define every rule up front. + +Where data testing focuses on known expectations, observability helps you detect the unknown unknowns—like late-arriving records, schema changes, or sudden spikes in missing values. It offers broad, low-effort coverage and requires little configuration, making it easy to share data quality responsibilities across technical and non-technical teams. + +## What is data observability? + +{% include what-is-observability.md %} + + +## What are Metric Monitors? + +{% include what-are-metrics-monitors.md %} + + + +## What's Next? +To get started with Soda observability, follow one of these guides: + +- [Data observability quickstart]({% link observability/quickstart.md %}): Set up monitoring to detect anomalies in your datasets. +- [Data observability guide]({% link observability/observability-guide.md %}): Learn how to get the most out of Soda’s data observability platform. \ No newline at end of file diff --git a/observability/prerequisites.md b/observability/prerequisites.md new file mode 100644 index 00000000..e5ff47a8 --- /dev/null +++ b/observability/prerequisites.md @@ -0,0 +1,62 @@ +--- +layout: default +title: Prerequisites +description: How to set up Soda Agent? +parent: Observability Guide +nav_order: 520 +--- + +# Prerequisites +To use data observability, you need the following: + +- Create a Soda Cloud account. +- Set up a Soda Agent (optional). +- Connect a data source. + +## Create a Soda Cloud account +If you don’t have a Soda Cloud account, [book a demo](https://www.soda.io/schedule-a-demo). You’ll get a free trial to explore and test Soda. + +## Soda Agent (Optional) +This step is optional. Soda creates a Soda-hosted Agent with every account. +You can think of an Agent as the bridge between your data sources and Soda Cloud. A Soda-hosted Agent runs in Soda's cloud and securely connects to your data sources to scan for data quality issues. + +If you are an admin and prefer to deploy your own agent, you can configure a self-hosted agent: +- In Soda Cloud, go to **your avatar** > **Agents** +- Click **New Soda Agent** and follow the setup instructions +
+![soda-hosted-agent](/assets/images/soda-hosted-agent.png){:height="700px" width="700px"} + +{% include agent-basics.md %} + + +## Connect a Data Source +{% include connect-datasource.md %} + +### Supported databases for data observability + +Soda supports metric monitoring for multiple databases. Soda leverages metadata history when available. If metadata history isn't available for your data source, Soda builds history gradually as scans occur. + +#### Metric monitoring support + +- **Metadata-based metrics** + - [Snowflake]({% link soda/connect-snowflake.md %}) + - [BigQuery]({% link soda/connect-bigquery.md %}) + - [Databricks SQL]({% link soda/connect-spark.md %}#connect-to-spark-for-databricks-sql) + - [MS SQL Server]({% link soda/connect-mssql.md %}) + - [PostgreSQL]({% link soda/connect-postgres.md %}) + +- **Historical metric support** + - **From metadata** + - [Databricks SQL]({% link soda/connect-spark.md %}#connect-to-spark-for-databricks-sql) + + - **From query logs** + - [Snowflake]({% link soda/connect-snowflake.md %}) + - [BigQuery]({% link soda/connect-bigquery.md %}) + +- **Data-based metrics** + - All data sources are theoretically supported for data-based metric monitoring. + + +## What's Next? + +- [Explore how to analyse and keep track of your Metric Monitors.]({% link observability/metric-monitors.md %}) \ No newline at end of file diff --git a/observability/quickstart.md b/observability/quickstart.md new file mode 100644 index 00000000..bafa41b1 --- /dev/null +++ b/observability/quickstart.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Quickstart Observability +description: Quickstart Observability +parent: Data Observability +nav_order: 511 +--- + +# Quickstart: Get Started with Observability + +*Last modified on {% last_modified_at %}* + +This quickstart walks you through enabling observability on a single dataset to help you explore Soda’s functionality as quickly as possible. + +You will: +- Create a Soda Cloud account +- Connect a data source +- Configure your first dataset to enable observability + +> 💡 We recommend enabling observability for a single dataset that updates daily and has been in use for a while. +> This gives you more meaningful results, faster. +> +> You can always update or remove the data source later—this is just a test connection to explore the platform. + +## Step 1: Create a Soda Cloud Account +1. Go to cloud.soda.io and sign up for a Soda Cloud account. If you already have an account, log in. +2. By default, Soda creates a Soda-hosted Agent for all new accounts. You can think of an Agent as the bridge between your data sources and Soda Cloud. A Soda-hosted Agent runs in Soda's cloud and securely connects to your data sources to scan for data quality issues. +3. If you are an admin and prefer to deploy your own agent, you can configure a self-hosted agent: + + - In Soda Cloud, go to **your avatar** > **Agents** + - Click **New Soda Agent** and follow the setup instructions + +
+![soda-hosted-agent](/assets/images/soda-hosted-agent.png){:height="700px" width="700px"} + +{% include agent-basics.md %} + +## Step 2: Add a Data Source +{% include connect-datasource.md %} + +Use the appropriate guide below to complete the connection: +* [Connect to BigQuery]({% link soda/connect-bigquery.md %}) +* [Connect to Databricks SQL]({% link soda/connect-spark.md %}#connect-to-spark-for-databricks-sql) +* [Connect to MS SQL Server]({% link soda/connect-mssql.md %}) +* [Connect to PostgreSQL]({% link soda/connect-postgres.md %}) +* [Connect to Snowflake]({% link soda/connect-snowflake.md %}) + +## Step 3: Test Data Source Connection +Click **Test Connection** at the top to verify that all connection settings are configured correctly. +![test-connection](/assets/images/test-connection.png){:height="150px"} + +If everything is configured properly, you’ll see a success screen like the one below. + +![connected-succesfully](/assets/images/connected-succesfully.png){:height="100px"} + +## Step 4: Configure Dataset Discovery, Profiling and Anomaly Detection +In this step, you define which datasets and columns Soda will monitor, and enable anomaly detection to automatically surface issues. + +### 4.1 Dataset Discovery +Dataset discovery collects metadata about each dataset, including its schema and the data types of each column. + +![datasource-discover](/assets/images/datasource-discover.png){:height="400px"} + +Specify the datasets you want to profile. Because dataset discovery can be resource-intensive, only include datasets that are important for observability. + +See [Compute consumption and cost considerations]({% link soda-cl/profile.md %}#compute-consumption-and-cost-considerations) for more detail. + +### 4.2 Column Profiling +Column profiling captures metrics such as the mean, minimum, and maximum values in a column, as well as the number of missing values. + +![datasource-profile](/assets/images/datasource-profile.png){:height="400px"} + +Use include/exclude patterns to specify which columns Soda should profile. These metrics feed into the anomaly dashboard. + +By default, Soda includes all datasets in the data source. If you’re just testing the functionality, you can leave the default settings and click Next to continue. + +### 4.3 Anomaly Detection +In the Detect Anomalies tab, define which datasets should be monitored for anomalies like schema changes or unusual metric behavior. + +![datasource-anomaly](/assets/images/datasource-anomaly.png){:height="400px"} + +Use include/exclude filters to specify the datasets to monitor with Metric Monitors. + +![historical-metric-collection](/assets/images/historical-metric-collection.png){:width="400px"} + +You can also enable historical metric collection to calculate past metrics and provide training data for the anomaly detection engine. This helps with: + +1. Assessing how the data quality metrics were performing in the past. +2. Using them as training data for the anomaly detection algorithms. + + +## Step 5: Assing a Data Source and Dataset Owner +Assign responsibility for maintaining the data source and each dataset. + +![datasource-owner](/assets/images/datasource-owner.png){:height="400px"} + +- **Data Source Owner:** Manages the connection settings and scan configurations for the data source. +- **Dataset Owner:** Becomes the default owner of each dataset for monitoring and collaboration. + +For more details, see [Roles and rights in Soda Cloud]({% link soda-cloud/roles-global.md %}). 
+ +## Step 6: Test Connection and Save +- Click **Test Connection** to verify your configuration. +- Click **Save** to start profiling the selected datasets. + +![datasource-save-run](/assets/images/datasource-save-run.png){:height="400px"} + +Once saved, Soda runs a first scan using your profiling settings. This initial scan provides baseline measurements that Soda uses to begin learning patterns and identifying anomalies. + +## Step 7: View Metric Monitor Results +1. Go to the **Datasets** page in Soda Cloud. +2. Select a dataset you included in profiling. +3. Open the **Metric Monitors** tab to view automatically detected issues. + +After the historical metric collection scan is complete (this usually takes just a few minutes), you can review the results. + +![metric-monitors-dashboard](/assets/images/metric-monitors-dashboard.png){:height="700px"} + +On this screen, you’ll see the following metrics: + +| Metric name | Based on | How it's calculated | +| ---------------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------- | +| **Total Row Count** | metadata | The total number of rows in the dataset at scan time obtained from metadata. | +| **Total Row Count Change** | metadata | The total number of rows at the previous scan time deducted from the current total row count obtained from metadata at scan time. | +| **Last Insertion Time**  | metadata | The time of last insert at the scan time obtained from metadata and deducted from the scan time. | +| **Schema Changes** | metadata | The number of changes in the dataset schema at the scan time compared to the previous scan. | +| **Partition Row Count** | data | The number of rows inserted in the last partition. | +| **Most Recent Timestamp** | data | The most recent timestamp in the time partition column at scan, deducted from scan time. + +### 🎉 Congratulations! You’ve set up your dataset and enabled observability. + +## What's Next? +Now that your first dataset is configured and observability is active, try: + +- [Explore detailed metrics in the anomaly guide]({% link observability/observability-guide.md %}) +- [Set up alerts for anomaly detection]({% link observability/set-up-alerts.md %}) diff --git a/observability/set-up-alerts.md b/observability/set-up-alerts.md new file mode 100644 index 00000000..a9514530 --- /dev/null +++ b/observability/set-up-alerts.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Set up alerts +description: Set up alerts +parent: Metric Monitors +nav_order: 564 +--- + +# Set alert notification rules + +Ascribing to a "no noise" policy, Soda enables you to define rules to customize the alert notifications you receive when scan results warn or fail. For example, you can define a notification rule to instruct Soda Cloud to send an alert to your #sales-engineering Slack channel whenever an anomaly on the snowflake_sales data is detected. + +In Soda Cloud, navigate to your **Metric Monitors** dashboard, then click the **bell** icon for the metric monitor you want to set up an alert for. + + +![with-library](/assets/images/add-notification-rule-bell.png){:height="350"} + +Follow the guided steps to complete the new rule. + +![with-library](/assets/images/add-notification-rule-popup.png){:height="350"} + +Check out the integration guides to learn how to receive alerts on [Slack]({% link soda/integrate-slack.md %}) and [MS Teams]({% link soda/integrate-msteams.md %}). + + +## What's Next? 
+ +- [Explore how to update any of the Metric Monitors settings]({% link observability/update-settings.md %}) \ No newline at end of file diff --git a/observability/update-settings.md b/observability/update-settings.md new file mode 100644 index 00000000..5cd44b0d --- /dev/null +++ b/observability/update-settings.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Update settings +description: Settings for metrics monitoring +parent: Metric Monitors +nav_order: 565 +--- + +# Update settings + +You can update the settings of a dataset used in the **Metric Monitors** page. This includes editing dataset attributes, specifying a time partition column for metric calculations and profiling, and choosing whether to collect failed row samples. + +## Edit dataset settings + +To update dataset settings: + +1. Go to the **Datasets** tab. +2. Click the **three-dot menu** next to the dataset you want to update. +3. Select **Edit settings**. + +![with-library](/assets/images/dataset-settings.png){:height="350"} + +The **Dataset Settings** panel opens with three tabs: + +### Attributes + +Update the dataset metadata: +- **Dataset label**: The display name of the dataset. +- **Source**: The location of the dataset. +- **Owned by**: The user responsible for the dataset. +- **Tags**: Add searchable tags to organize datasets. +- **Description**: Optionally provide context about the dataset’s purpose or usage. + +### Profiling & Metric Monitoring + +Set the **Time Partition Column** to enable time-based metric calculations and profiling. This column should contain timestamps without time zones and is typically required for daily-partitioned data. + +### Failed Row Samples + +Configure how to collect samples of rows that fail checks: +- **Failed rows sample collection**: Choose whether to inherit the organization-wide setting or override it. +- **Collect failed row samples for**: Choose which columns to include in the sample. + +Click **Save** to apply your changes.
\ No newline at end of file diff --git a/refs/data-source-connection-ref.md b/refs/data-source-connection-ref.md new file mode 100644 index 00000000..c763dce9 --- /dev/null +++ b/refs/data-source-connection-ref.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Data source connection reference +description: Data source connection reference +parent: References +--- + +# Data source connection reference \ No newline at end of file diff --git a/refs/references.md b/refs/references.md new file mode 100644 index 00000000..98481831 --- /dev/null +++ b/refs/references.md @@ -0,0 +1,6 @@ +--- +layout: default +title: References + +--- +# References \ No newline at end of file diff --git a/refs/soda-cli-ref.md b/refs/soda-cli-ref.md new file mode 100644 index 00000000..3eafb0b3 --- /dev/null +++ b/refs/soda-cli-ref.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Soda CLI reference +description: Soda CLI reference +parent: References +--- + +# Soda CLI reference \ No newline at end of file diff --git a/refs/soda-cloud-rest-api-ref.md b/refs/soda-cloud-rest-api-ref.md new file mode 100644 index 00000000..44e85fa7 --- /dev/null +++ b/refs/soda-cloud-rest-api-ref.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Soda Cloud REST API reference +description: Soda Cloud REST API reference +parent: References +--- + +# Soda Cloud REST API reference \ No newline at end of file diff --git a/refs/soda-contract-ref.md b/refs/soda-contract-ref.md new file mode 100644 index 00000000..9ac38a75 --- /dev/null +++ b/refs/soda-contract-ref.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Soda Contract reference +description: Soda Contract reference +parent: References +--- + +# Soda Contract reference \ No newline at end of file diff --git a/refs/soda-python-library-api-ref.md b/refs/soda-python-library-api-ref.md new file mode 100644 index 00000000..2998778d --- /dev/null +++ b/refs/soda-python-library-api-ref.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Soda Python library API reference +description: Soda Python library API reference +parent: References +--- + +# Soda Python library API reference \ No newline at end of file diff --git a/soda-administration/integrate-atlan.md b/soda-administration/integrate-atlan.md new file mode 100644 index 00000000..43298b90 --- /dev/null +++ b/soda-administration/integrate-atlan.md @@ -0,0 +1,9 @@ +--- +layout: default +title: Atlan +description: Integrate Atlan +parent: Integrate with Soda +grand_parent: Soda Cloud +--- + +# Atlan \ No newline at end of file diff --git a/soda-administration/integrate-collibra.md b/soda-administration/integrate-collibra.md new file mode 100644 index 00000000..42a6af91 --- /dev/null +++ b/soda-administration/integrate-collibra.md @@ -0,0 +1,9 @@ +--- +layout: default +title: Collibra +description: Integrate Collibra +parent: Integrate with Soda +grand_parent: Soda Cloud +--- + +# Collibra \ No newline at end of file diff --git a/soda-administration/integrate-with-soda.md b/soda-administration/integrate-with-soda.md new file mode 100644 index 00000000..996b5d3c --- /dev/null +++ b/soda-administration/integrate-with-soda.md @@ -0,0 +1,9 @@ +--- +layout: default +title: Integrate with Soda +description: Integrate with Soda +parent: Soda Cloud +nav_order: 620 +--- + +# Integrate with Soda \ No newline at end of file diff --git a/soda-administration/managing-authorization.md b/soda-administration/managing-authorization.md new file mode 100644 index 00000000..95662b60 --- /dev/null +++ b/soda-administration/managing-authorization.md @@ -0,0 +1,9 @@
+--- +layout: default +title: Managing authorization +description: Managing authorization +parent: Soda Cloud +nav_order: 610 +--- + +# Managing authorization \ No newline at end of file diff --git a/soda-administration/permissions.md b/soda-administration/permissions.md new file mode 100644 index 00000000..140a4106 --- /dev/null +++ b/soda-administration/permissions.md @@ -0,0 +1,10 @@ +--- +layout: default +title: Permissions +description: Permissions +parent: Managing authorization +grand_parent: Soda Cloud +nav_order: 613 +--- + +# Permissions \ No newline at end of file diff --git a/soda-administration/soda-cloud.md b/soda-administration/soda-cloud.md new file mode 100644 index 00000000..745393e4 --- /dev/null +++ b/soda-administration/soda-cloud.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Soda Cloud +description: Soda Cloud +nav_order: 600 +--- + +# Soda Cloud \ No newline at end of file diff --git a/soda-administration/sso.md b/soda-administration/sso.md new file mode 100644 index 00000000..c48b0ac5 --- /dev/null +++ b/soda-administration/sso.md @@ -0,0 +1,10 @@ +--- +layout: default +title: SSO +description: SSO +parent: Managing authorization +grand_parent: Soda Cloud +nav_order: 611 +--- + +# SSO \ No newline at end of file diff --git a/soda-administration/user-groups.md b/soda-administration/user-groups.md new file mode 100644 index 00000000..ca9ddb4e --- /dev/null +++ b/soda-administration/user-groups.md @@ -0,0 +1,10 @@ +--- +layout: default +title: User groups +description: User groups +parent: Managing authorization +grand_parent: Soda Cloud +nav_order: 612 +--- + +# User groups \ No newline at end of file diff --git a/soda/connect-troubleshoot.md b/soda/connect-troubleshoot.md index 9b7d6b56..4fe04707 100644 --- a/soda/connect-troubleshoot.md +++ b/soda/connect-troubleshoot.md @@ -2,7 +2,7 @@ layout: default title: Troubleshoot data source connections description: -parent: +parent: Data source reference --- # Troubleshoot data source connections diff --git a/soda/get-started-roadmap.md b/soda/get-started-roadmap.md index 2bae1f27..80379707 100644 --- a/soda/get-started-roadmap.md +++ b/soda/get-started-roadmap.md @@ -1,6 +1,7 @@ --- layout: default title: Get started roadmap +nav_order: 200 description: Get started with Soda! Use this curated set of instructions to quickly get data quality tests up and running. parent: Get started --- diff --git a/soda/overview.md b/soda/overview.md new file mode 100644 index 00000000..58b2f7e5 --- /dev/null +++ b/soda/overview.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Soda Overview +description: Soda Overview +nav_order: 300 +--- + +  +  + +# Soda Overview +*Last modified on {% last_modified_at %}* + +Soda helps data teams build reliable data products and pipelines. + +## What Soda does +You can use Soda to test data as it flows through your pipelines and monitor data quality over time. Embed tests directly in your workflows or use Soda’s built-in observability features to detect and resolve data issues early. + +Soda helps you answer key questions about your data: + +- Is the data fresh? +- Is any data missing? +- Are there duplicate records? +- Did something go wrong during a transformation? +- Are all values within expected ranges? +- Are data quality metrics changing over time? Are there anomalies in freshness, row counts, or missing values? + +## How Soda approaches Data Quality + +Soda follows two complementary approaches to managing data quality: data testing and data observability. 
Together, they help you prevent data issues and detect unexpected changes in production. + +### Data Testing +Data testing is a proactive approach to catch data quality issues before they impact downstream systems. It belongs early in your data lifecycle—during development, deployment, or transformation. + +**Use data testing to:** +- Validate data during CI/CD workflows +- Compare source and target tables for reconciliation +- Check assumptions in transformation logic +- Enforce data contracts between teams and systems + +Data tests are explicit, rule-based checks that you can define based on known expectations. A minimal SodaCL-style sketch of such checks appears at the end of this document. + +### Data Observability +Data observability is a reactive approach to monitor data in production and catch unexpected issues as they emerge. It helps answer the question: What is happening with my data right now, and how is that changing over time? + +**Use data observability to:** +- Detect anomalies in data quality metrics such as freshness, row counts, or null values +- Monitor metric trends and seasonality +- Identify late-arriving or missing records +- Get alerted when values deviate from historical norms + +## How Soda fits into your stack + +Soda integrates with all major data platforms, including: + +- **Databases and data warehouses:** BigQuery, Snowflake, Redshift, Databricks, PostgreSQL, Spark, Dask, Presto, DuckDB, and more. +- **Data catalog and metadata tools:** Alation, Atlan, Collibra, data.world, Zeenea, and more. +- **Orchestration platforms:** Airflow, Azure Data Factory, Dagster, dbt, Prefect, and more. +- **Cloud providers:** AWS, Google Cloud, Azure. +- **BI tools:** Looker, Tableau, Power BI. +- **Messaging and ticketing:** Jira, Opsgenie, PagerDuty, ServiceNow, Microsoft Teams, and Slack. + +You can set up data quality tests programmatically using Soda Library, or configure them through the Soda Cloud user interface—without writing code. Test results are pushed to Soda Cloud for monitoring, collaboration, and alerting. + +### Soda's deployment options + +You can deploy Soda in three ways, depending on your team’s scale, security needs, and infrastructure preferences. + +#### Self-operated deployment + +Install Soda Library locally and connect it to Soda Cloud using API keys (a minimal configuration sketch appears at the end of this document). +Soda Library scans your datasets and pushes metadata to Soda Cloud. There, your team can view check results, collaborate on incidents, and integrate with tools like Slack. + +By default, your data stays within your private network. See [Data security and privacy (missing)](#) for more details. To learn more about how to set up a self-operated deployment, check out the [Self-operated deployment guide (missing)](#). + +![with-library](/assets/images/with-library.png){:height="500px" width="500px"} + +#### Soda-hosted deployment + +Use Soda Cloud to connect directly to your data sources. Soda-hosted deployment gives you a secure, managed way to scan data, create no-code checks, and share insights—all from the UI. + +This option supports BigQuery, Databricks SQL, MS SQL Server, MySQL, PostgreSQL, Redshift, and Snowflake. To learn more about how to set up a Soda-hosted deployment, check out the [Soda-hosted deployment guide (missing)](#). + +![with-managed-agent](/assets/images/with-managed-agent.png){:height="60px" width="600px"} + +#### Self-hosted deployment + +Run Soda Library inside your own Kubernetes cluster in AWS, Google Cloud, or Azure.
+ +This deployment gives infrastructure teams full control over how Soda accesses data while still enabling Soda Cloud users to write and view checks. Checks can be written programmatically or through the UI. To learn more about how to set up a self-hosted deployment, check out the [Self-hosted deployment guide (missing)](#). + +![with-agent](/assets/images/with-agent.png){:height="60px" width="600px"} + + +## What's Next? + +To get started with Soda, follow one of these quickstarts based on your needs: + +- [Data testing quickstart](#): Learn how to define and run checks in your workflows. +- [Data observability quickstart]({% link observability/quickstart.md %}): Set up monitoring to detect anomalies in your datasets. \ No newline at end of file diff --git a/soda/quick-start-sip.md b/soda/quick-start-sip.md index f0f8ac97..a295accb 100644 --- a/soda/quick-start-sip.md +++ b/soda/quick-start-sip.md @@ -2,7 +2,7 @@ layout: default title: Take a sip of Soda description: Follow this tutorial to set up and run a simple Soda scan for data quality using example data. -parent: Home +parent: Get started redirect_from: - /soda/core-interactive-demo.html - /soda/quick-start-soda-core.html
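As referenced in the Data Testing section of the Soda Overview above, data tests are explicit, rule-based checks. The following is a minimal SodaCL-style sketch of what such checks can look like; the dataset name `dim_customer` and its columns are hypothetical, so substitute the datasets and expectations that matter in your own pipelines:

```yaml
# checks.yml — a minimal sketch of rule-based data tests.
# The dataset (dim_customer) and its columns are hypothetical.
checks for dim_customer:
  - row_count > 0                      # the dataset is not empty
  - missing_count(email) = 0           # no missing contact details
  - duplicate_count(customer_id) = 0   # the business key stays unique
  - freshness(created_at) < 1d         # data arrived within the last day
```

Checks like these can run in CI/CD or after a transformation step, which is how the proactive testing approach described above catches issues before they reach production.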
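For the self-operated deployment described in the Soda Overview above, Soda Library typically reads a YAML configuration file that pairs a data source connection with Soda Cloud API keys. This is a minimal sketch, assuming a PostgreSQL data source; the data source name, host, database, and environment variables are hypothetical placeholders, and the exact connection properties for your platform are listed in the data source connection guides:

```yaml
# configuration.yml — a minimal sketch; the names, host, and environment
# variables below are hypothetical placeholders.
data_source analytics_postgres:
  type: postgres
  host: db.internal.example.com
  port: "5432"
  username: ${POSTGRES_USER}
  password: ${POSTGRES_PASSWORD}
  database: analytics
  schema: public

soda_cloud:
  host: cloud.soda.io
  api_key_id: ${SODA_CLOUD_API_KEY_ID}
  api_key_secret: ${SODA_CLOUD_API_KEY_SECRET}
```

Soda Library reads this file when you run a scan and pushes the results to Soda Cloud, where your team reviews them just as with the other deployment options.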