diff --git a/rust/otap-dataflow/README.md b/rust/otap-dataflow/README.md index 4b9067600d..80dcf4f7f8 100644 --- a/rust/otap-dataflow/README.md +++ b/rust/otap-dataflow/README.md @@ -1,4 +1,4 @@ -# OTAP Dataflow Library +# OTAP Dataflow Engine [![build](https://github.com/open-telemetry/otel-arrow/actions/workflows/rust-ci.yml/badge.svg)](https://github.com/open-telemetry/otel-arrow/actions/workflows/rust-ci.yml) [![build](https://github.com/open-telemetry/otel-arrow/actions/workflows/rust-audit.yml/badge.svg)](https://github.com/open-telemetry/otel-arrow/actions/workflows/rust-audit.yml) @@ -10,7 +10,7 @@ ## Overview -The OTAP Dataflow library is a set of core Rust crates which combine +The OTAP Dataflow Engine is a set of core Rust crates which combine to produce an OpenTelemetry pipeline support, for use as an embedded software component, providing a framework for collecting OpenTelemetry data. @@ -337,6 +337,7 @@ docker build --build-context otel-arrow=../../ -f Dockerfile -t df_engine . 
## Contributing - [Contribution Guidelines](CONTRIBUTING.md) +- [Internal Telemetry Guidelines](docs/telemetry/README.md) - Code of Conduct (TBD) Before submitting a PR, please run the following commands: diff --git a/rust/otap-dataflow/crates/otap/src/attributes_processor.rs b/rust/otap-dataflow/crates/otap/src/attributes_processor.rs index f36e2896fd..d0e4fd06e2 100644 --- a/rust/otap-dataflow/crates/otap/src/attributes_processor.rs +++ b/rust/otap-dataflow/crates/otap/src/attributes_processor.rs @@ -299,21 +299,12 @@ impl local::Processor for AttributesProcessor { _ => Ok(()), }, Message::PData(pdata) => { - if let Some(m) = self.metrics.as_mut() { - m.msgs_consumed.inc(); - } - // Fast path: no actions to apply if self.is_noop() { let res = effect_handler .send_message(pdata) .await .map_err(|e| e.into()); - if res.is_ok() { - if let Some(m) = self.metrics.as_mut() { - m.msgs_forwarded.inc(); - } - } return res; } @@ -354,16 +345,10 @@ impl local::Processor for AttributesProcessor { } } - let res = effect_handler + effect_handler .send_message(OtapPdata::new(context, records.into())) .await - .map_err(|e| e.into()); - if res.is_ok() { - if let Some(m) = self.metrics.as_mut() { - m.msgs_forwarded.inc(); - } - } - res + .map_err(|e| e.into()) } } } @@ -1171,8 +1156,6 @@ mod telemetry_tests { tokio::time::sleep(std::time::Duration::from_millis(50)).await; // Inspect current metrics; fields with non-zero values should be present - let mut found_consumed = false; - let mut found_forwarded = false; let mut found_renamed_entries = false; let mut found_deleted_entries = false; let mut found_domain_signal = false; @@ -1181,8 +1164,6 @@ mod telemetry_tests { if desc.name == "attributes.processor.metrics" { for (field, v) in iter { match (field.name, v.to_u64_lossy()) { - ("msgs.consumed", x) if x >= 1 => found_consumed = true, - ("msgs.forwarded", x) if x >= 1 => found_forwarded = true, ("renamed.entries", x) if x >= 1 => found_renamed_entries = true, ("deleted.entries", 
x) if x >= 1 => found_deleted_entries = true, ("domains.signal", x) if x >= 1 => found_domain_signal = true, @@ -1192,8 +1173,6 @@ mod telemetry_tests { } }); - assert!(found_consumed, "msgs.consumed should be >= 1"); - assert!(found_forwarded, "msgs.forwarded should be >= 1"); assert!(found_renamed_entries, "renamed.entries should be >= 1"); assert!(found_deleted_entries, "deleted.entries should be >= 1"); assert!(found_domain_signal, "domains.signal should be >= 1"); diff --git a/rust/otap-dataflow/crates/otap/src/attributes_processor/metrics.rs b/rust/otap-dataflow/crates/otap/src/attributes_processor/metrics.rs index 1ea9059960..3ff48134d7 100644 --- a/rust/otap-dataflow/crates/otap/src/attributes_processor/metrics.rs +++ b/rust/otap-dataflow/crates/otap/src/attributes_processor/metrics.rs @@ -10,14 +10,6 @@ use otap_df_telemetry_macros::metric_set; #[metric_set(name = "attributes.processor.metrics")] #[derive(Debug, Default, Clone)] pub struct AttributesProcessorMetrics { - /// PData messages consumed by this processor. - #[metric(unit = "{msg}")] - pub msgs_consumed: Counter, - - /// PData messages forwarded by this processor. - #[metric(unit = "{msg}")] - pub msgs_forwarded: Counter, - /// Number of failed transform attempts. 
#[metric(unit = "{op}")] pub transform_failed: Counter, diff --git a/rust/otap-dataflow/crates/otap/src/transform_processor.rs b/rust/otap-dataflow/crates/otap/src/transform_processor.rs index 3557443a2e..2afe7d082b 100644 --- a/rust/otap-dataflow/crates/otap/src/transform_processor.rs +++ b/rust/otap-dataflow/crates/otap/src/transform_processor.rs @@ -173,7 +173,6 @@ impl Processor for TransformProcessor { } }, Message::PData(pdata) => { - self.metrics.msgs_consumed.inc(); let (context, payload) = pdata.into_parts(); let payload = if !self.should_process(&payload) { // skip handling this pdata @@ -200,8 +199,7 @@ impl Processor for TransformProcessor { effect_handler .send_message(OtapPdata::new(context, payload)) - .await - .inspect(|_| self.metrics.msgs_forwarded.inc())?; + .await?; } }; @@ -347,17 +345,13 @@ mod test { // Allow the collector to pull from the channel tokio::time::sleep(std::time::Duration::from_millis(50)).await; - let mut msgs_consumed = 0; - let mut msgs_forwarded = 0; let mut msgs_transformed = 0; let mut msgs_transform_failed = 0; registry.visit_current_metrics(|desc, _attrs, iter| { - if desc.name == "transform.processor.metrics" { + if desc.name == "transform.processor" { for (field, v) in iter { let val = v.to_u64_lossy(); match field.name { - "msgs.consumed" => msgs_consumed += val, - "msgs.forwarded" => msgs_forwarded += val, "msgs.transformed" => msgs_transformed += val, "msgs.transform.failed" => msgs_transform_failed += val, _ => {} @@ -366,8 +360,6 @@ mod test { } }); - assert_eq!(msgs_consumed, 1); - assert_eq!(msgs_forwarded, 1); assert_eq!(msgs_transformed, 1); assert_eq!(msgs_transform_failed, 0) }); diff --git a/rust/otap-dataflow/crates/otap/src/transform_processor/metrics.rs b/rust/otap-dataflow/crates/otap/src/transform_processor/metrics.rs index a5fa7576a2..95d9b729b1 100644 --- a/rust/otap-dataflow/crates/otap/src/transform_processor/metrics.rs +++ b/rust/otap-dataflow/crates/otap/src/transform_processor/metrics.rs @@ 
-7,17 +7,9 @@ use otap_df_telemetry::instrument::Counter; use otap_df_telemetry_macros::metric_set; /// Metrics for the TransformProcessor node. -#[metric_set(name = "transform.processor.metrics")] +#[metric_set(name = "transform.processor")] #[derive(Debug, Default, Clone)] pub struct Metrics { - /// PData messages consumed by this processor. - #[metric(unit = "{msg}")] - pub msgs_consumed: Counter, - - /// PData messages forwarded by this processor. - #[metric(unit = "{msg}")] - pub msgs_forwarded: Counter, - /// Number of messages successfully transformed. #[metric(unit = "{msg}")] pub msgs_transformed: Counter, diff --git a/rust/otap-dataflow/crates/telemetry/README.md b/rust/otap-dataflow/crates/telemetry/README.md index 6496c03e20..9413f2f73e 100644 --- a/rust/otap-dataflow/crates/telemetry/README.md +++ b/rust/otap-dataflow/crates/telemetry/README.md @@ -1,5 +1,7 @@ # Telemetry SDK (schema-first, multivariate, NUMA-aware) +Status: draft under active development. + A low-overhead, NUMA-aware telemetry SDK that turns a declarative schema into a type-safe Rust API for emitting richly structured, multivariate metrics. It is designed for engines that run a thread-per-core and require predictable latency @@ -62,3 +64,6 @@ See the [telemetry-macros crate](../telemetry-macros) for details. - NUMA-aware aggregation. ![Architecture Phase 2](assets/Metrics%20Phase%202.svg) + +Note: The recent telemetry guidelines defined in `/docs/telemetry` are +still being implemented in this SDK. Expect changes and improvements over time. diff --git a/rust/otap-dataflow/docs/telemetry/README.md b/rust/otap-dataflow/docs/telemetry/README.md new file mode 100644 index 0000000000..042609a278 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/README.md @@ -0,0 +1,277 @@ +# Internal telemetry documentation and policy + +Status: **Draft** under active development. 
+ +## Scope + +This documentation applies to all telemetry produced by the OTAP dataflow engine +(runtime and its core libraries): + +- resource metadata (service, host, container, process) +- metrics +- events (structured logs with an event name) +- traces (when implemented) + +## Normative language + +The keywords MUST, SHOULD, and MAY are to be interpreted as normative +requirements in all the documentation within this directory. + +## Overview + +Internal telemetry is a first-class concern of this project. As with any complex +system, reliable operation, performance analysis, and effective debugging +require intentional and well-designed instrumentation. This document defines the +principles, guidelines, and implementation details governing the project's +internal telemetry. + +We follow an **observability by design** approach: observability requirements +are defined early and evolve alongside the system itself. All main entities or +components are expected to be instrumented consistently, using well-defined +schemas and conventions, so that emitted telemetry is coherent, actionable, and +suitable for long-term analysis and optimization. + +This approach is structured around the following lifecycle: + +1) **Set clear goals**: Define observability objectives up front. Identify which + signals are required and why. +2) **Automate**: Use tooling to derive code, documentation, tests, and schemas + from shared conventions. +3) **Validate**: Detect observability and schema issues early through CI and + automated checks, not in production. +4) **Iterate**: Refine telemetry based on real-world usage, feedback, and + evolving system requirements. + +Telemetry is treated as a **stable interface** of the system. As with any public +API, backward compatibility, semantic clarity, and versioning discipline are +essential. Changes to telemetry should be intentional, reviewed, and aligned +with the overall observability model. 
+ +See the [Stability and Compatibility Guide](stability-compatibility-guide.md) +for the stability model, compatibility rules, and deprecation process. + +## Goals + +Internal telemetry MUST enable: + +- reliable operation and incident response +- performance analysis and regression detection +- capacity planning and saturation detection +- change impact analysis (deploys, config reloads, topology changes) +- long-term trend analysis with stable schema and naming + +Telemetry MUST NOT compromise: + +- system safety and correctness +- performance budgets on hot paths +- confidentiality (PII, secrets, sensitive payloads) + +## Core principles + +The principles below define how internal telemetry is designed, implemented, +validated, and evolved in this project. They are intentionally opinionated and +serve as a shared contract between contributors, tooling, and runtime behavior. + +### 1. Schema-first + +All telemetry is defined **schema-first**. Entities, signals, attributes, and +their relationships MUST be described explicitly in a schema before or alongside +their implementation. + +Schemas are treated as versioned artifacts and as the primary source of truth +for: + +- instrumentation requirements, +- validation rules, +- documentation generation, +- and client SDK generation. + +Ad hoc or implicit telemetry definitions are discouraged, as they undermine +consistency, tooling, and long-term maintainability. + +### 2. Entity-centric + +Telemetry is modeled around **entities**, which represent stable, identifiable +subjects of observation. Signals describe the state, behavior, or performance of +one or more entities at a given point in time. + +This project favors: + +- clear separation between **entity attributes** (stable context) and + **signal-specific attributes** (dynamic context), +- bounded and well-justified attribute cardinality, +- stable identifiers to support correlation across signals, restarts, and + system boundaries. 
+ +Entity modeling is a prerequisite for producing telemetry that is interpretable, +composable, and operationally useful at scale. + +### 3. Type-safe and performance-focused instrumentation + +The telemetry SDK is **type-safe by construction** and **performance-aware**. + +Instrumentation APIs should: + +- prevent invalid or non-compliant telemetry at compile time whenever possible, +- minimize overhead on hot paths, +- avoid unnecessary allocations and dynamic behavior, +- make the cost of instrumentation explicit and predictable. + +Correctness, efficiency, and safety take precedence over convenience. + +### 4. Runtime-safe and failure-resilient telemetry + +Telemetry is designed to be runtime-safe and failure-resilient by default. + +Instrumentation and telemetry pipelines MUST be non-fatal, bounded, and isolated +from critical execution paths. + +Specifically: + +- Export failures MUST NOT break or stall the dataflow engine. +- Telemetry pipelines MUST use bounded buffers. +- Under pressure, the default behavior SHOULD be to drop telemetry rather than + block critical work. +- Drops SHOULD be observable via counters, categorized by drop reason, and + optionally via debug events. + +The telemetry system MUST NOT introduce deadlocks, unbounded memory growth, or +process termination. Similar principles have been established in the official +OpenTelemetry documentation +for [error handling](https://opentelemetry.io/docs/specs/otel/error-handling/#basic-error-handling-principles). + +### 5. Alignment with OpenTelemetry semantic conventions + +This project adopts **OpenTelemetry semantic conventions** as the baseline +vocabulary and modeling framework. + +Where existing conventions are sufficient, they are reused directly. Where +project-specific concepts are required, they are defined in a **custom semantic +convention registry**, aligned with OpenTelemetry principles and formats. 
+ +This registry formally describes: + +- the entities relevant to the project, +- the signals emitted by the system, +- the allowed attributes, types, units, and stability guarantees. + +### 6. First-class support for multivariate metrics + +The internal telemetry model and SDK natively support **multivariate metric +sets**. + +This enables: + +- efficient sharing of attribute tuples, +- coherent modeling of related measurements, +- reduced duplication explosion compared to naive univariate metrics. + +Multivariate metrics are treated as a fundamental modeling capability rather +than a post-processing optimization. + +Note: OTLP and OTAP protocols do not yet have first-class support for +multivariate metrics. The SDK and exporters handle the necessary translation and +encoding. We plan to contribute multivariate support to OpenTelemetry protocols +in the future. In the meantime, this project serves as a proving ground for the +concept. + +### 7. Tooling-driven validation and documentation with Weaver + +Telemetry correctness and completeness are enforced through **tooling, not +convention alone**. + +This project plans to integrate with **Weaver** to: + +- validate emitted telemetry against the versioned semantic convention registry, +- perform registry compliance checks in CI, +- execute live checks during tests to ensure that expected signals are actually + produced, +- generate authoritative documentation in Markdown or HTML from the registry. + +An administrative endpoint exposes the live, resolved schema at runtime to +support inspection, debugging, and tooling integration. + +Security and deployment guidance for this endpoint is in the +[Security and Privacy Guide](security-privacy-guide.md). + +Registry compliance checks and live checks are not yet enforced in CI. See +[Implementation Gaps](implementation-gaps.md). + +### 8. 
Automated client SDK generation (longer term) + +In the longer term, the custom semantic convention registry will be used to +generate **schema-driven, type-safe, and optimized Rust client SDKs** via +Weaver. Meanwhile, we built a type-safe Rust instrumentation API on top of the +standard client SDK. + +The objective is to: + +- eliminate manual duplication between schema and code, +- ensure strict alignment between instrumentation and specification, +- provide contributors with safe, ergonomic APIs that encode observability + rules directly in types. + +This is considered a strategic investment and will be introduced incrementally. + +### 9. Telemetry as a stable interface + +Telemetry is treated as a **stable interface of the system**. +Refer to [Stability and Compatibility Guide](stability-compatibility-guide.md). + +## Instrumentation guides + +**Instrumentation** is the act of adding telemetry signals (metrics, events, +traces) to the codebase to observe the system behavior and performance. + +The [entity model](entity-model.md) defines the observed things, the "nouns" of +our system, and how signals describe them. Entities are described by attributes +that provide context to metrics, events, and traces, and a single signal can +involve multiple entities at once. **Attribute cardinality must be bounded** to +keep telemetry efficient and aggregations meaningful. Identifier stability +matters for correlation across signals and restarts; refer to the stability +guarantees in the entity model when adding new attributes. + +The naming conventions, units and general guidelines are in the +[semantic conventions guide](semantic-conventions-guide.md). Contributors SHOULD +read it before introducing new telemetry. + +The guides below provide a framework for defining **good, consistent, secure, +and actionable signals**. 
They are not an exhaustive list of every signal and +attribute in the project, but a shared reference for how to introduce and evolve +telemetry: + +- [Attributes Guide](attributes-guide.md) +- [System Metrics Guide](metrics-guide.md) +- [System Events Guide](events-guide.md) +- [System Traces Draft - Not For Review](tracing-draft-not-for-review.md) +- [Stability and Compatibility Guide](stability-compatibility-guide.md) +- [Security and Privacy Guide](security-privacy-guide.md) + +## Implementation details + +For implementation details of the telemetry SDK, including macros, schema +handling, and the dataflow for metric collection, see the +[telemetry implementation description](/crates/telemetry/README.md). + +Note: This SDK is internal to the project and optimized for our use cases. It is +not intended for public use (at least not yet). It may change without notice. + +The documentation in this directory focuses on the intended design and policy +aspects of internal telemetry. The current implementation does not yet fully +realize all goals and principles described here, but it is evolving rapidly. +The [implementation gaps](implementation-gaps.md) document tracks the progress. + +## Contributor workflow + +When adding or changing telemetry: + +1) Update the semantic convention registry first (schema-first). +2) Regenerate documentation and code (when applicable). +3) Run CI validation when available (registry checks, live checks in tests). +4) If the change is breaking, bump the registry version and add a migration + note. + +Implementation status of this workflow (what is enforced, generated, and +validated) is tracked in [implementation-gaps.md](implementation-gaps.md). +Coordinate with maintainers when making changes that are not yet +tooling-supported. 
diff --git a/rust/otap-dataflow/docs/telemetry/attributes-guide.md b/rust/otap-dataflow/docs/telemetry/attributes-guide.md new file mode 100644 index 0000000000..20599baa02 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/attributes-guide.md @@ -0,0 +1,163 @@ +# Attributes guide + +Status: Draft + +This document consolidates project decisions and guidance on attributes: +naming, placement, cardinality, normalization, and lifecycle. + +It complements: + +- [semantic-conventions-guide.md](semantic-conventions-guide.md) for upstream + naming rules +- [entity-model.md](entity-model.md) for the entity attribute sets and + relationships +- [stability-compatibility-guide.md](stability-compatibility-guide.md) for + evolution rules +- [security-privacy-guide.md](security-privacy-guide.md) for sensitive-data + constraints + +## Attribute categories + +Attributes fall into three categories. + +### 1) Resource attributes + +Describe the producing service and runtime environment. + +- MUST be attached at the resource level +- MUST NOT be duplicated on every signal +- SHOULD reuse upstream semantic conventions (`service.*`, `host.*`, + `process.*`, `container.*`) + +### 2) Entity attributes + +Identify stable in-process entities (pipelines, nodes, channels, runtime +threads). + +- MUST be attached/translated as scope attributes in OTLP exports +- MUST NOT be duplicated on every signal +- MUST be stable for the lifetime of the entity +- MUST be bounded and known at entity creation time +- MUST be the foundation of metric set identity for core system telemetry + +### 3) Signal-specific attributes + +Provide additional bounded context needed to interpret a measurement or event +occurrence. + +- MAY be used when required for interpretation +- MUST be bounded and documented +- MUST remain meaningful under aggregation (metrics) and filtering (events) + +## Naming and namespaces + +### Reuse upstream first + +- Reuse existing OpenTelemetry semantic attributes whenever possible. 
+- Do not redefine upstream attributes with different meaning. + +### Project-defined namespace + +Project-defined entity attributes MUST be namespaced to avoid collisions with +upstream conventions. + +Policy: + +- Use `otelcol.*` for project-defined attributes. +- Do not introduce new un-prefixed top-level namespaces for custom entities. + +### Closed sets (enums) + +When an attribute represents a categorical dimension: + +- The value set MUST be a documented closed set. +- Values MUST be lowercase and stable. +- Avoid synonyms that fragment cardinality (`fail` vs `error` vs `failed`). + +Adding enum values for stable telemetry follows the compatibility rules in +[stability-compatibility-guide.md](stability-compatibility-guide.md). + +## Placement rules + +- Resource attributes belong on the resource attributes. +- Entity attributes belong on the scope attributes. +- Signal-specific attributes belong only where they apply, and must be bounded. + +Do not duplicate information: + +- If a value is already present as an entity attribute, do not repeat it as a + signal-specific attribute. +- Prefer a single canonical key. + +### Core rule + +Attributes attached to core system metrics MUST have bounded cardinality. + +Before adding an attribute, ask: + +- If I aggregate across this attribute, does the result still make sense? +- Is the value space bounded and known at entity creation time? + +If not, the attribute is mis-modeled for core system metrics. 
+ +### Prohibited by default in core system metrics + +The following are prohibited as metric attributes unless explicitly approved and +normalized: + +- user_id, session_id, request_id +- raw URL path, raw query string +- raw SQL, raw error messages, unbounded file paths +- unbounded plugin configuration values + +Important note: This restriction may be relaxed in the future through an +explicit opt-in mechanism, for example to allow controlled propagation of +selected attributes from baggage into metrics or logs under well-defined +processing rules. + +### Normalization patterns + +When context is useful but high cardinality, normalize: + +- URL path -> route template +- SQL query -> normalized fingerprint +- IP address -> prefix or bucket +- error message -> error class or error type + +## Errors and exceptions + +### Error classification + +Prefer low-cardinality classification: + +- Use `error.type` (or an equivalent stable classifier) when applicable. +- Avoid raw error messages as attributes in stable telemetry. + +### Exceptions + +When recording an actual exception: + +- Use `exception.type` and `exception.message`. +- `exception.stacktrace` must follow + [security-privacy-guide.md](security-privacy-guide.md) gating rules. + +## Attributes vs event body + +This project distinguishes between queryable attributes and potentially large +bodies: + +- Put small, queryable fields in attributes. +- Put large payloads in the body only when strictly required. +- Do not duplicate the same data in both places. + +## Checklist + +When introducing a new attribute: + +- It is categorized (resource, entity, signal-specific). +- It reuses upstream semantic attributes when available. +- If project-defined, it uses the `otelcol.*` namespace. +- Cardinality is bounded and documented. +- For enums, the closed set is documented and stable. +- It follows security and privacy rules (no sensitive data). +- If stable, the change follows the compatibility rules. 
diff --git a/rust/otap-dataflow/docs/telemetry/entity-model.md b/rust/otap-dataflow/docs/telemetry/entity-model.md new file mode 100644 index 0000000000..cfc34b198b --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/entity-model.md @@ -0,0 +1,184 @@ +# OTAP dataflow engine entity model + +## Introduction + +This document describes the entity model used by this project to organize and +categorize the collected telemetry data. + +As a reminder, an entity is a stable, identifiable subject of observation to +which telemetry signals relate. OpenTelemetry SDKs and protocols do not define +a first-class Entity object. Instead, entities are implicitly modeled through +sets of attributes. OpenTelemetry Semantic Conventions can be used to define the +entities that exist for a given project. A single signal can involve multiple +entities. + +| Concept | Role | +|------------|-----------------------------------------------| +| Entity | The observed thing | +| Signal | The observation | +| Attributes | Properties describing the entity or the event | + +Note: This document will be replaced by formal OpenTelemetry Semantic +Conventions in the future. For now, it serves as an internal reference for the +project. + +## Related guides + +- Attribute policy: [attributes-guide.md](attributes-guide.md) +- Stability + rules: [stability-compatibility-guide.md](stability-compatibility-guide.md) +- Implementation status: [implementation-gaps.md](implementation-gaps.md) + +## Attribute ownership + +- Resource attributes describe the producing service/process/host/container and + MUST be attached at the resource level. +- Entity attributes identify in-process entities (pipelines, nodes, channels) + and MUST be stable for the entity lifetime. +- Signal-specific attributes (when used) MUST be bounded and documented + alongside the signal. + +Project-specific entity attributes use the `otelcol.*` prefix to avoid +collisions with existing and future semantic conventions. 
+ +## Project entities + +### Service + +The logical service representing the OTAP Engine. + +Attributes (resource level): + +- `service.name`: The name of the service (e.g. "otap_engine"). +- `service.instance.id`: A unique identifier for the service instance. + +### Host + +The physical or virtual machine where the OTAP Engine is running. + +Attributes (resource level): + +- `host.id`: A unique identifier for the host machine. +- `host.name`: The hostname of the machine. + +### Container + +The container instance where the OTAP Engine is running (if applicable). + +Attributes (resource level): + +- `container.id` + +### Process + +The process instance of the OTAP Engine running on the host or in the container. + +Attributes (resource level): + +- `process.pid` +- `process.creation.time` + +### OTAP execution engine + +The OTAP pipeline execution engine running in the process. + +Attributes: + +- `otelcol.numa_node.logical_number`: NUMA node identifier. +- `cpu.logical_number` (was named core.id): Core CPU identifier. +- `thread.id`: Thread identifier. + +### Pipeline + +A data processing pipeline running within the OTAP Execution Engine. + +Attributes: + +- `otelcol.pipeline_group.id`: Pipeline group unique identifier. +- `otelcol.pipeline.id`: Pipeline unique identifier. + +### Node + +A processing unit within a pipeline. There are three types of nodes: + +- Receiver: Ingests and translates data from external sources +- Processor: Transforms, filters, batches, or enriches data +- Exporter: Delivers processed data to external systems + +Attributes: + +- `otelcol.node.id`: Node unique identifier (in scope of the pipeline). +- `otelcol.node.urn`: Node plugin URN. +- `otelcol.node.type`: Node type (e.g. "receiver", "processor", "exporter"). + +### Channels + +Channels connect nodes within a pipeline. There are two types of channels: + +- Control Channel: Used for orchestration commands (e.g. 
config_update, ack, + timer_tick, shutdown) +- PData Channel: Used for ingesting batches of telemetry signals (metrics, logs, + events, spans) + +Channels are observed via two endpoint perspectives: sender and receiver. + +- Sender-side signals attach the sender node identity plus `otelcol.channel.*` + attributes. +- Receiver-side signals attach the receiver node identity plus + `otelcol.channel.*` attributes. +- `otelcol.channel.id` connects sender and receiver signals that belong to the + same channel. + +Attributes: + +- `otelcol.channel.id`: Unique channel identifier (in scope of the pipeline). +- `otelcol.channel.kind`: Channel payload kind ("control" or "pdata"). +- `otelcol.channel.mode`: Concurrency mode of the channel ("local" or "shared"). +- `otelcol.channel.type`: Channel type ("mpsc", "mpmc", "spsc", "spmc"). +- `otelcol.channel.impl`: Channel implementation ("tokio", "flume", "internal"). +- `otelcol.channel.sender.out.port`: Output port of the sender node. + +The `otelcol.channel.id` format depends on the channel kind: + +- Control Channel: `control:{node.id}` +- PData Channel: `pdata:{source_node.id}:{out_port}` + +## Stability and identity guarantees + +Unless noted otherwise, identifiers are stable for the lifetime of their entity +and may change on restart or reconfiguration. + +- `service.instance.id`: Unique per process start (changes on restart). +- `service.name`: Stable per deployment; not guaranteed unique. +- `host.name`: Human-readable hostname; not guaranteed globally unique and may + change if the host is renamed. +- `container.id`: Stable for the container lifetime. +- `process.pid`, `process.creation.time`: Stable for the process lifetime. +- `otelcol.numa_node.logical_number`, `cpu.logical_number`: Stable for a host + boot; may change with CPU or NUMA reconfiguration. +- `thread.id`: Stable for the thread lifetime; may be reused after thread exit. 
+- `otelcol.pipeline_group.id`, `otelcol.pipeline.id`, `otelcol.node.id`: Stable + across configuration reloads; intended to remain consistent for the same + logical pipeline graph. +- `otelcol.channel.id`: Identifies the source + output port only and is stable + across configuration reloads as long as the source node id and port are + unchanged. +- `otelcol.channel.sender.out.port`: Stable across configuration reloads for a + given pipeline graph. + +## Entity relationships + +Relationships are implicit and expressed through co-located attribute sets on +the same signal. The entity model can be read as a containment chain plus a DAG +of channels. + +Containment chain: +Service -> Process -> Execution Engine -> Pipeline Group -> Pipeline -> Node + +Channels connect nodes: + +- `otelcol.channel.id` identifies the source node + output port only; fan-out + receivers share the same `otelcol.channel.id`. +- Node identity is carried by the `otelcol.node.*` attributes on each signal. +- Endpoint role is implied by the metric set (e.g. `channel.sender` vs + `channel.receiver`), not by a channel attribute. diff --git a/rust/otap-dataflow/docs/telemetry/events-guide.md b/rust/otap-dataflow/docs/telemetry/events-guide.md new file mode 100644 index 0000000000..f096e39a67 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/events-guide.md @@ -0,0 +1,169 @@ +# System events guide + +This guide defines how to add **system events** for the OTAP engine. It +complements the [semantic conventions guide](semantic-conventions-guide.md) and +the [entity model](entity-model.md). 
+ +## Related guides + +- Attribute policy (including attributes vs event body guidance): + [Attributes Guide](attributes-guide.md) +- Stability model and compatibility rules for event schemas: + [Stability and Compatibility Guide](stability-compatibility-guide.md) +- Sensitive data and stacktrace gating: + [Security and Privacy Guide](security-privacy-guide.md) + +## What events are for + +Events are discrete occurrences that benefit from context and correlation but do +not need to be aggregated as metrics. In OTLP, the event name MUST be carried in +the LogRecord `event_name` field. Do not introduce new telemetry that sets +`event.name` as an attribute. + +Use events to record: + +- Controller/Pipeline actions (config reload, shutdown, ack, timer ticks). +- State transitions (batch flush, backpressure, queue full). +- Exceptional outcomes (errors, retries, drops). + +If the signal is high-volume or needs aggregation, prefer metrics. If the +event is part of a dataflow trace, use a regular event with a trace ID, not a +span event record, as span events are +becoming [deprecated](https://github.com/open-telemetry/opentelemetry-specification/blob/main/oteps/4430-span-event-api-deprecation-plan.md). + +Exception rule (traces): + +- If you are recording an actual exception on a span, the regular event name + MUST be `exception` and the standard exception attributes MUST be used. + +In this project, events are preferred to unstructured logs. Event names are +codified (see below), and their attributes consist of the attributes of the +relevant entity or entities (stable context), combined with event-specific +attributes (dynamic context). + +Treat event names as schema identifiers. Evolution rules are defined in +[Stability and Compatibility Guide](stability-compatibility-guide.md). + +## Event naming + +Event names MUST be low-cardinality and stable. Follow the semantic conventions +guide for naming: + +- Lowercase and dot-separated. 
It identifies a class of event, not an instance. +- Keep the name stable and "type-like". Treat it as a schema identifier. +- Use verbs for actions (e.g. `pipeline.config.reload`). +- Avoid embedding IDs or dynamic values in the name. Encode variability as + attributes. +- Avoid synonyms that fragment cardinality across names (`finish` vs `complete`, + `error` vs `fail`). Pick one verb set and stick to it. + +More precisely, in this project, event names SHOULD follow this pattern: +`otelcol.<entity>[.<stage>].<verb>` + +Where: + +- `otelcol.` is the project prefix/namespace used for events and other custom + telemetry. +- `<entity>` is the primary entity involved (e.g. `pipeline`, `node`, + `channel`). See the [entity model](entity-model.md) for the list of entities. +- `<stage>` is an optional sub-entity, subject, or stage (e.g. `build`, `run`, + `receiver`, `exporter`). +- `<verb>` is the action or occurrence (e.g. `start`, `complete`, `fail`, + `reload`, `shutdown`). + +Note: OpenTelemetry Events are represented as LogRecords with an event name. +In OTLP, this is carried in the LogRecord `event_name` field (not in the body). + +## Attributes and context + +Always attach the relevant entity attributes (stable context): + +- Pipeline attributes for pipeline-level events. +- Node attributes for node-level events. +- Channel attributes for channel-related events. + +Optionally, add occurrence-specific attributes (dynamic context): + +- Prefer enums or stable categorical values whenever possible. +- Use standard exception attributes for errors (`exception.type`, + `exception.message`). Stacktrace gating rules are in + [Security and Privacy Guide](security-privacy-guide.md). +- Follow [Security and Privacy Guide](security-privacy-guide.md) to avoid + sensitive data. + +## Severity and placement + +When events are exported as logs, set an appropriate severity. + +Regarding severity, some events may be logged at different levels depending on +their severity or impact.
For example, a `node.shutdown` event may be logged at +INFO level during a graceful shutdown, but at ERROR level if the shutdown is due +to a critical failure. When exporting events as logs, choose the log level that +best reflects the significance of the event. + +## Stages + +The following stages are recommended for event names: + +- `pipeline`: + - `build`: Pipeline construction phase. + - `run`: Pipeline execution phase. + - `report`: Pipeline metrics reporting phase. +- `node`: + - `build`: Node construction phase. + - `run`: Node execution phase. +- `channel`: + - `send`: Channel send phase. + - `recv`: Channel receive phase. + +This list is not exhaustive. Choose stages that best describe the context while +maintaining clarity and consistency. + +## Verbs + +The following verbs are recommended for event names: + +- `create`: The creation of an entity or resource. +- `init`: The initialization of an entity or resource. +- `start`: The beginning of an operation or process. +- `complete`: The successful end of an operation or process. +- `fail`: An operation or process that ended with an error. +- `stop`: The beginning of a stop or shutdown process. +- `pause`: The pausing of an operation or process. +- `resume`: The resumption of an operation or process. +- `apply`: An application of configuration or state. +- `flush`: A batch or buffer flush. +- `drop`: A drop occurrence. +- `backpressure`: A backpressure occurrence. +- `retry`: A retry attempt. +- `ack`: An acknowledgment occurrence. +- `nack`: A negative acknowledgment occurrence. +- `tick`: A timer tick occurrence. +- `sleep`: A sleep occurrence. +- `cancel`: An operation was intentionally stopped by an external decision + before it finished. Triggered by a caller, operator, controller, or policy. + Usually expected and often benign. Not an error in itself. +- `abort`: An operation was forced to stop due to an internal safety condition + or unrecoverable state. Triggered inside the system. 
Indicates something went + wrong or became unsafe. Usually unexpected. +- `timeout`: A timeout occurrence. + +This list is not exhaustive. Choose verbs that best describe the action while +maintaining clarity and consistency. Avoid synonyms that fragment cardinality +across names. Don't introduce alternatives such as `finish` or `error`. Use +one success verb `complete`, one failure verb `fail`, one external +termination verb `cancel`, and one internal safety verb `abort`. + +## Checklist for new events + +- The event name follows the semantic conventions guide and the + `otelcol.<entity>[.<stage>].<verb>` pattern. +- The event name is stable, low-cardinality, and contains no IDs or dynamic + values. +- The event represents a discrete occurrence; use metrics instead for + high-volume signals. +- Relevant entity attributes are included (pipeline/node/channel/etc). +- Dynamic attributes are bounded and avoid sensitive or high-cardinality data. +- Error events use standard exception attributes; stacktraces only at debug or + lower. +- Severity is appropriate and consistent with the event meaning. diff --git a/rust/otap-dataflow/docs/telemetry/implementation-gaps.md b/rust/otap-dataflow/docs/telemetry/implementation-gaps.md new file mode 100644 index 0000000000..03d3374ca2 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/implementation-gaps.md @@ -0,0 +1,59 @@ +# Implementation gaps + +Status: Draft + +This document consolidates known gaps between the telemetry documentation and +what is currently implemented or enforced. + +Goal: + +- Keep all "not yet implemented" notes and process gaps in one place +- Avoid sprinkling implementation status across multiple guides + +## How to use this document + +- Treat the other guides as the intended design and policy. +- Treat this document as the current status tracker. +- When a gap is closed, remove it here and update any affected guide text if + needed.
+ +## Gaps and open work + +### Signals and data model + +| Area | Gap | Impact | +|----------------------|---------------------------------------------------------|----------------------------------------------------------| +| Metrics | Histograms not supported yet | Limits latency and size distributions | +| Metrics | Bounded signal-specific metric attributes not supported | Limits modeling of small enum dimensions on core metrics | +| Multivariate metrics | OTLP and OTAP lack first-class multivariate metric sets | Limits protocol efficiency; some semantics may be lossy | +| Tracing | Traces not implemented (draft only) | Limits end-to-end causality and latency debugging | + +### Resource identity and entity attributes + +| Area | Gap | +|------------------|-------------------------------------------------------------------| +| Service identity | `service.name` not set everywhere | +| Service identity | `process.instance.id` used instead of `service.instance.id` | +| Execution engine | `thread.id` not set | +| Execution engine | `core.id` used instead of `cpu.logical_number` | +| Execution engine | `numa.node.id` used instead of `otelcol.numa_node.logical_number` | +| Channels | `otelcol.channel.sender.out.port` not set | +| Channels | Channel id format not enforced | + +### Tooling and process + +| Area | Gap | Impact | +|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------| +| Validation | Registry compliance checks and live checks not covered | Drift between schema and emitted telemetry | +| Stability | Stability level not declared for all signals | Hard to apply compatibility discipline | +| Deprecation | Migration windows and dual emission not 
implemented | Breaking changes may slip into stable telemetry | +| SDK generation | Automated client SDK generation not implemented yet | Manual duplication between schema and code | +| Registry files | Semantic convention registry is not yet available as files; definitions live in macros. A set of files will be created and an optimized client SDK will be generated to mimic the current macro-based SDK, giving us time to refine the optimized SDK code before generating it from a registry. | Limited tooling and visibility until the registry files exist | + +### Open questions + +| Topic | Question | +|----------------------------|-----------------------------------------------------------------------| +| Bounded dynamic attributes | How do we implement them? | +| Metrics endpoint | What is the default deployment posture (off by default vs protected)? | +| Schema endpoint | What is the default deployment posture (off by default vs protected)? | diff --git a/rust/otap-dataflow/docs/telemetry/metrics-guide.md b/rust/otap-dataflow/docs/telemetry/metrics-guide.md new file mode 100644 index 0000000000..88e64b6b25 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/metrics-guide.md @@ -0,0 +1,181 @@ +# System metrics guide + +This guide defines how to add and evolve system metrics for the OTAP dataflow +engine. It complements +the [semantic conventions guide](semantic-conventions-guide.md) +and the [entity model](entity-model.md). + +System metrics are intended to describe the behavior of stable entities over +time. This document summarizes the patterns we follow when instrumenting system +metrics in the engine. + +In this documentation, core system metrics/telemetry refers to telemetry used +to operate a system in a reliable way and to understand the behavior of the +main entities/components of the observed system. It is not for product +analytics or business telemetry. 
+ +## Related guides + +- Attribute policy: [attributes-guide.md](attributes-guide.md) +- Stability + rules: [stability-compatibility-guide.md](stability-compatibility-guide.md) +- Implementation status: [implementation-gaps.md](implementation-gaps.md) + +## Entity-centric modeling + +Start by naming the entity the metric describes. A metric set should map to a +single entity type (pipeline, node, channel sender, channel receiver, runtime +thread, and so on). Metric identity should remain stable while values evolve. + +Examples of stable entities: + +- CPU core, NUMA node, runtime thread +- Pipeline, node, channel endpoint (sender or receiver) +- Queue, buffer, connection pool + +### Entity vs event vs request + +Metrics are for entity behavior, not request identity. + +- Events such as reloads, errors, or state changes are better captured as + events and can be counted with metrics only when attributes stay stable. +- Requests and transactions are high-cardinality and short-lived. Use traces, + events, or exemplars instead of encoding request identifiers in metrics. +- Prefer metrics when the signal is high volume or when trends matter more than + individual occurrences. Use events or traces for discrete, low-volume + occurrences. + +## Metric and metric set + +*Metrics* in this project use the instrument types supported by our internal +telemetry SDK (see [crates/telemetry](/crates/telemetry/README.md) for details): + +- Counter: monotonic counts of events or outcomes, recorded as deltas. +- UpDownCounter: signed deltas that can increase or decrease over time. +- ObserveCounter: monotonic counts recorded as observed cumulative values. +- ObserveUpDownCounter: observed values that may go up or down. +- Gauge: instantaneous measurements (last-value), used for capacity, + utilization, queue depth. + +Histogram support status is tracked in +[Implementation Gaps](implementation-gaps.md). 
+ +ObserveUpDownCounter and Gauge both report values that can rise or fall, but +they aggregate differently. + +- A Gauge uses last-value aggregation, +- An ObserveUpDownCounter is a sampled cumulative value that aggregates by + summing deltas over time. + +In this project, ObserveUpDownCounter is used for observed totals like +`otelcol.pipeline.metrics.memory_usage` and +`otelcol.tokio.runtime.task_active_count`, while Gauge is used for instantaneous +values like `otelcol.pipeline.metrics.cpu_utilization` and +`channel.receiver.capacity`. + +Guideline: + +- Use Gauge for point-in-time levels (queue depth, active tasks, memory in use). +- Use (Observe)Counter for counts (items processed, drops). +- Use ObserveUpDownCounter only when you have a strong reason to preserve the + "observed cumulative" interpretation across collection intervals. + +A *metric set* is a collection of metrics related to a single entity being +observed. That entity often belongs to a larger system of entities, so metric +set attributes are usually a composition of multiple entity attributes (for +example, resource + engine + pipeline + node + channel). All metrics in a set +share the same attribute set, which contains only entity-related attributes. In +this project, core metrics prioritize entity identity. However, bounded +signal-specific attributes MAY be used when they are necessary to interpret the +measurement (for example, a small enum such as a "state" dimension). When used, +signal-specific attributes MUST be: + +- bounded and documented as a closed set +- meaningful under aggregation +- preferably namespaced under the metric namespace as recommended by OTel naming + guidance + +Support status for bounded signal-specific attributes is tracked in +[implementation-gaps.md](implementation-gaps.md). + +Metric naming must follow the +[semantic conventions guide](semantic-conventions-guide.md). Descriptions and +units are mandatory. 
Units must follow UCUM conventions and use braces notation +only for annotation units (e.g. `{batch}`, `{signal}`). See the [Units](#units) +section below for details. + +Metric set naming should follow the pattern `otelcol.<entity>` or +`otelcol.<entity>.<sub_entity>` when applicable. Examples of metric sets in this +project: + +- For generic entities: + - `otelcol.pipeline`, `otelcol.node` + - `otelcol.channel.sender`, `otelcol.channel.receiver` + - ... +- For specific node types: + - `otelcol.node.retry` + - `otelcol.node.batch` + - `otelcol.node.otlp_receiver` + - `otelcol.node.otlp_exporter` + - ... + +## Attributes and entity context + +Metric attributes MUST follow the project-wide attribute policy in +[Attributes Guide](attributes-guide.md). + +Metric-specific rule: attributes attached to core system metrics MUST remain +meaningful under aggregation. + +Normalization patterns are documented in +[Attributes Guide](attributes-guide.md). + +## Units + +Units must be specified for every metric as part of its metadata. They must +follow UCUM conventions and use braces notation only for annotation units. + +The most common units in this project are: + +- Named units: + - `By`: bytes + - `s`: seconds (preferred over `ms` for time durations) +- Annotation units: + - `{batch}`: batches of telemetry signals + - `{signal}`: individual telemetry signals (metrics, logs, traces) + - `{metric}`: individual metric data points + - `{log}`: individual log records + - `{event}`: individual event records (log with an event name) + - `{span}`: individual trace spans + +## Performance considerations + +Metric sets are optimized for low overhead: + +- The same attribute set is shared across all metrics in a metric set. +- A metric set instance registers its attributes once during setup, and the + collection phase reports only scalar values. +- On the hot path, we increment or set values in pre-allocated non-atomic slots, + avoiding dynamic lookups and allocations.
+- Metric sets are per-core to avoid cross-core contention, and the cold path + (flush, aggregate, encode) is NUMA-aware and batch-oriented. +- Reset-on-flush and sparse enumeration minimize work by touching only non-zero + fields and dirty counters. + +More details about the telemetry SDK implementation are in +[crates/telemetry](../../crates/telemetry/README.md). + +## Metric stability and compatibility + +Metrics and metric sets MUST follow the stability model in +[stability-compatibility-guide.md](stability-compatibility-guide.md). + +### Checklist for new metrics + +- The metric name follows the semantic conventions guide. +- The instrument type matches the intended meaning. +- Units are specified and valid. +- Attributes are stable and cardinality is bounded. +- The metric can be interpreted using the entity model attributes. +- Failure-oriented metrics SHOULD include a low-cardinality error classifier + when applicable (`error.type`). diff --git a/rust/otap-dataflow/docs/telemetry/security-privacy-guide.md b/rust/otap-dataflow/docs/telemetry/security-privacy-guide.md new file mode 100644 index 0000000000..1869f34c49 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/security-privacy-guide.md @@ -0,0 +1,135 @@ +# Security and privacy guide + +Status: Draft + +This document defines security and privacy constraints for internal telemetry. + +Telemetry is a high leverage system: it can accidentally become a data +exfiltration path, leak secrets, or expose sensitive topology. These rules are +designed to prevent that. + +## Principles + +- Telemetry MUST NOT include secrets, credentials, or sensitive personal data. +- Telemetry SHOULD include only what is required for operations and debugging. +- Telemetry MUST be safe under failure and safe under adversarial inputs. +- Access to telemetry SHOULD follow least privilege. 
+ +## Data that must never appear in telemetry + +The following MUST NOT be recorded in metrics, events, traces, or attributes: + +- credentials, API keys, bearer tokens, cookies, session secrets +- private keys, certificates, signing material +- raw customer payloads or unredacted message bodies +- email addresses, phone numbers, full names, physical addresses +- full IP addresses if they can identify individuals (normalize or bucket + instead) +- raw URLs containing query parameters or user-provided path segments +- raw SQL queries, raw stack traces in high volume contexts (see Exceptions) + +If in doubt, do not emit it. + +Important note: The system must allow users, if they choose to do so, to log +certain sensitive data (e.g. `user_id`) only when it is gated behind an explicit +debug mode. + +## Allowed data with constraints + +The following are generally allowed if they remain bounded and non-sensitive: + +- stable internal IDs for entities (pipelines, nodes, channels), as defined in + the entity model +- categorical outcomes and reasons (closed enums), for example drop reasons +- normalized forms of user input, for example route templates instead of raw + paths +- bounded numeric values describing system state and performance + +## Normalization and redaction + +When context is useful but high cardinality or sensitive, normalize: + +- URL path -> route template +- SQL -> normalized fingerprint +- IP address -> prefix or bucket +- error message -> error class or error type + +Do not emit raw content that can include secrets or user identifiers. + +## Exceptions and stack traces + +Exceptions often include sensitive data. Rules: + +- Use structured exception attributes (e.g. `exception.type`) when needed. +- `exception.message` MUST NOT include sensitive data or raw user input. +- `exception.stacktrace` SHOULD be gated behind: + - debug severity, or + - an explicit configuration flag +- Stack traces MUST NOT be emitted on hot paths by default.
+ +## Events, body size, and spill risk + +Events are exported as logs. + +- Prefer small, queryable fields in attributes. +- Large payloads SHOULD go into the event body only when strictly required. +- Do not emit unbounded or repetitive bodies at high volume. + +Recommended practice: + +- Keep event bodies small and bounded. +- When details are required, emit a stable error type in attributes and keep the + long detail behind debug-level gating. + +## Trace correlation + +When exporting events as logs and trace context exists: + +- include trace correlation (trace id and span id) so operators can pivot +- do not copy trace ids into custom attributes unless required by tooling + +Trace and Span ids are not secrets but they can be used to join information +across +systems. Treat them as internal identifiers. + +## Schema endpoint security + +If the system exposes a runtime endpoint that returns the current signals, or +resolved schema: + +- They SHOULD be protected by authentication and authorization, or limited to + trusted network boundaries. +- They MUST be configurable to disable access entirely. +- They MUST implement rate limiting to prevent abuse. +- They MUST NOT expose secrets or raw configuration values. +- Treat the endpoint as sensitive because it can reveal topology and + identifiers. + +## Metrics and diagnostic endpoints (/metrics, /status) + +If the system exposes metrics scrape endpoints (for example Prometheus-style) or +diagnostic endpoints: + +- They SHOULD be protected by authentication and authorization, or limited to + trusted network boundaries. +- They MUST NOT expose secrets or raw configuration values. +- They SHOULD be designed to avoid unbounded responses (for example unbounded + label sets or dumping full topology on every request). +- If an endpoint includes topology identifiers (pipelines, nodes, channels), + treat it as sensitive. + +## Data retention + +- Data retention SHOULD be appropriate for the sensitivity of the data. 
+- If telemetry can include customer-adjacent signals, apply stricter retention + and access constraints. + +## Review checklist + +For any telemetry addition or change: + +- No secrets or personal data is recorded. +- Attributes are bounded and normalized where appropriate. +- Stack traces are gated and not emitted by default on hot paths. +- Schema endpoint exposure is safe for the target deployment. +- Documentation includes any special handling or risk notes. diff --git a/rust/otap-dataflow/docs/telemetry/semantic-conventions-guide.md b/rust/otap-dataflow/docs/telemetry/semantic-conventions-guide.md new file mode 100644 index 0000000000..0671d60f23 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/semantic-conventions-guide.md @@ -0,0 +1,275 @@ +# OpenTelemetry semantic conventions - contributor guide + +This document summarizes the **core rules and guidelines contributors must +follow in this project** when defining **metric names**, **units**, +**attributes**, and **event metadata**. + +Its goal is to **ease contributor work** by providing a clear, concise, and +opinionated reference tailored to this project. + +**Important** +All rules and conventions described here are **derived directly from the +official OpenTelemetry Semantic Conventions Guides**. These guides (see links +below) are the **ultimate source of truth**, and this document must never be +considered authoritative on its own. + +Primary references: + +- General naming conventions: + [https://opentelemetry.io/docs/specs/semconv/general/naming/](https://opentelemetry.io/docs/specs/semconv/general/naming/) +- Metric semantic conventions: + [https://opentelemetry.io/docs/specs/semconv/general/metrics/](https://opentelemetry.io/docs/specs/semconv/general/metrics/) + +Contributors are expected to **consult the upstream OTel documentation** +whenever ambiguity exists or when introducing new semantics. 
+ +## Related project guides + +Project-specific policy (prefixing, attribute lifecycle, stability, security) is +consolidated in: + +- [Entity Model](entity-model.md) +- [Attributes Guide](attributes-guide.md) +- [Metrics Guide](metrics-guide.md) +- [Events Guide](events-guide.md) +- [Stability and Compatibility Guide](stability-compatibility-guide.md) +- [Security and Privacy Guide](security-privacy-guide.md) + +## 1. General naming conventions + +These rules apply to **metric names**, **attribute names**, **event names**, and +other semantic identifiers. + +### Core rules + +- Names MUST be lowercase. +- Use **dot (`.`) separators** to express hierarchy and namespaces. +- Use **underscores (`_`) only inside a single namespace segment** to separate + words. +- Names must: + + - Start with a letter + - End with an alphanumeric character + - Not contain consecutive delimiters (`..`, `__`) +- Avoid ambiguous, overloaded, or generic names. +- Abbreviations are allowed **only when widely understood** (e.g. `http`, `cpu`, + `db`). +- A semantic identifier must have **one clear meaning** and must not conflict + with existing conventions. + +Source: +[https://opentelemetry.io/docs/specs/semconv/general/naming/](https://opentelemetry.io/docs/specs/semconv/general/naming/) + +### Reserved namespaces + +- The `otel.*` namespace is reserved. +- Custom metric, event, and attribute names SHOULD use a project-specific prefix + and MUST NOT clash with existing semantic convention namespaces. We use the + same prefix as the OTel Collector: `otelcol.*`. + +## 2. Metric naming and semantics + +### Metric names + +- Metric names **must follow general naming conventions**. +- Names should represent **what is being measured**, not how it is aggregated. +- Prefer **nouns** or **noun phrases**. +- Do **not encode units in metric names** when unit metadata is available. +- Do **not append `_total`** or other backend-specific suffixes in OTel metrics. 
+- Do **not pluralize** metric names unless they represent a count of discrete + entities. + +Examples: + +```plain +http.server.request.duration +system.cpu.time +process.memory.usage +``` + +Source: +[https://opentelemetry.io/docs/specs/semconv/general/metrics/](https://opentelemetry.io/docs/specs/semconv/general/metrics/) + +--- + +### Metric attributes + +- Attributes add **dimensions**, not meaning. +- Reuse existing semantic attributes whenever possible. +- Attribute names must follow the same naming rules as metrics. +- **Avoid attributes that introduce high cardinality unless explicitly + required.** +- Attribute sets must remain meaningful under aggregation. + +Example: + +```plain +http.server.request.duration{http.request.method="GET"} +``` + +--- + +### Instrument semantics + +- Counters represent **monotonically increasing values**. +- UpDownCounters represent values that may increase or decrease. +- Gauges represent **instantaneous measurements**. +- Histograms represent **distributions of measurements**. + +The instrument type must align with the semantic meaning of the metric. + +--- + +## 3. Units guidelines + +### General rules + +- Units **must not be embedded in metric names**. +- Units must be provided as metric metadata. +- Units should follow **UCUM conventions**. +- Units must be **unambiguous and self-contained**. + +Examples: + +- `s` for seconds +- `By` for bytes +- `1` for dimensionless ratios + +Source: +[https://opentelemetry.io/docs/specs/semconv/general/metrics/#units](https://opentelemetry.io/docs/specs/semconv/general/metrics/#units) + +--- + +### Duration and time + +- Durations should be expressed in **seconds (`s`)**. +- Time counters should also use `s`. + +Example: + +```plain +process.cpu.time unit: s +``` + +--- + +### Ratios and utilization + +- Ratios and utilization metrics are **dimensionless**. +- Use unit `1`. 
+ +Example: + +```plain +system.cpu.utilization unit: 1 +``` + +--- + +### Counts + +- Count metrics should use **curly-brace units** when applicable. +- Use singular semantic units. + +Examples: + +```plain +{request} +{batch} +{signal} +{error} +{connection} +``` + +--- + +## 4. Events and attributes + +### Event naming + +- Event names must be **low cardinality** and stable. +- Names must follow general naming conventions. +- Events represent **discrete occurrences**, not continuous measurements. + +Note: +In OTLP, events are represented as LogRecords with the `event_name` field set. +The `event.name` attribute is deprecated and should not be used for new +telemetry. + +Examples: + +```plain +http.request.start +otelcol.pipeline.config.apply +connection.close +``` + +--- + +### Event attributes + +- Attributes provide structured context for events. +- Attribute naming rules are identical to metric attribute rules. +- Use arrays for multiple values when appropriate. +- Avoid duplicating information already present in metric streams unless + required. +- For a given event name, removing an attribute or renaming an existing + attribute is considered a breaking change and must be handled with appropriate + versioning or migration strategy. + +Source: +[https://opentelemetry.io/docs/specs/semconv/general/naming/](https://opentelemetry.io/docs/specs/semconv/general/naming/) + +--- + +## 5. Examples and best practices + +### Good metric examples + +```plain +http.server.request.duration unit: s +system.memory.usage unit: By +system.cpu.utilization unit: 1 +``` + +### Good attribute examples + +```plain +http.method = "GET" +http.status_code = 200 +network.transport = "tcp" +``` + +### Anti-patterns + +Avoid: + +- Units in names: `http_request_duration_seconds` +- Backend-specific suffixes: `_total`, `_count` +- Overloaded names with multiple meanings +- High-cardinality attributes by default + +--- + +## 6. 
Contributor checklist + +Before introducing a new metric or event, verify: + +- The name follows OTel naming rules. +- Existing semantic conventions do not already cover the use case. +- Units are expressed via metadata and follow UCUM. +- Instrument type matches semantic intent. +- Attributes are reusable, well-scoped, and low cardinality. +- Meaning remains clear under aggregation. + +Error conventions (cross-signal) + +- Use `error.type` as a low-cardinality classifier for failures when applicable. +- Successful operations SHOULD NOT set `error.type`. +- For exceptions: + - logs use `exception.type` and/or `exception.message`, and may include + `exception.stacktrace` in context where security allows. + - span exception events MUST be named `exception` + +When in doubt, **refer to the upstream OpenTelemetry Semantic Conventions**, +which remain the authoritative source. diff --git a/rust/otap-dataflow/docs/telemetry/stability-compatibility-guide.md b/rust/otap-dataflow/docs/telemetry/stability-compatibility-guide.md new file mode 100644 index 0000000000..4d4bc406a8 --- /dev/null +++ b/rust/otap-dataflow/docs/telemetry/stability-compatibility-guide.md @@ -0,0 +1,216 @@ +# Stability and compatibility guide + +Status: Draft + +This document defines the stability model, compatibility rules, and change +process for internal telemetry in the OTAP dataflow engine. + +Telemetry is treated as a stable interface. This guide defines what that means +in practice and how we evolve telemetry without breaking operators and +downstream consumers. 
+ +## Scope + +This guide applies to all telemetry schema elements defined in our semantic +convention registry and emitted by the system, including: + +- metric names, units, instrument semantics, and attribute sets +- metric sets: the shared attribute set + grouped metrics for an entity +- event names (LogRecord `event_name`), event attributes, and event body shape +- trace span names and attributes (when tracing is implemented) +- project-defined entity attributes and their semantics + +## Stability levels + +Every schema element that is intended for reuse by operators or downstream +tooling MUST declare a stability level: + +- **experimental** + - may change without backward compatibility guarantees + - intended for iteration and proving utility +- **stable** + - only backward compatible evolution is allowed + - breaking changes require versioning and a migration plan +- **deprecated** + - still emitted for a migration window + - has a documented replacement + - has a planned removal milestone + +### What must carry stability + +At minimum, stability MUST be declared for: + +- each metric +- each metric set +- each event name +- each project-defined attribute that is part of stable signals +- each trace span name (when implemented) + +## Compatibility rules + +### General rule + +For stable telemetry, changes MUST preserve the ability for existing dashboards, +alerts, and queries to continue working with the same meaning. 
 + +### Backward compatible changes (generally allowed) + +For **stable** signals, the following are typically backward compatible: + +- adding a new metric to an existing metric set +- adding a new optional attribute whose cardinality is bounded and documented +- adding a new enum value to a documented closed set when existing meaning + remains valid +- clarifying descriptions without changing meaning + +### Breaking changes (require migration) + +For **stable** signals, the following are breaking changes and require: + +- a registry version bump (see Versioning) +- a migration plan +- dual emission where practical (old and new) during a migration window + +Breaking changes include: + +- renaming metrics, metric sets, event names, span names, or attribute keys +- changing units +- changing instrument semantic meaning (counter vs gauge semantics, + monotonicity, temporality assumptions) +- removing a metric, event, span, or attribute +- changing the meaning of an attribute value +- widening attribute cardinality such that aggregations change meaning or cost + +Important note: At this stage, the use of the Event Body is intentionally +limited. Recent +OpenTelemetry [Semantic Conventions](https://github.com/open-telemetry/semantic-conventions/blob/main/docs/general/events.md) +recommend avoiding reliance on the Body for structured data, favoring explicitly +named attributes instead. This guidance may be refined as instrumentation +practices and upstream OpenTelemetry recommendations evolve. + +## Compatibility by signal type + +### Metrics + +Metric identity is defined by: + +- metric name +- unit +- instrument semantic meaning +- attribute keys and their meaning (including enum value sets) + +Stable metrics MUST follow these rules: + +- Name MUST NOT change. If a rename is required, add a new metric and deprecate + the old one. +- Unit MUST NOT change. If a unit correction is required, add a new metric and + deprecate the old one. 
+- Attribute keys for a stable metric MUST remain compatible: + - You MAY add a new optional bounded attribute. + - You MUST NOT remove an attribute or repurpose it. +- Enum-like attributes MUST be documented as closed sets. Adding values is + allowed if aggregation meaning remains safe. + +### Metric sets + +A metric set is a collection of metrics sharing the same entity attribute set. + +For stable metric sets: + +- the metric set name MUST remain stable +- the shared entity attribute set MUST remain stable +- adding a new metric is allowed (additive evolution) +- changing the shared attribute set is breaking unless it is strictly additive + and optional + +### Events + +Event names act as schema identifiers. + +For stable events: + +- the event name MUST remain stable +- required attributes MUST remain required +- removing or renaming attributes is breaking +- adding new optional bounded attributes is allowed +- event body shape SHOULD remain compatible: + - avoid changing body from string to object (or vice versa) for stable + events + - if richer payload is required, prefer introducing a new event name and + deprecating the old one + +### Traces + +(When implemented) + +For stable spans: + +- span names MUST remain stable +- required attributes MUST remain required +- avoid repurposing attribute meaning +- exception span events MUST use the canonical exception event naming and + attributes (see semantic conventions guide) + +## Deprecation process + +When deprecating stable telemetry, follow this process: + +- Mark the signal as **deprecated** in the registry. +- Introduce the replacement signal first. +- Emit both old and new during a migration window. +- Provide migration guidance: + - mapping table (old -> new) + - example queries and dashboard update notes +- Remove the deprecated signal only after the migration window ends. 
+ +### Recommended migration window + +Default guideline: + +- at least 2 releases for internal dashboards +- longer if external consumers exist or long-lived dashboards depend on it + +## Versioning model + +The semantic convention registry MUST be versioned. + +Recommended approach: + +- Use SemVer for the registry version: + - MAJOR for breaking changes to stable telemetry + - MINOR for backward compatible additions + - PATCH for documentation corrections or strictly non-semantic fixes + +A release that includes a breaking change to stable telemetry MUST: + +- bump the registry MAJOR version +- include migration guidance in release notes +- include dual emission where practical for at least one migration window + +## Migration patterns + +Prefer these patterns: + +- **dual emission** + - emit old and new signals together temporarily +- **alias and translate at export** + - if exporter can map old key to new key without losing meaning +- **side-by-side dashboards** + - validate new telemetry before switching alerts + +Avoid: + +- silent renames +- silent unit changes +- implicit meaning changes without versioning + +## Review checklist + +For any telemetry change: + +- Stability level is declared or updated. +- Compatibility impact is assessed (additive vs breaking). +- Breaking changes include a migration plan and version bump. +- Additions have bounded cardinality and documented meaning. +- Docs and generated artifacts are updated. +- CI validation passes. 
diff --git a/rust/otap-dataflow/docs/tracing-proposal.md b/rust/otap-dataflow/docs/telemetry/tracing-draft-not-for-review.md similarity index 87% rename from rust/otap-dataflow/docs/tracing-proposal.md rename to rust/otap-dataflow/docs/telemetry/tracing-draft-not-for-review.md index 06da4e1181..fc3e3cebda 100644 --- a/rust/otap-dataflow/docs/tracing-proposal.md +++ b/rust/otap-dataflow/docs/telemetry/tracing-draft-not-for-review.md @@ -1,6 +1,9 @@ -# Proposal: OpenTelemetry-based Tracing for our dataflow engine +# Proposal: OpenTelemetry-based tracing for our dataflow engine -## Problem Statement +Status: Experimental - Please do not use this proposal at this stage. This +document will most likely be deeply updated. + +## Problem statement Modern observability pipelines are increasingly complex. In our system, these pipelines are defined as Directed Acyclic Graphs (DAGs) of interconnected @@ -23,11 +26,10 @@ traces that could be initiated outside the dataflow engine. This topic will need to be studied further at a later time. > Note: An RFC on the same topic was issued for the Go Collector. This document -> is a refinement of -> -that [one](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/rfcs/component-universal-telemetry.md). +> is a refinement of that +> [one](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/rfcs/component-universal-telemetry.md). -## System Overview +## System overview The dataflow engine is structured as a DAG of the following node types: @@ -51,7 +53,7 @@ Each node features: > mandatory name. However, at a higher abstraction level (e.g. in semantic > conventions), they are treated as a separate signal type. -## Tracing Modes +## Tracing modes - **No Tracing**: No spans emitted. - **Tail Sampling**: Only selected traces (e.g. 
errors, random sample) are @@ -63,10 +65,12 @@ Each node features: > defined here, it is possible to envision an extremely efficient implementation > since we control all the dataflow nodes involved in an end-to-end trace. -**Metrics are derived from spans before sampling**, ensuring high-fidelity -monitoring regardless of trace sampling. +~~**Metrics are derived from spans before sampling**, ensuring high-fidelity +monitoring regardless of trace sampling.~~ This approach has not been adopted +for the moment because it would involve too much memory allocation on the hot +path. -## Mapping OTEL Tracing Primitives to the DAG +## Mapping OTEL tracing primitives to the DAG ### Trace @@ -95,18 +99,18 @@ monitoring regardless of trace sampling. - State transitions (batch full/flush/drop) - Output port selection, backpressure, errors -### Span Links +### Span links - Definition: Capture relationships between spans when batches are split ( fan-out) or merged (fan-in) across nodes. - Purpose: Enables lineage and parallel flow reconstruction. -### Control Plane Integration +### Control plane integration - Control actions generate events or standalone spans, linked to affected data spans as needed, providing operational audit trails. -### Channel Utilization Tracking +### Channel utilization tracking - Utilization Metrics (either maintained as direct metrics or derived from span, events/attributes): @@ -133,7 +137,7 @@ The following attribute conventions are proposed for metrics: - otelcol.component.outcome: success, failure, refused - more to come... -## Possible Visualization & Analysis +## Possible visualization & analysis - Trace views show the full DAG traversal for any batch, including processing time, routing, control actions, and errors. @@ -143,7 +147,7 @@ The following attribute conventions are proposed for metrics: > Idea: A connector like the service graph connector could perhaps be used to > graphically represent the dataflow. 
-## Example Trace Structure +## Example trace structure ```text Trace: batch-1234 @@ -169,6 +173,6 @@ Fan-out/fan-in cases are modeled with span links. - **Operational insight**: Track the impact and results of control-plane actions. -## Implementation Details +## Implementation details Not yet defined