NVIDIA · namitdhameja · Feb 26, 2026
diff --git a/docs/source/fault_tolerance/usage_guide.rst b/docs/source/fault_tolerance/usage_guide.rst
@@ -117,33 +117,79 @@ Validation behavior:
   - Other existing types (e.g., devices/symlinks): performs ``stat`` access
 
 
-Attribution service integration
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Attribution integration
+^^^^^^^^^^^^^^^^^^^^^^
 
-Enable artifact analysis (e.g., logs) during rendezvous health checks by pointing to a running attribution service.
-The feature is enabled by specifying both host and port.
+Enable artifact analysis (e.g., logs) during rendezvous to make RESTART/STOP decisions.
+Use ``--ft-attribution-loganalysis [lib|mcp|url]`` (default: lib) for integration flexibility:
 
-* CLI:
+* ``lib`` (default): Direct calling via API in-process.
+* ``mcp``: Log analysis in a separate MCP subprocess.
+* ``url``: HTTP attribution service (host:port or http(s)://host:port).
 
-  - ``--ft-attrsvc-host <HOST>`` (alias: ``--ft_attrsvc_host``)
-  - ``--ft-attrsvc-port <PORT>`` (alias: ``--ft_attrsvc_port``)
+* CLI:
 
-  Example:
+  - ``--ft-attribution-loganalysis`` (alias: ``--ft_attribution_loganalysis``): Enable log analysis attribution.
+    Accepts ``lib``, ``mcp``, or a URL string. No value = lib (default).
+  - ``--ft-attribution-timeout`` (alias: ``--ft_attribution_timeout``): Wait/timeout in seconds;
+    skip result if exceeded (default: 60).
+  - ``--ft-attribution-dry-run`` (alias: ``--ft_attribution_dry_run``): Dry run. Run the full
+    attribution chain (log analysis, Slack, dataflow) but do not apply the restart/stop decision.
+    Log what would happen instead. Useful for validating the pipeline without affecting behavior.
+  - ``--ft-slack-token-file`` (alias: ``--ft_slack_token_file``): Path to file containing Slack bot token.
+    When not set, uses ``SLACK_BOT_TOKEN`` or ``SLACK_BOT_TOKEN_FILE`` env vars.
+  - ``--ft-slack-channel`` (alias: ``--ft_slack_channel``): Slack channel for alerts.
+    When not set, uses ``SLACK_CHANNEL`` env var.
+  - ``--ft-dataflow-index`` (alias: ``--ft_dataflow_index``): Elasticsearch/dataflow index for posting
+    attribution results (lib/mcp only). Requires ``nvdataflow`` (install via ``pip install nvidia-resiliency-ext[dataflow]``).
+    When not set, dataflow posting is disabled.
+
+  Examples:
 
   .. code-block:: bash
 
-     ft_launcher \
-       --ft-attrsvc-host 127.0.0.1 \
-       --ft-attrsvc-port 8000 \
-       train.py
+     # Lib mode (in-process); default
+     ft_launcher --ft-attribution-loganalysis train.py
+     ft_launcher --ft-attribution-loganalysis lib train.py
+
+     # MCP mode (log analysis in separate subprocess)
+     ft_launcher --ft-attribution-loganalysis mcp train.py
+
+     # URL mode (HTTP attribution service)
+     ft_launcher --ft-attribution-loganalysis http://127.0.0.1:8000 train.py
+
+     # Service with custom timeout
+     ft_launcher --ft-attribution-loganalysis http://127.0.0.1:8000 --ft-attribution-timeout 90 train.py
+
+     # Lib mode with Slack and dataflow (token from file; channel from env)
+     ft_launcher --ft-attribution-loganalysis lib --ft-slack-token-file /etc/secrets/slack-token train.py
+
+     # Lib mode with explicit Slack channel and dataflow index
+     ft_launcher --ft-attribution-loganalysis lib \
+       --ft-slack-token-file /etc/secrets/slack-token --ft-slack-channel "#alerts" \
+       --ft-dataflow-index my-attribution-index train.py
+
+     # Dry run: exercise full attribution chain without applying restart/stop decision
+     ft_launcher --ft-attribution-loganalysis lib --ft-attribution-dry-run train.py
 
-* YAML: under the ``fault_tolerance`` section
+* YAML: under the ``fault_tolerance`` section use ``attribution_loganalysis``, ``attribution_timeout_seconds``,
+  ``slack``, and ``dataflow_index``:
 
   .. code-block:: yaml
 
      fault_tolerance:
-       attrsvc_host: "127.0.0.1"
-       attrsvc_port: 8000
+       attribution_loganalysis: "lib"        # or "mcp", or "http://127.0.0.1:8000" for service
+       attribution_timeout_seconds: 60
+       attribution_dry_run: false           # true = run chain but don't apply action; log only
+       slack:
+         bot_token_file: "/etc/secrets/slack-token"  # or bot_token for inline (less secure)
+         channel: "#alerts"
+       dataflow_index: "my-attribution-index"       # optional; requires nvdataflow
+
+* Environment (fallback when CLI/YAML not set):
+
+  - ``SLACK_BOT_TOKEN`` or ``SLACK_BOT_TOKEN_FILE``: Slack bot token for lib/mcp alerts.
+  - ``SLACK_CHANNEL``: Slack channel for alerts.
 
 GPU Memory Reclaim
 ^^^^^^^^^^^^^^^^^^

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,8 +49,15 @@ defusedxml = "*"
 langchain-nvidia-ai-endpoints = ">=0.3.15"
 mcp = ">=1.15.0"
 logsage = ">=0.1.5"
+slack-bolt = ">=1.23.0"
+slack-sdk = ">=3.35.0"
+nvdataflow = {version = "*", optional = true}
 grpcio = "^1.76.0"
 grpcio-tools = "^1.76.0"
+httpx = ">=0.24.0"
+
+[tool.poetry.extras]
+dataflow = ["nvdataflow"]
 
 [tool.poetry.scripts]
 ft_launcher = "nvidia_resiliency_ext.fault_tolerance.launcher:main"

diff --git a/services/nvrx_attrsvc/NVRX_ATTRSVC_SPEC.md b/services/nvrx_attrsvc/NVRX_ATTRSVC_SPEC.md
@@ -278,6 +278,7 @@ src/nvidia_resiliency_ext/attribution/
 │   ├── __init__.py          # Exports config, configure(), ResultPoster, post_results, Slack
 │   ├── config.py            # PostprocessingConfig singleton and configure()
 │   ├── base.py              # ResultPoster, post_results (generic framework)
+│   ├── dataflow.py          # nvdataflow posting (post, get_nvdataflow_post_fn)
 │   └── slack.py             # Slack notifications for terminal failures
 │
 └── mcp_integration/         # MCP client/server for LLM communication
@@ -306,7 +307,7 @@ File organization by layer:
         app.py
 
     POSTPROCESSING:
-        config.setup() wires lib postprocessing (ResultPoster(dataflow.post), Slack); dataflow.py
+        config.setup() wires lib postprocessing (ResultPoster(post_fn=postprocessing.dataflow.post), Slack)
 
 PYTHON API REFERENCE:
 
@@ -2477,14 +2478,15 @@ This is optional and proprietary - implemented in separate module for easy
 exclusion or replacement.
 
 Files (see section 2 PROJECT STRUCTURE):
-    - nvrx_attrsvc/dataflow.py                # Elasticsearch posting via nvdataflow
+    - nvidia_resiliency_ext.attribution.postprocessing.dataflow  # nvdataflow posting (post, get_nvdataflow_post_fn)
     - nvrx_attrsvc/config.py setup()         # Wires lib postprocessing via configure(poster, cluster_name, dataflow_index, slack_*)
     - nvidia_resiliency_ext/attribution/postprocessing/  # config, configure(), ResultPoster, post_results, Slack
 
 Configuration:
     - CLUSTER_NAME, DATAFLOW_INDEX: env prefix NVRX_ATTRSVC_ (e.g. NVRX_ATTRSVC_CLUSTER_NAME)
-    - SLACK_BOT_TOKEN, SLACK_CHANNEL: no prefix (env vars SLACK_BOT_TOKEN, SLACK_CHANNEL)
-    - If DATAFLOW_INDEX empty, dataflow posting disabled; if SLACK_BOT_TOKEN empty, Slack disabled
+    - SLACK_BOT_TOKEN, SLACK_BOT_TOKEN_FILE, SLACK_CHANNEL: no prefix
+    - SLACK_BOT_TOKEN_FILE takes precedence (path to file containing token)
+    - If DATAFLOW_INDEX empty, dataflow posting disabled; if Slack token empty, Slack disabled
     - Slack notifications sent for auto_resume = "STOP - DONT RESTART IMMEDIATE"
 
 When triggered:

diff --git a/services/nvrx_attrsvc/README.md b/services/nvrx_attrsvc/README.md
@@ -53,6 +53,7 @@ Environment variables (prefix: `NVRX_ATTRSVC_`):
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `SLACK_BOT_TOKEN` | `""` | Slack bot OAuth token (empty = disabled) |
+| `SLACK_BOT_TOKEN_FILE` | `""` | Path to file containing token (preferred over SLACK_BOT_TOKEN) |
 | `SLACK_CHANNEL` | `""` | Slack channel for terminal failure alerts |
 
 When configured, sends alerts to Slack for jobs with `auto_resume = "STOP - DONT RESTART IMMEDIATE"`.
@@ -271,7 +272,7 @@ asyncio.run(main())
 | `app.py` | FastAPI routes and middleware |
 | `service.py` | `AttributionService` - wraps LogAnalyzer |
 | `config.py` | `Settings` (pydantic), `setup()` wires postprocessing (poster + Slack) from cfg |
-| `dataflow.py` | NVIDIA-proprietary Elasticsearch posting |
+| lib `postprocessing.dataflow` | nvdataflow Elasticsearch posting (used via ResultPoster) |
 | `deploy/run_attrsvc.sh` | Run service with logging (background) |
 | `deploy/snapshot_attrsvc.sh` | Periodic endpoint snapshot for debugging |
 | `deploy/Dockerfile` | Docker build instructions |

diff --git a/services/nvrx_attrsvc/config.py b/services/nvrx_attrsvc/config.py
@@ -71,12 +71,17 @@ class Settings(BaseSettings):
         default="", description="Dataflow/elasticsearch index for posting results"
     )
 
-    # Slack integration (optional - set SLACK_BOT_TOKEN to enable; env vars have no NVRX_ATTRSVC_ prefix)
+    # Slack integration (optional - set SLACK_BOT_TOKEN or SLACK_BOT_TOKEN_FILE to enable; no NVRX_ATTRSVC_ prefix)
     SLACK_BOT_TOKEN: str = Field(
         default="",
         description="Slack bot token (empty = disabled)",
         validation_alias="SLACK_BOT_TOKEN",
     )
+    SLACK_BOT_TOKEN_FILE: str = Field(
+        default="",
+        description="Path to file containing Slack bot token (preferred over SLACK_BOT_TOKEN)",
+        validation_alias="SLACK_BOT_TOKEN_FILE",
+    )
     SLACK_CHANNEL: str = Field(
         default="",
         description="Slack channel for alerts",
@@ -217,19 +222,19 @@ def setup() -> Settings:
     logging.getLogger("nvidia_resiliency_ext.attribution.mcp_integration").setLevel(logging.WARNING)
     logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
 
-    # Wire postprocessing config (lib singleton)
-    from nvidia_resiliency_ext.attribution.postprocessing import ResultPoster, configure
-
-    from . import dataflow
+    # Wire postprocessing config (lib singleton); slack resolved from env (SLACK_BOT_TOKEN/SLACK_BOT_TOKEN_FILE)
+    from nvidia_resiliency_ext.attribution.postprocessing import (
+        ResultPoster,
+        configure_postprocessing_resolved,
+    )
+    from nvidia_resiliency_ext.attribution.postprocessing.dataflow import post as dataflow_post
 
-    configure(
-        default_poster=ResultPoster(post_fn=dataflow.post),
+    configure_postprocessing_resolved(
+        default_poster=ResultPoster(post_fn=dataflow_post),
         cluster_name=cfg.CLUSTER_NAME or "",
         dataflow_index=cfg.DATAFLOW_INDEX or "",
-        slack_bot_token=cfg.SLACK_BOT_TOKEN or "",
-        slack_channel=cfg.SLACK_CHANNEL or "",
+        slack_token=None,
+        slack_channel=cfg.SLACK_CHANNEL or None,
+        cluster_name_env=None,
     )
-    if cfg.SLACK_BOT_TOKEN:
-        logger.info(f"Slack notifications enabled for channel: {cfg.SLACK_CHANNEL}")
-
     return cfg
diff --git a/services/nvrx_attrsvc/dataflow.py b/services/nvrx_attrsvc/dataflow.py
diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/config.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/config.py
@@ -22,6 +22,7 @@
 
 from enum import Enum
 
+# --- Library constants ---
 # TTL constants (see spec Section 3.2)
 TTL_PENDING_SECONDS = 7 * 24 * 60 * 60  # 1 week - pending job expiry
 TTL_TERMINATED_SECONDS = 60 * 60  # 1 hour - terminated job expiry (after GET)