Add structured nuisance-model quality diagnostics

Yurashku · Yurashku · commit 98d921db99f6 · 2026-04-13T17:16:23.000+03:00
diff --git a/README.md b/README.md
@@ -56,6 +56,15 @@ pip install -e .
 
 В `compare_policies(...).to_dict()` и `diagnostics` возвращаются `propensity_source` и `propensity_column` (если применимо).
 
+### Nuisance model diagnostics
+
+В high-level summary добавлен блок `nuisance_diagnostics`:
+- behavior-side quality для estimated propensity path (например, multiclass log-loss, top-1 accuracy);
+- outcome-side quality (`accept`: log-loss/Brier/AUC, `cltv`: RMSE/MAE/R²);
+- маркеры `applicable` и `is_out_of_fold`, чтобы явно различать logged path и cross-fit OOF path.
+
+Важно: diagnostics по весам/overlap и diagnostics качества nuisance дополняют друг друга; ни один из блоков сам по себе не гарантирует корректность оценки на реальных данных.
+
 
 ## Simulation validation harness (synthetic oracle checks)
 
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -121,3 +121,17 @@ Harness поддерживает сравнение методов (`replay`, `i
 Назначение — внутренний validation/regression инструмент для развития библиотеки. Это не универсальная гарантия корректности на произвольных real-world логах.
 
 Подробнее: `docs/validation_harness.md`.
+
+
+## 10) Nuisance-model quality diagnostics
+
+Дополнительно к overlap/weight diagnostics введён отдельный слой `nuisance diagnostics` для качества моделей `pi_hat` и `mu_hat`.
+
+Зачем: хорошие ESS/overlap сами по себе не гарантируют, что nuisance-модели адекватны. Поэтому в structured outputs добавляются behavior/outcome quality метрики и warning rules.
+
+Ключевые принципы:
+- logged propensity path: behavior model quality помечается как not applicable;
+- estimated propensity path: behavior quality считается и добавляется в summary;
+- cross-fit mode: diagnostics отмечаются как OOF (fold-aware provenance).
+
+Этот слой не меняет формулы estimators и служит для trust-quality интерпретации результатов.
diff --git a/docs/validation_harness.md b/docs/validation_harness.md
@@ -11,7 +11,8 @@
 - `Delta_CI` coverage (если CI рассчитан);
 - частота significance decision (`is_significant`);
 - diagnostics-поля (например, `weight_ess_ratio`, `weight_p99`);
-- provenance (`propensity_source_used`, `propensity_column_used`).
+- provenance (`propensity_source_used`, `propensity_column_used`);
+- nuisance-quality summaries (например, behavior log-loss, outcome log-loss/RMSE) для сравнения режимов.
 
 На уровне aggregate (по `mode` и `estimator`):
 - mean bias, std, RMSE для `V_B` и `delta`;
diff --git a/src/policyscope/comparison.py b/src/policyscope/comparison.py
@@ -12,6 +12,7 @@
 from policyscope.diagnostics import compute_policy_diagnostics, PolicyDiagnostics
 from policyscope.inference import infer_policy_comparison_bootstrap
 from policyscope.estimators import value_on_policy
+from policyscope.nuisance_diagnostics import NuisanceDiagnostics, compute_nuisance_diagnostics
 from policyscope.nuisance import (
     CrossFitNuisanceBundle,
     PropensitySource,
@@ -48,6 +49,7 @@ class PolicyComparisonSummary:
     notes: tuple[str, ...] = field(default_factory=tuple)
     propensity_source: Optional[str] = None
     propensity_column: Optional[str] = None
+    nuisance_diagnostics: Optional[NuisanceDiagnostics] = None
 
     def to_dict(self) -> dict:
         out = {
@@ -83,6 +85,8 @@ def to_dict(self) -> dict:
             out["propensity_source"] = self.propensity_source
         if self.propensity_column is not None:
             out["propensity_column"] = self.propensity_column
+        if self.nuisance_diagnostics is not None:
+            out["nuisance_diagnostics"] = self.nuisance_diagnostics.to_dict()
         return out
 
 
@@ -226,6 +230,19 @@ def point_on(part: pd.DataFrame) -> float:
         propensity_col=propensity_col,
     )
 
+    nuisance_diag = compute_nuisance_diagnostics(
+        df,
+        target=target,
+        estimator=estimator,
+        feature_cols=feature_cols,
+        action_col=action_col,
+        propensity_source=diag.propensity_source or resolved_source,
+        behavior_predictions=(
+            nuisance_bundle.behavior if nuisance_bundle is not None and nuisance_bundle.behavior is not None else resolved_behavior
+        ),
+        nuisance_bundle=nuisance_bundle,
+    )
+
     if not with_ci:
         return PolicyComparisonSummary(
             estimator=estimator,
@@ -237,6 +254,7 @@ def point_on(part: pd.DataFrame) -> float:
             notes=propensity_notes + tuple(diag.warnings),
             propensity_source=diag.propensity_source or resolved_source,
             propensity_column=diag.propensity_column or resolved_propensity_col,
+            nuisance_diagnostics=nuisance_diag,
         )
 
     def estimator_pair(part: pd.DataFrame):
@@ -272,6 +290,7 @@ def estimator_pair(part: pd.DataFrame):
         notes=notes,
         propensity_source=diag.propensity_source or resolved_source,
         propensity_column=diag.propensity_column or resolved_propensity_col,
+        nuisance_diagnostics=nuisance_diag,
     )
 
 
diff --git a/src/policyscope/nuisance_diagnostics.py b/src/policyscope/nuisance_diagnostics.py
@@ -0,0 +1,228 @@
+"""Nuisance-model quality diagnostics for behavior and outcome models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Optional, Sequence
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import brier_score_loss, log_loss, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
+
+from policyscope.estimators import mu_hat_predict
+from policyscope.nuisance import (
+    BehaviorPredictions,
+    CrossFitNuisanceBundle,
+    OutcomePredictions,
+    fit_outcome_nuisance_bundle,
+)
+
+
+@dataclass(frozen=True)
+class BehaviorModelDiagnostics:  # Immutable quality summary for the behavior (propensity) model.
+    applicable: bool  # whether behavior-model metrics were computed at all
+    propensity_source: str  # propensity source label this record was built for
+    is_out_of_fold: bool  # True when the scored predictions were flagged out-of-fold
+    multiclass_log_loss: Optional[float] = None  # log-loss on the logged actions; None when not computed
+    top1_accuracy: Optional[float] = None  # share of rows whose top predicted action is the logged one
+    mean_logged_action_prob: Optional[float] = None  # mean predicted probability of the logged action
+    warnings: tuple[str, ...] = field(default_factory=tuple)  # machine-readable warning codes
+
+    def to_dict(self) -> dict:  # Plain-dict view of every field; warnings are copied into a list.
+        return {
+            "applicable": self.applicable,
+            "propensity_source": self.propensity_source,
+            "is_out_of_fold": self.is_out_of_fold,
+            "multiclass_log_loss": self.multiclass_log_loss,
+            "top1_accuracy": self.top1_accuracy,
+            "mean_logged_action_prob": self.mean_logged_action_prob,
+            "warnings": list(self.warnings),
+        }
+
+
+@dataclass(frozen=True)
+class OutcomeModelDiagnostics:  # Immutable quality summary for the outcome model.
+    applicable: bool  # whether outcome-model metrics were computed at all
+    target: str  # name of the outcome column that was scored
+    is_binary_target: bool  # True when the target was scored with the classification metrics below
+    is_out_of_fold: bool  # True when the scored predictions were flagged out-of-fold
+    log_loss: Optional[float] = None  # classification metric; stays None on the regression path
+    brier_score: Optional[float] = None  # classification metric; stays None on the regression path
+    roc_auc: Optional[float] = None  # classification metric; None when it could not be computed
+    rmse: Optional[float] = None  # regression metric; stays None on the classification path
+    mae: Optional[float] = None  # regression metric; stays None on the classification path
+    r2: Optional[float] = None  # regression metric; stays None on the classification path
+    warnings: tuple[str, ...] = field(default_factory=tuple)  # machine-readable warning codes
+
+    def to_dict(self) -> dict:  # Plain-dict view of every field; warnings are copied into a list.
+        return {
+            "applicable": self.applicable,
+            "target": self.target,
+            "is_binary_target": self.is_binary_target,
+            "is_out_of_fold": self.is_out_of_fold,
+            "log_loss": self.log_loss,
+            "brier_score": self.brier_score,
+            "roc_auc": self.roc_auc,
+            "rmse": self.rmse,
+            "mae": self.mae,
+            "r2": self.r2,
+            "warnings": list(self.warnings),
+        }
+
+
+@dataclass(frozen=True)
+class NuisanceDiagnostics:  # Immutable bundle of behavior- and outcome-model quality diagnostics.
+    behavior: BehaviorModelDiagnostics  # behavior (propensity) model record
+    outcome: OutcomeModelDiagnostics  # outcome model record
+    warnings: tuple[str, ...] = field(default_factory=tuple)  # combined warning codes for the whole bundle
+
+    def to_dict(self) -> dict:  # Nested plain-dict view; delegates to each part's to_dict().
+        return {
+            "behavior": self.behavior.to_dict(),
+            "outcome": self.outcome.to_dict(),
+            "warnings": list(self.warnings),
+        }
+
+
+def _compute_behavior_diagnostics(
+    df: pd.DataFrame,
+    *,
+    action_col: str,
+    behavior_predictions: Optional[BehaviorPredictions],
+    propensity_source: Optional[str],
+) -> BehaviorModelDiagnostics:
+    if behavior_predictions is None or propensity_source not in {"estimated", "auto"}:
+        return BehaviorModelDiagnostics(
+            applicable=False,
+            propensity_source=propensity_source or "unknown",
+            is_out_of_fold=False,
+            warnings=("behavior_model_not_applicable_for_logged_propensity",),
+        )
+
+    y = df[action_col].to_numpy()
+    p_taken = np.clip(behavior_predictions.pA_taken, 1e-12, 1.0)
+    ll = float(-np.mean(np.log(p_taken)))
+    top1 = None
+    if behavior_predictions.pA_all is not None:
+        top1 = float(np.mean(np.argmax(behavior_predictions.pA_all, axis=1) == y))
+    warnings: list[str] = []
+    if ll > 1.2:
+        warnings.append("weak_behavior_log_loss")
+    if top1 is not None and top1 < 0.4:
+        warnings.append("weak_behavior_top1_accuracy")
+
+    return BehaviorModelDiagnostics(
+        applicable=True,
+        propensity_source=propensity_source or behavior_predictions.propensity_source or "estimated",
+        is_out_of_fold=bool(behavior_predictions.is_out_of_fold),
+        multiclass_log_loss=ll,
+        top1_accuracy=top1,
+        mean_logged_action_prob=float(np.mean(p_taken)),
+        warnings=tuple(warnings),
+    )
+
+
+def _compute_outcome_diagnostics(
+    df: pd.DataFrame,
+    *,
+    target: str,
+    feature_cols: Optional[Sequence[str]],
+    action_col: str,
+    estimator: str,
+    outcome_predictions: Optional[OutcomePredictions],
+) -> OutcomeModelDiagnostics:
+    if estimator not in {"dm", "dr", "sndr", "switch_dr"}:
+        return OutcomeModelDiagnostics(
+            applicable=False,
+            target=target,
+            is_binary_target=False,
+            is_out_of_fold=False,
+            warnings=("outcome_model_not_used_for_estimator",),
+        )
+
+    y = df[target].to_numpy()
+    is_binary = np.array_equal(np.unique(y), np.array([0, 1])) or np.array_equal(np.unique(y), np.array([0.0, 1.0]))
+
+    if outcome_predictions is None:
+        mu_bundle = fit_outcome_nuisance_bundle(df, target=target, feature_cols=feature_cols, action_col=action_col)
+        pred = mu_hat_predict(mu_bundle.mu_model, df, df[action_col].to_numpy(), target)
+        is_oof = False
+    else:
+        pred = outcome_predictions.mu_logged_action
+        is_oof = bool(outcome_predictions.is_out_of_fold)
+
+    warnings: list[str] = []
+    if is_binary:
+        p = np.clip(pred, 1e-12, 1 - 1e-12)
+        ll = float(log_loss(y, p, labels=[0, 1]))
+        br = float(brier_score_loss(y, p))
+        try:
+            auc = float(roc_auc_score(y, p))
+        except ValueError:
+            auc = None
+        if ll > 0.69:
+            warnings.append("weak_outcome_log_loss")
+        if br > 0.25:
+            warnings.append("weak_outcome_brier")
+        if auc is not None and auc < 0.6:
+            warnings.append("weak_outcome_auc")
+        return OutcomeModelDiagnostics(
+            applicable=True,
+            target=target,
+            is_binary_target=True,
+            is_out_of_fold=is_oof,
+            log_loss=ll,
+            brier_score=br,
+            roc_auc=auc,
+            warnings=tuple(warnings),
+        )
+
+    rmse = float(np.sqrt(mean_squared_error(y, pred)))
+    mae = float(mean_absolute_error(y, pred))
+    r2 = float(r2_score(y, pred))
+    if r2 < 0.0:
+        warnings.append("weak_outcome_r2")
+    return OutcomeModelDiagnostics(
+        applicable=True,
+        target=target,
+        is_binary_target=False,
+        is_out_of_fold=is_oof,
+        rmse=rmse,
+        mae=mae,
+        r2=r2,
+        warnings=tuple(warnings),
+    )
+
+
+def compute_nuisance_diagnostics(
+    df: pd.DataFrame,
+    *,
+    target: str,
+    estimator: str,
+    feature_cols: Optional[Sequence[str]],
+    action_col: str,
+    propensity_source: Optional[str],
+    behavior_predictions: Optional[BehaviorPredictions] = None,
+    nuisance_bundle: Optional[CrossFitNuisanceBundle] = None,
+) -> NuisanceDiagnostics:
+    """Compute structured nuisance quality diagnostics for official outputs."""
+    if behavior_predictions is None and nuisance_bundle is not None:
+        behavior_predictions = nuisance_bundle.behavior
+    outcome_predictions = nuisance_bundle.outcome if nuisance_bundle is not None else None
+
+    behavior = _compute_behavior_diagnostics(
+        df,
+        action_col=action_col,
+        behavior_predictions=behavior_predictions,
+        propensity_source=propensity_source,
+    )
+    outcome = _compute_outcome_diagnostics(
+        df,
+        target=target,
+        feature_cols=feature_cols,
+        action_col=action_col,
+        estimator=estimator,
+        outcome_predictions=outcome_predictions,
+    )
+    warnings = tuple(list(behavior.warnings) + list(outcome.warnings))
+    return NuisanceDiagnostics(behavior=behavior, outcome=outcome, warnings=warnings)
diff --git a/src/policyscope/validation.py b/src/policyscope/validation.py
@@ -46,6 +46,9 @@ class ValidationRunRow:
     propensity_column_used: Optional[str]
     ess_ratio: Optional[float]
     weight_p99: Optional[float]
+    behavior_log_loss: Optional[float]
+    outcome_log_loss: Optional[float]
+    outcome_rmse: Optional[float]
 
 
 @dataclass(frozen=True)
@@ -90,6 +93,9 @@ def _aggregate_rows(rows: list[ValidationRunRow]) -> pd.DataFrame:
             significance_rate=("is_significant", "mean"),
             mean_ess_ratio=("ess_ratio", "mean"),
             mean_weight_p99=("weight_p99", "mean"),
+            mean_behavior_log_loss=("behavior_log_loss", "mean"),
+            mean_outcome_log_loss=("outcome_log_loss", "mean"),
+            mean_outcome_rmse=("outcome_rmse", "mean"),
         )
         .reset_index()
     )
@@ -187,6 +193,21 @@ def run_simulation_validation(
                         propensity_column_used=summary.propensity_column,
                         ess_ratio=diag.get("weight_ess_ratio"),
                         weight_p99=diag.get("weight_p99"),
+                        behavior_log_loss=(
+                            summary.nuisance_diagnostics.behavior.multiclass_log_loss
+                            if summary.nuisance_diagnostics is not None
+                            else None
+                        ),
+                        outcome_log_loss=(
+                            summary.nuisance_diagnostics.outcome.log_loss
+                            if summary.nuisance_diagnostics is not None
+                            else None
+                        ),
+                        outcome_rmse=(
+                            summary.nuisance_diagnostics.outcome.rmse
+                            if summary.nuisance_diagnostics is not None
+                            else None
+                        ),
                     )
                 )
 
diff --git a/tests/test_docs_consistency.py b/tests/test_docs_consistency.py
@@ -15,6 +15,7 @@ def test_architecture_doc_exists_and_mentions_domain_model():
     assert "compare_policies" in text
     assert 'propensity_source="auto"' in text
     assert "logged vs estimated propensity" in text.lower()
+    assert "nuisance-model quality diagnostics" in text.lower()
 
 
 def test_readme_mentions_p_value_method():
@@ -24,6 +25,7 @@ def test_readme_mentions_p_value_method():
     assert "weight_ess_ratio" in text
     assert "compare_policies_multi_target" in text
     assert "propensity source modes" in text
+    assert "nuisance model diagnostics" in text
     assert 'propensity_source="auto"' in text
 
 
diff --git a/tests/test_nuisance_diagnostics.py b/tests/test_nuisance_diagnostics.py