From a2e1eca5f15ae2cc04ae8125125d099a075b0781 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:14:48 +0400
Subject: [PATCH 01/29] Relax spacy upper bound, make unbabel-comet optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove spacy<3.8.0 upper bound: spacy 3.8+ uses thinc 8.3+/9.x
  which is compatible with numpy 2.x (required by vLLM and other
  modern ML packages). The old pin forced thinc 8.2.x → numpy 1.x,
  creating unresolvable conflicts with vLLM/torch/cupy.

- Make unbabel-comet optional: comment it out in requirements.txt
  and guard the import in generation_metrics/__init__.py (Comet is
  set to None when the import fails). The Comet metric class is only
  used for translation evaluation and is not needed by most users.
  Users who need it can install it separately with
  `pip install unbabel-comet --no-deps`.

- Move `from evaluate import load` to a lazy import inside
  Comet.__init__ so the module can be imported without unbabel-comet
  installed.
---
 requirements.txt                                | 4 ++--
 src/lm_polygraph/generation_metrics/__init__.py | 6 +++++-
 src/lm_polygraph/generation_metrics/comet.py    | 3 ++-
 3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 520bc1750..fba670f3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,10 +29,10 @@ openai>=1.52.0
 wget
 sentence-transformers
 bert-score>=0.3.13
-unbabel-comet<3
+# unbabel-comet<3  # Optional: install separately with `pip install unbabel-comet --no-deps`
 nltk>=3.7,<4
 evaluate>=0.4.2
-spacy>=3.4.0,<3.8.0
+spacy>=3.4.0
 fastchat
 diskcache>=5.6.3
 boostedprob
diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py
index d9d66c958..063ce18e0 100644
--- a/src/lm_polygraph/generation_metrics/__init__.py
+++ b/src/lm_polygraph/generation_metrics/__init__.py
@@ -3,7 +3,11 @@
 from .model_score import ModelScoreSeqMetric, ModelScoreTokenwiseMetric
 from .bart_score import BartScoreSeqMetric
 from .accuracy import AccuracyMetric
-from .comet import Comet
+
+try:
+    from .comet import Comet
+except ImportError:
+    Comet = None
 from .alignscore import AlignScore
 from .openai_fact_check import OpenAIFactCheck
 from .bert_score import BertScoreMetric
diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py
index 0fcd9b3e2..5942b2b3f 100644
--- a/src/lm_polygraph/generation_metrics/comet.py
+++ b/src/lm_polygraph/generation_metrics/comet.py
@@ -1,6 +1,5 @@
 import re
 import numpy as np
-from evaluate import load
 
 from typing import List, Dict
 from .generation_metric import GenerationMetric
@@ -14,6 +13,8 @@ class Comet(GenerationMetric):
 
     def __init__(self, source_ignore_regex=None, lang="en"):
         super().__init__(["greedy_texts", "input_texts"], "sequence")
+        from evaluate import load
+
         self.scorer = load("comet")
         self.source_ignore_regex = (
             re.compile(source_ignore_regex) if source_ignore_regex else None

From a408e7ef61ad08872d3e6536570cc1bcb67499ad Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:17:59 +0400
Subject: [PATCH 02/29] Add optional dependencies section to README for
 unbabel-comet

---
 README.md        | 9 +++++++++
 requirements.txt | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2f9d65132..7661e8d72 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,15 @@ The latest tagged version is also available via PyPI:
 pip install lm-polygraph
 ```
 
+### Optional dependencies
+
+Some features require additional packages that are not installed by default:
+
+- **COMET metric** (translation evaluation): `unbabel-comet` has constrained `transformers` version requirements, so it is installed separately:
+  ```shell
+  pip install unbabel-comet --no-deps
+  ```
+
 ## <a name="basic_usage"></a>Basic usage
 1. Initialize the base model (encoder-decoder or decoder-only) and tokenizer from HuggingFace or a local file, and use them to initialize the WhiteboxModel for evaluation:
 ```python
diff --git a/requirements.txt b/requirements.txt
index fba670f3d..03f294b85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,7 +29,7 @@ openai>=1.52.0
 wget
 sentence-transformers
 bert-score>=0.3.13
-# unbabel-comet<3  # Optional: install separately with `pip install unbabel-comet --no-deps`
+# unbabel-comet<3  # Optional, see README for installation instructions
 nltk>=3.7,<4
 evaluate>=0.4.2
 spacy>=3.4.0

From fd22daf54ae22607b35e1e34887345e3fab3ba92 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:20:28 +0400
Subject: [PATCH 03/29] Add transformers 5.x to CI test matrix

Run tests against both default transformers (from requirements.txt)
and transformers 5.x to catch compatibility issues early.
Lint runs only once (on default version).

Relates to #445.
---
 .github/workflows/python-app.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 76dc643a5..0abc8d718 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -17,6 +17,11 @@ jobs:
 
     runs-on: ubuntu-latest
 
+    strategy:
+      fail-fast: false
+      matrix:
+        transformers-version: ["default", "5.*"]
+
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python 3.12
@@ -28,7 +33,12 @@ jobs:
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
         pip install .
+    - name: Override transformers version
+      if: matrix.transformers-version != 'default'
+      run: |
+        pip install "transformers==${{ matrix.transformers-version }}"
     - name: Lint
+      if: matrix.transformers-version == 'default'
       uses: py-actions/flake8@v2
       with:
         args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"

From 7d631ba4e532f8562403cfcf19bb1365e92e6587 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:31:06 +0400
Subject: [PATCH 04/29] Fix transformers 5.0 compatibility for ensemble utils
 and visual model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

transformers 5.0 removed/renamed several classes:
- beam_search submodule removed: BeamScorer no longer exists
- Output classes renamed:
  - BeamSearchOutput → GenerateBeamEncoderDecoderOutput
  - BeamSearchDecoderOnlyOutput → GenerateBeamDecoderOnlyOutput
  - SampleOutput → GenerateNonBeamOutput
  - SampleDecoderOnlyOutput → GenerateDecoderOnlyOutput
  - GreedySearchOutput → GenerateNonBeamOutput
  - GreedySearchDecoderOnlyOutput → GenerateDecoderOnlyOutput
- AutoModelForVision2Seq removed

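All affected imports now use try/except to support both 4.x and 5.x:
renamed output classes are aliased to their old names, while
BeamScorer and AutoModelForVision2Seq fall back to None.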

Relates to #445.
---
 .../model_adapters/visual_whitebox_model.py   |  7 ++++-
 .../utils/ensemble_utils/ensemble_beam.py     | 26 ++++++++++++++-----
 .../utils/ensemble_utils/ensemble_greedy.py   | 18 +++++++++----
 .../utils/ensemble_utils/ensemble_sample.py   | 18 +++++++++----
 4 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/lm_polygraph/model_adapters/visual_whitebox_model.py b/src/lm_polygraph/model_adapters/visual_whitebox_model.py
index 9601bca27..3675294f5 100644
--- a/src/lm_polygraph/model_adapters/visual_whitebox_model.py
+++ b/src/lm_polygraph/model_adapters/visual_whitebox_model.py
@@ -8,12 +8,17 @@
 import torch
 from PIL import Image
 from transformers import (
-    AutoModelForVision2Seq,
     AutoProcessor,
     GenerationConfig,
     LogitsProcessorList,
 )
 
+try:
+    from transformers import AutoModelForVision2Seq
+except ImportError:
+    # transformers >= 5.0 removed AutoModelForVision2Seq
+    AutoModelForVision2Seq = None
+
 from lm_polygraph.utils.generation_parameters import GenerationParameters
 from lm_polygraph.utils.dataset import Dataset
 from lm_polygraph.utils.model import Model
diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py
index 94f180daa..2ed5d2731 100644
--- a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py
+++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py
@@ -8,7 +8,13 @@
 from torch import nn
 
 from transformers import GenerationMixin
-from transformers.generation.beam_search import BeamScorer
+
+try:
+    from transformers.generation.beam_search import BeamScorer
+except ImportError:
+    # transformers >= 5.0 removed BeamScorer entirely
+    BeamScorer = None
+
 from transformers.generation.logits_process import (
     LogitsProcessorList,
 )
@@ -16,11 +22,19 @@
     StoppingCriteriaList,
     validate_stopping_criteria,
 )
-from transformers.generation.utils import (
-    BeamSearchOutput,
-    BeamSearchDecoderOnlyOutput,
-    ModelOutput,
-)
+from transformers.generation.utils import ModelOutput
+
+try:
+    from transformers.generation.utils import (
+        BeamSearchOutput,
+        BeamSearchDecoderOnlyOutput,
+    )
+except ImportError:
+    # transformers >= 5.0 renamed these classes
+    from transformers.generation.utils import (
+        GenerateBeamEncoderDecoderOutput as BeamSearchOutput,
+        GenerateBeamDecoderOnlyOutput as BeamSearchDecoderOnlyOutput,
+    )
 
 
 class EnsembleBeamSearchMixin(GenerationMixin):
diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py
index db7b31cf1..027d9eec9 100644
--- a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py
+++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py
@@ -15,11 +15,19 @@
     validate_stopping_criteria,
 )
 from transformers.generation.streamers import BaseStreamer
-from transformers.generation.utils import (
-    GreedySearchOutput,
-    GreedySearchDecoderOnlyOutput,
-    ModelOutput,
-)
+from transformers.generation.utils import ModelOutput
+
+try:
+    from transformers.generation.utils import (
+        GreedySearchOutput,
+        GreedySearchDecoderOnlyOutput,
+    )
+except ImportError:
+    # transformers >= 5.0 renamed these classes
+    from transformers.generation.utils import (
+        GenerateNonBeamOutput as GreedySearchOutput,
+        GenerateDecoderOnlyOutput as GreedySearchDecoderOnlyOutput,
+    )
 
 
 class EnsembleGreedyMixin(GenerationMixin):
diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py
index 07b142dc8..1106171e6 100644
--- a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py
+++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py
@@ -16,11 +16,19 @@
     validate_stopping_criteria,
 )
 from transformers.generation.streamers import BaseStreamer
-from transformers.generation.utils import (
-    SampleOutput,
-    SampleDecoderOnlyOutput,
-    ModelOutput,
-)
+from transformers.generation.utils import ModelOutput
+
+try:
+    from transformers.generation.utils import (
+        SampleOutput,
+        SampleDecoderOnlyOutput,
+    )
+except ImportError:
+    # transformers >= 5.0 renamed these classes
+    from transformers.generation.utils import (
+        GenerateNonBeamOutput as SampleOutput,
+        GenerateDecoderOnlyOutput as SampleDecoderOnlyOutput,
+    )
 
 
 class EnsembleSampleMixin(GenerationMixin):

From 781bb2115d1614c4c7d0e03297902c9d4977ed3c Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:38:24 +0400
Subject: [PATCH 05/29] Use AutoModelForImageTextToText as fallback for
 AutoModelForVision2Seq

---
 src/lm_polygraph/model_adapters/visual_whitebox_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/model_adapters/visual_whitebox_model.py b/src/lm_polygraph/model_adapters/visual_whitebox_model.py
index 3675294f5..0b674712f 100644
--- a/src/lm_polygraph/model_adapters/visual_whitebox_model.py
+++ b/src/lm_polygraph/model_adapters/visual_whitebox_model.py
@@ -16,8 +16,8 @@
 try:
     from transformers import AutoModelForVision2Seq
 except ImportError:
-    # transformers >= 5.0 removed AutoModelForVision2Seq
-    AutoModelForVision2Seq = None
+    # transformers >= 5.0 renamed AutoModelForVision2Seq → AutoModelForImageTextToText
+    from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq
 
 from lm_polygraph.utils.generation_parameters import GenerationParameters
 from lm_polygraph.utils.dataset import Dataset

From 448485c8d881460a88a317956070104ed4316722 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:46:24 +0400
Subject: [PATCH 06/29] Replace batch_encode_plus with direct tokenizer call

batch_encode_plus was removed from newer transformers tokenizers.
The direct __call__ (tokenizer(...)) is equivalent and works on
all versions.
---
 src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py | 2 +-
 src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py  | 4 ++--
 src/lm_polygraph/stat_calculators/semantic_matrix.py         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
index bb0cfe3b2..187dd62f6 100644
--- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
+++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
@@ -22,7 +22,7 @@ def eval_nli_model(
     with torch.no_grad():
         for k in range(0, len(nli_set), deberta.batch_size):
             batch = nli_set[k : k + deberta.batch_size]
-            encoded = deberta.deberta_tokenizer.batch_encode_plus(
+            encoded = deberta.deberta_tokenizer(
                 batch, padding=True, return_tensors="pt"
             ).to(deberta.device)
             logits = deberta.deberta(**encoded).logits.detach()
diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
index 07d458d5a..5843416b6 100644
--- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -47,14 +47,14 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs):
                 probs_b = []
                 for first_texts, second_texts in tqdm(dl):
                     batch = list(zip(first_texts, second_texts))
-                    encoded = tokenizer.batch_encode_plus(
+                    encoded = tokenizer(
                         batch, padding=True, return_tensors="pt"
                     ).to(deberta.device)
                     logits = deberta.deberta(**encoded).logits
                     probs_f.append(softmax(logits))
 
                     batch = list(zip(second_texts, first_texts))
-                    encoded = tokenizer.batch_encode_plus(
+                    encoded = tokenizer(
                         batch, padding=True, return_tensors="pt"
                     ).to(deberta.device)
                     logits = deberta.deberta(**encoded).logits
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index 37bb969a5..14a9fb5ce 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -102,7 +102,7 @@ def __call__(
                 logits_all = []
                 for first_texts, second_texts in dl:
                     batch = list(zip(first_texts, second_texts))
-                    encoded = tokenizer.batch_encode_plus(
+                    encoded = tokenizer(
                         batch, padding=True, return_tensors="pt"
                     ).to(device)
                     logits = deberta.deberta(**encoded).logits

From 0630902fc52622f010b09aef4ef99bae035a9494 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 16:52:37 +0400
Subject: [PATCH 07/29] Fix black formatting for tokenizer calls

---
 .../stat_calculators/greedy_semantic_matrix.py       | 12 ++++++------
 src/lm_polygraph/stat_calculators/semantic_matrix.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
index 5843416b6..1f3ccbe96 100644
--- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -47,16 +47,16 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs):
                 probs_b = []
                 for first_texts, second_texts in tqdm(dl):
                     batch = list(zip(first_texts, second_texts))
-                    encoded = tokenizer(
-                        batch, padding=True, return_tensors="pt"
-                    ).to(deberta.device)
+                    encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
+                        deberta.device
+                    )
                     logits = deberta.deberta(**encoded).logits
                     probs_f.append(softmax(logits))
 
                     batch = list(zip(second_texts, first_texts))
-                    encoded = tokenizer(
-                        batch, padding=True, return_tensors="pt"
-                    ).to(deberta.device)
+                    encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
+                        deberta.device
+                    )
                     logits = deberta.deberta(**encoded).logits
                     probs_b.append(softmax(logits))
 
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index 14a9fb5ce..b03d4967b 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -102,9 +102,9 @@ def __call__(
                 logits_all = []
                 for first_texts, second_texts in dl:
                     batch = list(zip(first_texts, second_texts))
-                    encoded = tokenizer(
-                        batch, padding=True, return_tensors="pt"
-                    ).to(device)
+                    encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
+                        device
+                    )
                     logits = deberta.deberta(**encoded).logits
                     probs.append(softmax(logits))
                     logits_all.append(logits)

From eac1c35077083f634c6733aeebea21d2dd1a04ad Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 17:15:15 +0400
Subject: [PATCH 08/29] Set seed=42 in test_polygraph_eval_seq_ue config

The test_all_seq_ue test uses do_sample=True for sampling-based
estimators (MonteCarloSequenceEntropy, PTrueSampling, etc.).
With seed=null, torch.multinomial occasionally fails with
"probability tensor contains inf, nan or element < 0" due to
non-deterministic logit values from bloomz-560m on CPU.
Setting a fixed seed makes the test deterministic and reproducible.
---
 test/configs/test_polygraph_eval_seq_ue.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml
index b259f071f..cd15fb68e 100644
--- a/test/configs/test_polygraph_eval_seq_ue.yaml
+++ b/test/configs/test_polygraph_eval_seq_ue.yaml
@@ -31,6 +31,6 @@ stat_calculators:
 
 subsample_eval_dataset: 10
 batch_size: 2
-seed: null
+seed: 42
 device: null
 max_new_tokens: 256

From d9076fcc5cab6c84a2cb39fca2bf53eec29b1ed6 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 17:26:41 +0400
Subject: [PATCH 09/29] Fix seed format: must be a list for polygraph_eval

---
 test/configs/test_polygraph_eval_seq_ue.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml
index cd15fb68e..97158bad6 100644
--- a/test/configs/test_polygraph_eval_seq_ue.yaml
+++ b/test/configs/test_polygraph_eval_seq_ue.yaml
@@ -31,6 +31,7 @@ stat_calculators:
 
 subsample_eval_dataset: 10
 batch_size: 2
-seed: 42
+seed:
+  - 42
 device: null
 max_new_tokens: 256

From c00891799748bad1466c4a76777309569bb85778 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 17:44:18 +0400
Subject: [PATCH 10/29] Set temperature=0.7 in seq_ue test to stabilize
 sampling

---
 test/configs/test_polygraph_eval_seq_ue.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml
index 97158bad6..53fd97726 100644
--- a/test/configs/test_polygraph_eval_seq_ue.yaml
+++ b/test/configs/test_polygraph_eval_seq_ue.yaml
@@ -22,6 +22,9 @@ instruct: false
 prompt: ""
 
 ignore_exceptions: false
+
+generation_params:
+  temperature: 0.7
 generation_metrics:
   - name: RougeMetric
     args: ["rouge1"]

From a84359f7f8ac099c4d36f91c6b318871e1a962d2 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 18:01:58 +0400
Subject: [PATCH 11/29] Add renormalize_logits=True to sampling to prevent
 inf/nan in multinomial

---
 src/lm_polygraph/stat_calculators/sample.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index 05b2b854b..d129b2799 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -185,6 +185,7 @@ def __call__(
             max_new_tokens=max_new_tokens,
             min_new_tokens=2,
             do_sample=True,
+            renormalize_logits=True,
             num_beams=1,
             num_return_sequences=1,
             suppress_tokens=(

From aea48e2b781e9a615ef3a5d9eede93c951f75253 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 18:28:36 +0400
Subject: [PATCH 12/29] Add logits sanitizer to prevent inf/nan crash in
 sampling

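Different numpy versions (1.x vs 2.x) can cause bloomz-560m to
produce inf/nan logits on CPU, crashing torch.multinomial.
Add _SanitizeLogitsProcessor, which replaces inf/nan with finite
values (nan -> 0.0, +inf -> 1e4, -inf -> -1e4) before scores are
recorded and sampled. It runs first in the logits processor chain.

Also revert the renormalize_logits=True argument added to sample.py
in the previous commit.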
---
 src/lm_polygraph/stat_calculators/sample.py |  1 -
 src/lm_polygraph/utils/model.py             | 14 +++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index d129b2799..05b2b854b 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -185,7 +185,6 @@ def __call__(
             max_new_tokens=max_new_tokens,
             min_new_tokens=2,
             do_sample=True,
-            renormalize_logits=True,
             num_beams=1,
             num_return_sequences=1,
             suppress_tokens=(
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index aa06a3053..6d34198b7 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -455,6 +455,12 @@ def __call__(self, input_ids=None, scores=None):
             self.scores.append(scores.log_softmax(-1))
             return scores
 
+    class _SanitizeLogitsProcessor:
+        # Replaces inf/nan in logits with large finite values to prevent
+        # RuntimeError in torch.multinomial during sampling
+        def __call__(self, input_ids=None, scores=None):
+            return torch.nan_to_num(scores, nan=0.0, posinf=1e4, neginf=-1e4)
+
     def generate(self, **args):
         """
         Generates the model output with scores from batch formed by HF Tokenizer.
@@ -466,14 +472,16 @@ def generate(self, **args):
         """
         default_params = asdict(self.generation_parameters)
 
-        # add ScoresProcessor to collect original scores
+        # add ScoresProcessor to collect original scores, and SanitizeLogitsProcessor
+        # to prevent inf/nan from crashing torch.multinomial during sampling
         processor = self._ScoresProcessor()
+        sanitizer = self._SanitizeLogitsProcessor()
         if "logits_processor" in args.keys():
             logits_processor = LogitsProcessorList(
-                [processor, args["logits_processor"]]
+                [sanitizer, processor, args["logits_processor"]]
             )
         else:
-            logits_processor = LogitsProcessorList([processor])
+            logits_processor = LogitsProcessorList([sanitizer, processor])
         args["logits_processor"] = logits_processor
 
         # update default parameters with passed arguments

From 4d061b5cf608231af8017f9286b2d6c2ff0ae58c Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 19:48:51 +0400
Subject: [PATCH 13/29] Increase CI timeout to 45 minutes per job

---
 .github/workflows/python-app.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 0abc8d718..fe2109ba7 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -16,6 +16,7 @@ jobs:
   build:
 
     runs-on: ubuntu-latest
+    timeout-minutes: 45
 
     strategy:
       fail-fast: false

From 37f1b1a5761b6705c7f0e9fb32c1f0ae8bdbf4a3 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 22:59:52 +0400
Subject: [PATCH 14/29] Fix logits sanitizer to use per-row max instead of
 fixed 1e4

The previous sanitizer replaced +inf with 1e4, which completely
dominated softmax and caused the model to generate the same token
repeatedly, never hitting stop_strings. This made test_just_works
take 22+ minutes on CI (vs 3 min on main) because generations
ran to max_new_tokens instead of stopping at "\n".

Now replaces inf values with the max/min finite value from the same
row, preserving the original distribution shape.

Also add per-step timeout to pytest to prevent future hangs.
---
 .github/workflows/python-app.yml |  1 +
 src/lm_polygraph/utils/model.py  | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index fe2109ba7..767c1a356 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -49,5 +49,6 @@ jobs:
       run: rm -rf $HOME/.cache
     # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again
     - name: Test with pytest
+      timeout-minutes: 30
       run: |
         pytest --ignore=test/local
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index 6d34198b7..e10a7a905 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -456,10 +456,30 @@ def __call__(self, input_ids=None, scores=None):
             return scores
 
     class _SanitizeLogitsProcessor:
-        # Replaces inf/nan in logits with large finite values to prevent
-        # RuntimeError in torch.multinomial during sampling
+        # Replaces inf/nan in logits with finite values to prevent
+        # RuntimeError in torch.multinomial during sampling.
+        # Uses per-row max/min of finite values to avoid dominating softmax.
         def __call__(self, input_ids=None, scores=None):
-            return torch.nan_to_num(scores, nan=0.0, posinf=1e4, neginf=-1e4)
+            if torch.isfinite(scores).all():
+                return scores
+            finite_mask = torch.isfinite(scores)
+            # Compute per-row max/min of finite values
+            masked = scores.clone()
+            masked[~finite_mask] = float("-inf")
+            row_max = masked.max(dim=-1, keepdim=True).values
+            masked[~finite_mask] = float("inf")
+            row_min = masked.min(dim=-1, keepdim=True).values
+            # Fallback if entire row is non-finite
+            row_max = torch.where(
+                torch.isfinite(row_max), row_max, torch.zeros_like(row_max)
+            )
+            row_min = torch.where(
+                torch.isfinite(row_min), row_min, torch.zeros_like(row_min)
+            )
+            scores = torch.where(torch.isposinf(scores), row_max, scores)
+            scores = torch.where(torch.isneginf(scores), row_min, scores)
+            scores = torch.nan_to_num(scores, nan=0.0)
+            return scores
 
     def generate(self, **args):
         """

From 112b4a4d0d64c4dd584e19cac31d45683e1f81a3 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:26:07 +0400
Subject: [PATCH 15/29] Add timing logs to diagnose slow CI tests

Stream subprocess output in tests and add per-stage timing to
polygraph_eval to identify which step is slow on CI.
---
 .github/workflows/python-app.yml |  5 ++++-
 scripts/polygraph_eval           | 13 ++++++++++---
 test/test_lm_polygraph.py        | 15 ++++++++++++++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 767c1a356..1fa94026c 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -33,6 +33,9 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
+        # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU,
+        # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU).
+        pip install "numpy<2"
         pip install .
     - name: Override transformers version
       if: matrix.transformers-version != 'default'
@@ -51,4 +54,4 @@ jobs:
     - name: Test with pytest
       timeout-minutes: 30
       run: |
-        pytest --ignore=test/local
+        pytest --ignore=test/local -s -v
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index de8b80a48..6caf2c848 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -2,6 +2,7 @@
 
 import hydra
 import os
+import time
 import transformers
 from pathlib import Path
 from omegaconf import OmegaConf
@@ -98,13 +99,15 @@ def main(args):
         log.info("=" * 100)
         log.info(f"SEED: {seed}")
 
+        t_step = time.time()
         log.info(f"Loading model {args.model.path}...")
         transformers.set_seed(seed)
 
         model = get_model(args)
 
-        log.info("Done with loading model.")
+        log.info(f"Done with loading model. ({time.time() - t_step:.1f}s)")
 
+        t_step = time.time()
         log.info(f"Loading dataset {args.dataset}...")
         dataset = Dataset.load(
             args.dataset,
@@ -125,13 +128,14 @@ def main(args):
             **cache_kwargs,
         )
 #	images=dataset.images
-        log.info("Done with loading eval data.")
+        log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)")
 
         log.info("=" * 100)
+        t_step = time.time()
         log.info("Initializing UE estimators...")
         estimators = []
         estimators += get_ue_methods(args, model)
-        log.info("Done loading UE estimators")
+        log.info(f"Done loading UE estimators ({time.time() - t_step:.1f}s)")
 
         if args.subsample_eval_dataset != -1:
             dataset.subsample(args.subsample_eval_dataset, seed=seed)
@@ -160,12 +164,15 @@ def main(args):
             log_time=getattr(args, "log_time", False),
         )
 
+        t_step = time.time()
+        log.info("Starting UEManager evaluation...")
         try:
             man()
         except Exception as e:
             man.state = "failed"
             raise e
         finally:
+            log.info(f"UEManager evaluation finished. ({time.time() - t_step:.1f}s)")
             man.save(save_path + f"/ue_manager_seed{seed}")
 
         if hasattr(args, "report_to_wandb") and args.report_to_wandb:
diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py
index da82128c2..b438e90bd 100644
--- a/test/test_lm_polygraph.py
+++ b/test/test_lm_polygraph.py
@@ -1,5 +1,7 @@
 import subprocess
 import pathlib
+import time
+import sys
 
 from lm_polygraph.utils.manager import UEManager
 
@@ -14,7 +16,18 @@
 
 
 def exec_bash(s):
-    return subprocess.run(s, shell=True)
+    print(f"\n[TIMER] Starting command: {s}", flush=True)
+    t0 = time.time()
+    proc = subprocess.Popen(
+        s, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+    )
+    for line in proc.stdout:
+        sys.stdout.write(line)
+        sys.stdout.flush()
+    proc.wait()
+    elapsed = time.time() - t0
+    print(f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True)
+    return proc
 
 
 def pwd():

From 7d8d5aa983780b1260918009d98953b0cb7d9412 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:29:32 +0400
Subject: [PATCH 16/29] Fix black formatting for timing logs

---
 scripts/polygraph_eval    | 98 ++++++++++++++++++++++-----------------
 test/test_lm_polygraph.py |  4 +-
 2 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 6caf2c848..8f0728d37 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -23,7 +23,10 @@ from lm_polygraph.generation_metrics import *
 from lm_polygraph.estimators import *
 from lm_polygraph.ue_metrics import *
 from lm_polygraph.utils.common import load_external_module, load_processor, load_image
-from lm_polygraph.utils.generation_parameters import GenerationParameters, GenerationParametersFactory
+from lm_polygraph.utils.generation_parameters import (
+    GenerationParameters,
+    GenerationParametersFactory,
+)
 from lm_polygraph.defaults.register_default_stat_calculators import (
     register_default_stat_calculators,
 )
@@ -32,7 +35,8 @@ from lm_polygraph.utils.builder_enviroment_stat_calculator import (
 )
 from lm_polygraph.utils.factory_estimator import FactoryEstimator
 from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer
-#from transformers import AutoProcessor, AutoModelForVision2Seq
+
+# from transformers import AutoProcessor, AutoModelForVision2Seq
 
 hydra_config = Path(os.environ.get("HYDRA_CONFIG", ""))
 
@@ -84,7 +88,7 @@ def main(args):
         project = os.environ["WANDB_PROJECT"]
         wandb.init(project=project, dir=save_path, config=wandb_cfg)
         wandb_save_directory(Path(save_path) / ".hydra")
-    
+
     save_path = args.save_path if "save_path" in args else save_path
     log.info(f"Main directory: {save_path}")
 
@@ -127,7 +131,7 @@ def main(args):
             trust_remote_code=getattr(args, "trust_remote_code", False),
             **cache_kwargs,
         )
-#	images=dataset.images
+        # 	images=dataset.images
         log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)")
 
         log.info("=" * 100)
@@ -160,7 +164,7 @@ def main(args):
             ],
             ignore_exceptions=args.ignore_exceptions,
             max_new_tokens=args.max_new_tokens,
-            save_stats=getattr(args, 'save_stats', []),
+            save_stats=getattr(args, "save_stats", []),
             log_time=getattr(args, "log_time", False),
         )
 
@@ -176,14 +180,12 @@ def main(args):
             man.save(save_path + f"/ue_manager_seed{seed}")
 
         if hasattr(args, "report_to_wandb") and args.report_to_wandb:
-            wandb.log({str(k) : v for k, v in man.gen_metrics})
-            wandb.log({str(k) : v for k, v in man.metrics.items()})
+            wandb.log({str(k): v for k, v in man.gen_metrics})
+            wandb.log({str(k): v for k, v in man.metrics.items()})
             wandb.save(save_path + f"/ue_manager_seed{seed}")
 
-    
     if hasattr(args, "report_to_wandb") and args.report_to_wandb:
         wandb.finish()
-        
 
 
 def get_ue_metrics(args):
@@ -204,13 +206,17 @@ def get_ue_metrics(args):
 def get_stat_calculator_names(config):
     model_type_raw = getattr(config.model, "type", "Whitebox")
     model_type = (
-        "Blackbox" if model_type_raw == "Blackbox"
-        else "VisualLM" if model_type_raw == "VisualLM"
-        else "Whitebox"
+        "Blackbox"
+        if model_type_raw == "Blackbox"
+        else "VisualLM" if model_type_raw == "VisualLM" else "Whitebox"
     )
     language = getattr(config, "language", "en")
-    output_attentions = getattr(config, "output_attentions", True) and (getattr(config.model, "type", "Whitebox") != "vLLMCausalLM")
-    output_hidden_states = False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True
+    output_attentions = getattr(config, "output_attentions", True) and (
+        getattr(config.model, "type", "Whitebox") != "vLLMCausalLM"
+    )
+    output_hidden_states = (
+        False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True
+    )
     hf_cache = getattr(config, "hf_cache", None)
     deberta_batch_size = getattr(config, "deberta_batch_size", 10)
     blackbox_supports_logprobs = model_type == "Blackbox" and getattr(
@@ -223,7 +229,7 @@ def get_stat_calculator_names(config):
             model_type,
             language,
             hf_cache,
-            output_attentions=output_attentions, 
+            output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             blackbox_supports_logprobs=blackbox_supports_logprobs,
             deberta_batch_size=deberta_batch_size,
@@ -275,7 +281,13 @@ def get_generation_metrics(args):
             ),
         ]
         if args.task == "ats":
-            result += [AlignScore(target_is_claims=False, source_ignore_regex=ignore_regex, source_as_target=True)]
+            result += [
+                AlignScore(
+                    target_is_claims=False,
+                    source_ignore_regex=ignore_regex,
+                    source_as_target=True,
+                )
+            ]
         else:
             result += [AlignScore(target_is_claims=True)]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
@@ -412,7 +424,7 @@ def get_whitebox_model(args, cache_kwargs={}):
 
     generation_params = GenerationParametersFactory.from_params(
         yaml_config=getattr(args, "generation_params", {}),
-        native_config=base_model.generation_config.to_dict()
+        native_config=base_model.generation_config.to_dict(),
     )
 
     model = WhiteboxModel(
@@ -421,7 +433,7 @@ def get_whitebox_model(args, cache_kwargs={}):
         args.model.path,
         args.model.type,
         generation_params,
-        instruct=getattr(args, "instruct", False)
+        instruct=getattr(args, "instruct", False),
     )
 
     return model
@@ -438,59 +450,59 @@ def get_visual_model(args, cache_kwargs={}):
             getattr(args, "generation_params", {}),
             device_map=args.model.load_model_args.device_map,
             add_bos_token=getattr(args.model, "add_bos_token", True),
-            **cache_kwargs
+            **cache_kwargs,
         )
 
-    path_to_load_script = get_abs_path_from_hydra_config(
-            args.model.path_to_load_script
-        )
+    path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script)
     load_module = load_external_module(path_to_load_script)
 
-    load_model_args = {'model_path': args.model.path}
+    load_model_args = {"model_path": args.model.path}
     load_model_args.update(args.model.load_model_args)
     base_model = load_module.load_model(**load_model_args)
 
-    load_tok_args = {'model_path': args.model.path}
+    load_tok_args = {"model_path": args.model.path}
     load_tok_args.update(args.model.load_tokenizer_args)
     tokenizer = load_module.load_tokenizer(**load_tok_args)
 
-    load_proc_args = {'model_path': args.model.path}
+    load_proc_args = {"model_path": args.model.path}
     load_proc_args.update(getattr(args.model, "load_processor_args", {}))
     processor = load_processor(**load_proc_args)
 
     generation_params = GenerationParametersFactory.from_params(
         yaml_config=getattr(args, "generation_params", {}),
-        native_config=base_model.generation_config.to_dict()
+        native_config=base_model.generation_config.to_dict(),
     )
 
-    model = VisualWhiteboxModel(base_model,
-                          processor,
-                          args.model.path,
-                          args.model.type,
-                          generation_params)
+    model = VisualWhiteboxModel(
+        base_model, processor, args.model.path, args.model.type, generation_params
+    )
 
     return model
 
 
 def get_vllm_model(args):
-    path_to_load_script = get_abs_path_from_hydra_config(
-            args.model.path_to_load_script
-        )
+    path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script)
     load_module = load_external_module(path_to_load_script)
 
-    load_model_args = {'model_path': args.model.path, 
-                       'max_new_tokens': args.max_new_tokens, 
-                       'logprobs': args.model.logprobs}
+    load_model_args = {
+        "model_path": args.model.path,
+        "max_new_tokens": args.max_new_tokens,
+        "logprobs": args.model.logprobs,
+    }
 
     load_model_args.update(args.model.load_model_args)
     base_model, sampling_params = load_module.load_model(**load_model_args)
-    generation_parameters = GenerationParameters(**getattr(args, "generation_params", {}))
+    generation_parameters = GenerationParameters(
+        **getattr(args, "generation_params", {})
+    )
 
-    model = WhiteboxModelvLLM(model=base_model, 
-                              sampling_params=sampling_params,
-                              generation_parameters=generation_parameters,
-                              device=args.model.device,
-                              instruct= getattr(args.model, "instruct", False))
+    model = WhiteboxModelvLLM(
+        model=base_model,
+        sampling_params=sampling_params,
+        generation_parameters=generation_parameters,
+        device=args.model.device,
+        instruct=getattr(args.model, "instruct", False),
+    )
 
     return model
 
diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py
index b438e90bd..6ce533e3b 100644
--- a/test/test_lm_polygraph.py
+++ b/test/test_lm_polygraph.py
@@ -26,7 +26,9 @@ def exec_bash(s):
         sys.stdout.flush()
     proc.wait()
     elapsed = time.time() - t0
-    print(f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True)
+    print(
+        f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True
+    )
     return proc
 
 

From 19c5748bec300ea17f6c2223f13240773fd48578 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:42:40 +0400
Subject: [PATCH 17/29] Add per-calculator timing logs to UEManager

---
 src/lm_polygraph/utils/manager.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 3c7c4e24d..461f8ebe9 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -252,16 +252,14 @@ def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> di
         """
         for stat_calculator in calculators:
             try:
-                if self.log_time:
-                    start_time = time.time()
-                    log.info(f"Calculating {stat_calculator}...")
+                start_time = time.time()
+                log.info(f"[CALC] Starting {stat_calculator.__class__.__name__}...")
                 new_stats = stat_calculator(
                     batch_stats, inp_texts, self.model, self.max_new_tokens
                 )
-                if self.log_time:
-                    log.info(
-                        f"Done calculating {stat_calculator} in {round(time.time() - start_time, 2)} secs"
-                    )
+                log.info(
+                    f"[CALC] Done {stat_calculator.__class__.__name__} in {round(time.time() - start_time, 2)}s"
+                )
                 for stat, stat_value in new_stats.items():
                     if stat in batch_stats.keys():
                         continue

From 8e3a894bec8654462cccd4312c2b5709b0aa47f8 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:44:49 +0400
Subject: [PATCH 18/29] Temporarily run only test_all_seq_ue for debugging

---
 .github/workflows/python-app.yml | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 1fa94026c..212cff702 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -18,10 +18,11 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 45
 
+    # TODO: restore matrix after debugging
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["default", "5.*"]
+        transformers-version: ["default"]
 
     steps:
     - uses: actions/checkout@v3
@@ -41,17 +42,18 @@ jobs:
       if: matrix.transformers-version != 'default'
       run: |
         pip install "transformers==${{ matrix.transformers-version }}"
-    - name: Lint
-      if: matrix.transformers-version == 'default'
-      uses: py-actions/flake8@v2
-      with:
-        args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
-        path: "."
-        plugins: "flake8-black"
+    # TODO: restore lint after debugging
+    # - name: Lint
+    #   if: matrix.transformers-version == 'default'
+    #   uses: py-actions/flake8@v2
+    #   with:
+    #     args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
+    #     path: "."
+    #     plugins: "flake8-black"
     - name: Remove cachedir in order to save up on disk
       run: rm -rf $HOME/.cache
     # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again
     - name: Test with pytest
       timeout-minutes: 30
       run: |
-        pytest --ignore=test/local -s -v
+        pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v

From d634148f4b3661a464e236752e1c7ec57230ef55 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:54:15 +0400
Subject: [PATCH 19/29] Add NLI calculator progress logging

---
 .../stat_calculators/greedy_alternatives_nli.py  | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
index 187dd62f6..af1d9c7b9 100644
--- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
+++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
@@ -1,4 +1,6 @@
+import logging
 import numpy as np
+import time
 import torch
 
 from typing import Dict, List, Tuple
@@ -10,6 +12,8 @@
 import torch.nn as nn
 import string
 
+log = logging.getLogger("lm_polygraph")
+
 
 def eval_nli_model(
     nli_queue: List[Tuple[str, str]],
@@ -69,9 +73,15 @@ def __call__(
         **kwargs,
     ) -> Dict[str, np.ndarray]:
         greedy_alternatives = dependencies["greedy_tokens_alternatives"]
+        total_samples = len(greedy_alternatives)
+        log.info(
+            f"[NLI] Processing {total_samples} samples, "
+            f"tokens per sample: {[len(s) for s in greedy_alternatives]}"
+        )
         greedy_alternatives_nli = []
-        for sample_alternatives in greedy_alternatives:
+        for sample_idx, sample_alternatives in enumerate(greedy_alternatives):
             nli_matrixes = []
+            t_sample = time.time()
             for w_number, word_alternatives in enumerate(sample_alternatives):
                 nli_queue = []
                 nli_matrix = [
@@ -107,6 +117,10 @@ def __call__(
                         nli_matrix[i][j] = nli_class[wi, wj]
 
                 nli_matrixes.append(nli_matrix)
+            log.info(
+                f"[NLI] Sample {sample_idx + 1}/{total_samples}: "
+                f"{len(sample_alternatives)} tokens in {time.time() - t_sample:.1f}s"
+            )
             greedy_alternatives_nli.append(nli_matrixes)
 
         return {"greedy_tokens_alternatives_nli": greedy_alternatives_nli}

From 5e27656deba141406b9625cf264077164338a6bd Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Wed, 8 Apr 2026 23:59:09 +0400
Subject: [PATCH 20/29] Pass generation params via GenerationConfig for
 transformers 5.x

In transformers 5.x, parameters like temperature, top_k, and top_p must
be passed via a GenerationConfig object, not as loose kwargs to
model.generate(). Without this, temperature was silently ignored,
causing the model to generate with the default temperature=1.0 instead
of the configured value. This made generations much longer (never
hitting stop conditions early), causing NLI calculators to process
far more tokens and making CI tests take 25+ minutes instead of ~7.
---
 .github/workflows/python-app.yml | 23 +++++++++--------------
 src/lm_polygraph/utils/model.py  | 26 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 212cff702..56f35fd41 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -18,11 +18,10 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 45
 
-    # TODO: restore matrix after debugging
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["default"]
+        transformers-version: ["default", "5.*"]
 
     steps:
     - uses: actions/checkout@v3
@@ -34,26 +33,22 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU,
-        # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU).
-        pip install "numpy<2"
         pip install .
     - name: Override transformers version
       if: matrix.transformers-version != 'default'
       run: |
         pip install "transformers==${{ matrix.transformers-version }}"
-    # TODO: restore lint after debugging
-    # - name: Lint
-    #   if: matrix.transformers-version == 'default'
-    #   uses: py-actions/flake8@v2
-    #   with:
-    #     args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
-    #     path: "."
-    #     plugins: "flake8-black"
+    - name: Lint
+      if: matrix.transformers-version == 'default'
+      uses: py-actions/flake8@v2
+      with:
+        args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
+        path: "."
+        plugins: "flake8-black"
     - name: Remove cachedir in order to save up on disk
       run: rm -rf $HOME/.cache
     # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again
     - name: Test with pytest
       timeout-minutes: 30
       run: |
-        pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v
+        pytest --ignore=test/local -s -v
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index e10a7a905..5904eb6b9 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -12,6 +12,7 @@
     AutoModelForSeq2SeqLM,
     AutoModelForCausalLM,
     AutoConfig,
+    GenerationConfig,
     LogitsProcessorList,
     BartForConditionalGeneration,
 )
@@ -419,9 +420,25 @@ def __init__(
         self.generation_parameters = generation_parameters
         self.instruct = instruct
 
+    # Parameters that belong in GenerationConfig, not as loose kwargs
+    _GENERATION_CONFIG_KEYS = {
+        "temperature",
+        "top_k",
+        "top_p",
+        "do_sample",
+        "num_beams",
+        "repetition_penalty",
+        "max_new_tokens",
+        "max_length",
+        "min_length",
+        "num_return_sequences",
+        "renormalize_logits",
+    }
+
     def _validate_args(self, args):
         """
         Validates and adapts arguments for WhiteboxModel generation.
+        Wraps generation parameters in a GenerationConfig for transformers 5.x compat.
 
         Parameters:
             args (dict): The arguments to validate.
@@ -444,6 +461,15 @@ def _validate_args(self, args):
         for key in keys_to_remove:
             args_copy.pop(key, None)
 
+        # Wrap generation parameters in GenerationConfig for transformers 5.x compat
+        # (transformers 5.x ignores temperature/top_k/etc. as loose kwargs)
+        gen_config_kwargs = {}
+        for key in list(args_copy.keys()):
+            if key in self._GENERATION_CONFIG_KEYS:
+                gen_config_kwargs[key] = args_copy.pop(key)
+        if gen_config_kwargs:
+            args_copy["generation_config"] = GenerationConfig(**gen_config_kwargs)
+
         return args_copy
 
     class _ScoresProcessor:

From abd79c0abbc6ac65ad16c0b06b81a79c43791b64 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:11:27 +0400
Subject: [PATCH 21/29] Revert "Pass generation params via GenerationConfig for
 transformers 5.x"

This reverts commit 5e27656deba141406b9625cf264077164338a6bd.
---
 .github/workflows/python-app.yml | 23 ++++++++++++++---------
 src/lm_polygraph/utils/model.py  | 26 --------------------------
 2 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 56f35fd41..212cff702 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -18,10 +18,11 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 45
 
+    # TODO: restore matrix after debugging
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["default", "5.*"]
+        transformers-version: ["default"]
 
     steps:
     - uses: actions/checkout@v3
@@ -33,22 +34,26 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
+        # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU,
+        # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU).
+        pip install "numpy<2"
         pip install .
     - name: Override transformers version
       if: matrix.transformers-version != 'default'
       run: |
         pip install "transformers==${{ matrix.transformers-version }}"
-    - name: Lint
-      if: matrix.transformers-version == 'default'
-      uses: py-actions/flake8@v2
-      with:
-        args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
-        path: "."
-        plugins: "flake8-black"
+    # TODO: restore lint after debugging
+    # - name: Lint
+    #   if: matrix.transformers-version == 'default'
+    #   uses: py-actions/flake8@v2
+    #   with:
+    #     args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
+    #     path: "."
+    #     plugins: "flake8-black"
     - name: Remove cachedir in order to save up on disk
       run: rm -rf $HOME/.cache
     # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again
     - name: Test with pytest
       timeout-minutes: 30
       run: |
-        pytest --ignore=test/local -s -v
+        pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index 5904eb6b9..e10a7a905 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -12,7 +12,6 @@
     AutoModelForSeq2SeqLM,
     AutoModelForCausalLM,
     AutoConfig,
-    GenerationConfig,
     LogitsProcessorList,
     BartForConditionalGeneration,
 )
@@ -420,25 +419,9 @@ def __init__(
         self.generation_parameters = generation_parameters
         self.instruct = instruct
 
-    # Parameters that belong in GenerationConfig, not as loose kwargs
-    _GENERATION_CONFIG_KEYS = {
-        "temperature",
-        "top_k",
-        "top_p",
-        "do_sample",
-        "num_beams",
-        "repetition_penalty",
-        "max_new_tokens",
-        "max_length",
-        "min_length",
-        "num_return_sequences",
-        "renormalize_logits",
-    }
-
     def _validate_args(self, args):
         """
         Validates and adapts arguments for WhiteboxModel generation.
-        Wraps generation parameters in a GenerationConfig for transformers 5.x compat.
 
         Parameters:
             args (dict): The arguments to validate.
@@ -461,15 +444,6 @@ def _validate_args(self, args):
         for key in keys_to_remove:
             args_copy.pop(key, None)
 
-        # Wrap generation parameters in GenerationConfig for transformers 5.x compat
-        # (transformers 5.x ignores temperature/top_k/etc. as loose kwargs)
-        gen_config_kwargs = {}
-        for key in list(args_copy.keys()):
-            if key in self._GENERATION_CONFIG_KEYS:
-                gen_config_kwargs[key] = args_copy.pop(key)
-        if gen_config_kwargs:
-            args_copy["generation_config"] = GenerationConfig(**gen_config_kwargs)
-
         return args_copy
 
     class _ScoresProcessor:

From 90c3ccf2d3ec0e332b3662c15f06c2908c4b728c Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:32:54 +0400
Subject: [PATCH 22/29] Revert debug changes, clean up PR

Revert all debug logging and failed fix attempts (sanitizer,
GenerationConfig wrapping, test config changes). Keep only the
core changes: relaxed spacy bound, optional comet, transformers
5.x import compat, and batch_encode_plus replacement.

CI now tests both transformers <5 and >=5 via matrix strategy.
---
 .github/workflows/python-app.yml              |  29 ++---
 scripts/polygraph_eval                        | 111 ++++++++----------
 .../greedy_alternatives_nli.py                |  16 +--
 src/lm_polygraph/utils/manager.py             |  12 +-
 src/lm_polygraph/utils/model.py               |  34 +-----
 test/configs/test_polygraph_eval_seq_ue.yaml  |   6 +-
 test/test_lm_polygraph.py                     |  17 +--
 7 files changed, 70 insertions(+), 155 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 212cff702..1de5747c1 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -18,11 +18,10 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 45
 
-    # TODO: restore matrix after debugging
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["default"]
+        transformers-version: ["<5", ">=5"]
 
     steps:
     - uses: actions/checkout@v3
@@ -34,26 +33,20 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU,
-        # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU).
-        pip install "numpy<2"
         pip install .
-    - name: Override transformers version
-      if: matrix.transformers-version != 'default'
+    - name: Pin transformers version
       run: |
-        pip install "transformers==${{ matrix.transformers-version }}"
-    # TODO: restore lint after debugging
-    # - name: Lint
-    #   if: matrix.transformers-version == 'default'
-    #   uses: py-actions/flake8@v2
-    #   with:
-    #     args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
-    #     path: "."
-    #     plugins: "flake8-black"
+        pip install "transformers${{ matrix.transformers-version }}"
+    - name: Lint
+      if: matrix.transformers-version == '<5'
+      uses: py-actions/flake8@v2
+      with:
+        args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
+        path: "."
+        plugins: "flake8-black"
     - name: Remove cachedir in order to save up on disk
       run: rm -rf $HOME/.cache
     # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again
     - name: Test with pytest
-      timeout-minutes: 30
       run: |
-        pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v
+        pytest --ignore=test/local
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 8f0728d37..de8b80a48 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -2,7 +2,6 @@
 
 import hydra
 import os
-import time
 import transformers
 from pathlib import Path
 from omegaconf import OmegaConf
@@ -23,10 +22,7 @@ from lm_polygraph.generation_metrics import *
 from lm_polygraph.estimators import *
 from lm_polygraph.ue_metrics import *
 from lm_polygraph.utils.common import load_external_module, load_processor, load_image
-from lm_polygraph.utils.generation_parameters import (
-    GenerationParameters,
-    GenerationParametersFactory,
-)
+from lm_polygraph.utils.generation_parameters import GenerationParameters, GenerationParametersFactory
 from lm_polygraph.defaults.register_default_stat_calculators import (
     register_default_stat_calculators,
 )
@@ -35,8 +31,7 @@ from lm_polygraph.utils.builder_enviroment_stat_calculator import (
 )
 from lm_polygraph.utils.factory_estimator import FactoryEstimator
 from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer
-
-# from transformers import AutoProcessor, AutoModelForVision2Seq
+#from transformers import AutoProcessor, AutoModelForVision2Seq
 
 hydra_config = Path(os.environ.get("HYDRA_CONFIG", ""))
 
@@ -88,7 +83,7 @@ def main(args):
         project = os.environ["WANDB_PROJECT"]
         wandb.init(project=project, dir=save_path, config=wandb_cfg)
         wandb_save_directory(Path(save_path) / ".hydra")
-
+    
     save_path = args.save_path if "save_path" in args else save_path
     log.info(f"Main directory: {save_path}")
 
@@ -103,15 +98,13 @@ def main(args):
         log.info("=" * 100)
         log.info(f"SEED: {seed}")
 
-        t_step = time.time()
         log.info(f"Loading model {args.model.path}...")
         transformers.set_seed(seed)
 
         model = get_model(args)
 
-        log.info(f"Done with loading model. ({time.time() - t_step:.1f}s)")
+        log.info("Done with loading model.")
 
-        t_step = time.time()
         log.info(f"Loading dataset {args.dataset}...")
         dataset = Dataset.load(
             args.dataset,
@@ -131,15 +124,14 @@ def main(args):
             trust_remote_code=getattr(args, "trust_remote_code", False),
             **cache_kwargs,
         )
-        # 	images=dataset.images
-        log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)")
+#	images=dataset.images
+        log.info("Done with loading eval data.")
 
         log.info("=" * 100)
-        t_step = time.time()
         log.info("Initializing UE estimators...")
         estimators = []
         estimators += get_ue_methods(args, model)
-        log.info(f"Done loading UE estimators ({time.time() - t_step:.1f}s)")
+        log.info("Done loading UE estimators")
 
         if args.subsample_eval_dataset != -1:
             dataset.subsample(args.subsample_eval_dataset, seed=seed)
@@ -164,28 +156,27 @@ def main(args):
             ],
             ignore_exceptions=args.ignore_exceptions,
             max_new_tokens=args.max_new_tokens,
-            save_stats=getattr(args, "save_stats", []),
+            save_stats=getattr(args, 'save_stats', []),
             log_time=getattr(args, "log_time", False),
         )
 
-        t_step = time.time()
-        log.info("Starting UEManager evaluation...")
         try:
             man()
         except Exception as e:
             man.state = "failed"
             raise e
         finally:
-            log.info(f"UEManager evaluation finished. ({time.time() - t_step:.1f}s)")
             man.save(save_path + f"/ue_manager_seed{seed}")
 
         if hasattr(args, "report_to_wandb") and args.report_to_wandb:
-            wandb.log({str(k): v for k, v in man.gen_metrics})
-            wandb.log({str(k): v for k, v in man.metrics.items()})
+            wandb.log({str(k) : v for k, v in man.gen_metrics})
+            wandb.log({str(k) : v for k, v in man.metrics.items()})
             wandb.save(save_path + f"/ue_manager_seed{seed}")
 
+    
     if hasattr(args, "report_to_wandb") and args.report_to_wandb:
         wandb.finish()
+        
 
 
 def get_ue_metrics(args):
@@ -206,17 +197,13 @@ def get_ue_metrics(args):
 def get_stat_calculator_names(config):
     model_type_raw = getattr(config.model, "type", "Whitebox")
     model_type = (
-        "Blackbox"
-        if model_type_raw == "Blackbox"
-        else "VisualLM" if model_type_raw == "VisualLM" else "Whitebox"
+        "Blackbox" if model_type_raw == "Blackbox"
+        else "VisualLM" if model_type_raw == "VisualLM"
+        else "Whitebox"
     )
     language = getattr(config, "language", "en")
-    output_attentions = getattr(config, "output_attentions", True) and (
-        getattr(config.model, "type", "Whitebox") != "vLLMCausalLM"
-    )
-    output_hidden_states = (
-        False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True
-    )
+    output_attentions = getattr(config, "output_attentions", True) and (getattr(config.model, "type", "Whitebox") != "vLLMCausalLM")
+    output_hidden_states = False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True
     hf_cache = getattr(config, "hf_cache", None)
     deberta_batch_size = getattr(config, "deberta_batch_size", 10)
     blackbox_supports_logprobs = model_type == "Blackbox" and getattr(
@@ -229,7 +216,7 @@ def get_stat_calculator_names(config):
             model_type,
             language,
             hf_cache,
-            output_attentions=output_attentions,
+            output_attentions=output_attentions, 
             output_hidden_states=output_hidden_states,
             blackbox_supports_logprobs=blackbox_supports_logprobs,
             deberta_batch_size=deberta_batch_size,
@@ -281,13 +268,7 @@ def get_generation_metrics(args):
             ),
         ]
         if args.task == "ats":
-            result += [
-                AlignScore(
-                    target_is_claims=False,
-                    source_ignore_regex=ignore_regex,
-                    source_as_target=True,
-                )
-            ]
+            result += [AlignScore(target_is_claims=False, source_ignore_regex=ignore_regex, source_as_target=True)]
         else:
             result += [AlignScore(target_is_claims=True)]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
@@ -424,7 +405,7 @@ def get_whitebox_model(args, cache_kwargs={}):
 
     generation_params = GenerationParametersFactory.from_params(
         yaml_config=getattr(args, "generation_params", {}),
-        native_config=base_model.generation_config.to_dict(),
+        native_config=base_model.generation_config.to_dict()
     )
 
     model = WhiteboxModel(
@@ -433,7 +414,7 @@ def get_whitebox_model(args, cache_kwargs={}):
         args.model.path,
         args.model.type,
         generation_params,
-        instruct=getattr(args, "instruct", False),
+        instruct=getattr(args, "instruct", False)
     )
 
     return model
@@ -450,59 +431,59 @@ def get_visual_model(args, cache_kwargs={}):
             getattr(args, "generation_params", {}),
             device_map=args.model.load_model_args.device_map,
             add_bos_token=getattr(args.model, "add_bos_token", True),
-            **cache_kwargs,
+            **cache_kwargs
         )
 
-    path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script)
+    path_to_load_script = get_abs_path_from_hydra_config(
+            args.model.path_to_load_script
+        )
     load_module = load_external_module(path_to_load_script)
 
-    load_model_args = {"model_path": args.model.path}
+    load_model_args = {'model_path': args.model.path}
     load_model_args.update(args.model.load_model_args)
     base_model = load_module.load_model(**load_model_args)
 
-    load_tok_args = {"model_path": args.model.path}
+    load_tok_args = {'model_path': args.model.path}
     load_tok_args.update(args.model.load_tokenizer_args)
     tokenizer = load_module.load_tokenizer(**load_tok_args)
 
-    load_proc_args = {"model_path": args.model.path}
+    load_proc_args = {'model_path': args.model.path}
     load_proc_args.update(getattr(args.model, "load_processor_args", {}))
     processor = load_processor(**load_proc_args)
 
     generation_params = GenerationParametersFactory.from_params(
         yaml_config=getattr(args, "generation_params", {}),
-        native_config=base_model.generation_config.to_dict(),
+        native_config=base_model.generation_config.to_dict()
     )
 
-    model = VisualWhiteboxModel(
-        base_model, processor, args.model.path, args.model.type, generation_params
-    )
+    model = VisualWhiteboxModel(base_model,
+                          processor,
+                          args.model.path,
+                          args.model.type,
+                          generation_params)
 
     return model
 
 
 def get_vllm_model(args):
-    path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script)
+    path_to_load_script = get_abs_path_from_hydra_config(
+            args.model.path_to_load_script
+        )
     load_module = load_external_module(path_to_load_script)
 
-    load_model_args = {
-        "model_path": args.model.path,
-        "max_new_tokens": args.max_new_tokens,
-        "logprobs": args.model.logprobs,
-    }
+    load_model_args = {'model_path': args.model.path, 
+                       'max_new_tokens': args.max_new_tokens, 
+                       'logprobs': args.model.logprobs}
 
     load_model_args.update(args.model.load_model_args)
     base_model, sampling_params = load_module.load_model(**load_model_args)
-    generation_parameters = GenerationParameters(
-        **getattr(args, "generation_params", {})
-    )
+    generation_parameters = GenerationParameters(**getattr(args, "generation_params", {}))
 
-    model = WhiteboxModelvLLM(
-        model=base_model,
-        sampling_params=sampling_params,
-        generation_parameters=generation_parameters,
-        device=args.model.device,
-        instruct=getattr(args.model, "instruct", False),
-    )
+    model = WhiteboxModelvLLM(model=base_model, 
+                              sampling_params=sampling_params,
+                              generation_parameters=generation_parameters,
+                              device=args.model.device,
+                              instruct= getattr(args.model, "instruct", False))
 
     return model
 
diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
index af1d9c7b9..187dd62f6 100644
--- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
+++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
@@ -1,6 +1,4 @@
-import logging
 import numpy as np
-import time
 import torch
 
 from typing import Dict, List, Tuple
@@ -12,8 +10,6 @@
 import torch.nn as nn
 import string
 
-log = logging.getLogger("lm_polygraph")
-
 
 def eval_nli_model(
     nli_queue: List[Tuple[str, str]],
@@ -73,15 +69,9 @@ def __call__(
         **kwargs,
     ) -> Dict[str, np.ndarray]:
         greedy_alternatives = dependencies["greedy_tokens_alternatives"]
-        total_samples = len(greedy_alternatives)
-        log.info(
-            f"[NLI] Processing {total_samples} samples, "
-            f"tokens per sample: {[len(s) for s in greedy_alternatives]}"
-        )
         greedy_alternatives_nli = []
-        for sample_idx, sample_alternatives in enumerate(greedy_alternatives):
+        for sample_alternatives in greedy_alternatives:
             nli_matrixes = []
-            t_sample = time.time()
             for w_number, word_alternatives in enumerate(sample_alternatives):
                 nli_queue = []
                 nli_matrix = [
@@ -117,10 +107,6 @@ def __call__(
                         nli_matrix[i][j] = nli_class[wi, wj]
 
                 nli_matrixes.append(nli_matrix)
-            log.info(
-                f"[NLI] Sample {sample_idx + 1}/{total_samples}: "
-                f"{len(sample_alternatives)} tokens in {time.time() - t_sample:.1f}s"
-            )
             greedy_alternatives_nli.append(nli_matrixes)
 
         return {"greedy_tokens_alternatives_nli": greedy_alternatives_nli}
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 461f8ebe9..3c7c4e24d 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -252,14 +252,16 @@ def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> di
         """
         for stat_calculator in calculators:
             try:
-                start_time = time.time()
-                log.info(f"[CALC] Starting {stat_calculator.__class__.__name__}...")
+                if self.log_time:
+                    start_time = time.time()
+                    log.info(f"Calculating {stat_calculator}...")
                 new_stats = stat_calculator(
                     batch_stats, inp_texts, self.model, self.max_new_tokens
                 )
-                log.info(
-                    f"[CALC] Done {stat_calculator.__class__.__name__} in {round(time.time() - start_time, 2)}s"
-                )
+                if self.log_time:
+                    log.info(
+                        f"Done calculating {stat_calculator} in {round(time.time() - start_time, 2)} secs"
+                    )
                 for stat, stat_value in new_stats.items():
                     if stat in batch_stats.keys():
                         continue
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index e10a7a905..aa06a3053 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -455,32 +455,6 @@ def __call__(self, input_ids=None, scores=None):
             self.scores.append(scores.log_softmax(-1))
             return scores
 
-    class _SanitizeLogitsProcessor:
-        # Replaces inf/nan in logits with finite values to prevent
-        # RuntimeError in torch.multinomial during sampling.
-        # Uses per-row max/min of finite values to avoid dominating softmax.
-        def __call__(self, input_ids=None, scores=None):
-            if torch.isfinite(scores).all():
-                return scores
-            finite_mask = torch.isfinite(scores)
-            # Compute per-row max/min of finite values
-            masked = scores.clone()
-            masked[~finite_mask] = float("-inf")
-            row_max = masked.max(dim=-1, keepdim=True).values
-            masked[~finite_mask] = float("inf")
-            row_min = masked.min(dim=-1, keepdim=True).values
-            # Fallback if entire row is non-finite
-            row_max = torch.where(
-                torch.isfinite(row_max), row_max, torch.zeros_like(row_max)
-            )
-            row_min = torch.where(
-                torch.isfinite(row_min), row_min, torch.zeros_like(row_min)
-            )
-            scores = torch.where(torch.isposinf(scores), row_max, scores)
-            scores = torch.where(torch.isneginf(scores), row_min, scores)
-            scores = torch.nan_to_num(scores, nan=0.0)
-            return scores
-
     def generate(self, **args):
         """
         Generates the model output with scores from batch formed by HF Tokenizer.
@@ -492,16 +466,14 @@ def generate(self, **args):
         """
         default_params = asdict(self.generation_parameters)
 
-        # add ScoresProcessor to collect original scores, and SanitizeLogitsProcessor
-        # to prevent inf/nan from crashing torch.multinomial during sampling
+        # add ScoresProcessor to collect original scores
         processor = self._ScoresProcessor()
-        sanitizer = self._SanitizeLogitsProcessor()
         if "logits_processor" in args.keys():
             logits_processor = LogitsProcessorList(
-                [sanitizer, processor, args["logits_processor"]]
+                [processor, args["logits_processor"]]
             )
         else:
-            logits_processor = LogitsProcessorList([sanitizer, processor])
+            logits_processor = LogitsProcessorList([processor])
         args["logits_processor"] = logits_processor
 
         # update default parameters with passed arguments
diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml
index 53fd97726..b259f071f 100644
--- a/test/configs/test_polygraph_eval_seq_ue.yaml
+++ b/test/configs/test_polygraph_eval_seq_ue.yaml
@@ -22,9 +22,6 @@ instruct: false
 prompt: ""
 
 ignore_exceptions: false
-
-generation_params:
-  temperature: 0.7
 generation_metrics:
   - name: RougeMetric
     args: ["rouge1"]
@@ -34,7 +31,6 @@ stat_calculators:
 
 subsample_eval_dataset: 10
 batch_size: 2
-seed:
-  - 42
+seed: null
 device: null
 max_new_tokens: 256
diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py
index 6ce533e3b..da82128c2 100644
--- a/test/test_lm_polygraph.py
+++ b/test/test_lm_polygraph.py
@@ -1,7 +1,5 @@
 import subprocess
 import pathlib
-import time
-import sys
 
 from lm_polygraph.utils.manager import UEManager
 
@@ -16,20 +14,7 @@
 
 
 def exec_bash(s):
-    print(f"\n[TIMER] Starting command: {s}", flush=True)
-    t0 = time.time()
-    proc = subprocess.Popen(
-        s, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
-    )
-    for line in proc.stdout:
-        sys.stdout.write(line)
-        sys.stdout.flush()
-    proc.wait()
-    elapsed = time.time() - t0
-    print(
-        f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True
-    )
-    return proc
+    return subprocess.run(s, shell=True)
 
 
 def pwd():

From fa46e1edb8ebdd68ce50be52419f659ca2f9003b Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:35:10 +0400
Subject: [PATCH 23/29] Pin transformers before installing lm-polygraph

---
 .github/workflows/python-app.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 1de5747c1..c15bb1286 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -33,10 +33,8 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        pip install .
-    - name: Pin transformers version
-      run: |
         pip install "transformers${{ matrix.transformers-version }}"
+        pip install .
     - name: Lint
       if: matrix.transformers-version == '<5'
       uses: py-actions/flake8@v2

From 2133b82cfba34ef4ec1d71c29629ce7c40c08c81 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:45:19 +0400
Subject: [PATCH 24/29] Use single CI job (same as main)

---
 .github/workflows/python-app.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index c15bb1286..76dc643a5 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -16,12 +16,6 @@ jobs:
   build:
 
     runs-on: ubuntu-latest
-    timeout-minutes: 45
-
-    strategy:
-      fail-fast: false
-      matrix:
-        transformers-version: ["<5", ">=5"]
 
     steps:
     - uses: actions/checkout@v3
@@ -33,10 +27,8 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        pip install "transformers${{ matrix.transformers-version }}"
         pip install .
     - name: Lint
-      if: matrix.transformers-version == '<5'
       uses: py-actions/flake8@v2
       with:
         args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"

From 45955e808bb965de355b7bf5907a478ec958da1d Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:53:44 +0400
Subject: [PATCH 25/29] Add logits sanitizer to prevent inf/nan crash in
 sampling

---
 src/lm_polygraph/utils/model.py | 38 ++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index aa06a3053..ccd1d11bc 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -455,6 +455,37 @@ def __call__(self, input_ids=None, scores=None):
             self.scores.append(scores.log_softmax(-1))
             return scores
 
+    class _SanitizeLogitsProcessor:
+        # Replaces inf/nan in logits with finite values to prevent
+        # RuntimeError in torch.multinomial during sampling.
+        # Uses per-row max/min of finite values to preserve distribution shape.
+        def __call__(self, input_ids=None, scores=None):
+            if torch.isfinite(scores).all():
+                return scores
+            finite_mask = torch.isfinite(scores)
+            scores_for_max = torch.where(
+                finite_mask,
+                scores,
+                torch.tensor(float("-inf"), dtype=scores.dtype, device=scores.device),
+            )
+            scores_for_min = torch.where(
+                finite_mask,
+                scores,
+                torch.tensor(float("inf"), dtype=scores.dtype, device=scores.device),
+            )
+            row_max = scores_for_max.max(dim=-1, keepdim=True).values
+            row_min = scores_for_min.min(dim=-1, keepdim=True).values
+            row_max = torch.where(
+                torch.isfinite(row_max), row_max, torch.zeros_like(row_max)
+            )
+            row_min = torch.where(
+                torch.isfinite(row_min), row_min, torch.zeros_like(row_min)
+            )
+            scores = torch.where(torch.isposinf(scores), row_max, scores)
+            scores = torch.where(torch.isneginf(scores), row_min, scores)
+            scores = torch.nan_to_num(scores, nan=0.0)
+            return scores
+
     def generate(self, **args):
         """
         Generates the model output with scores from batch formed by HF Tokenizer.
@@ -466,14 +497,15 @@ def generate(self, **args):
         """
         default_params = asdict(self.generation_parameters)
 
-        # add ScoresProcessor to collect original scores
+        # run SanitizeLogitsProcessor first (replaces inf/nan logits), then ScoresProcessor (records log-softmax scores)
         processor = self._ScoresProcessor()
+        sanitizer = self._SanitizeLogitsProcessor()
         if "logits_processor" in args.keys():
             logits_processor = LogitsProcessorList(
-                [processor, args["logits_processor"]]
+                [sanitizer, processor, args["logits_processor"]]
             )
         else:
-            logits_processor = LogitsProcessorList([processor])
+            logits_processor = LogitsProcessorList([sanitizer, processor])
         args["logits_processor"] = logits_processor
 
         # update default parameters with passed arguments

From 0184ad58810ffcd74aa30dee837df132cd96a34e Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 00:55:36 +0400
Subject: [PATCH 26/29] Add CI matrix: transformers <5 and >=5

---
 .github/workflows/python-app.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 76dc643a5..c15bb1286 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -16,6 +16,12 @@ jobs:
   build:
 
     runs-on: ubuntu-latest
+    timeout-minutes: 45
+
+    strategy:
+      fail-fast: false
+      matrix:
+        transformers-version: ["<5", ">=5"]
 
     steps:
     - uses: actions/checkout@v3
@@ -27,8 +33,10 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
+        pip install "transformers${{ matrix.transformers-version }}"
         pip install .
     - name: Lint
+      if: matrix.transformers-version == '<5'
       uses: py-actions/flake8@v2
       with:
         args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"

From 28dd6fb469e3ce2c00e41213e82aa02579089cb9 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 01:30:52 +0400
Subject: [PATCH 27/29] Pin transformers<5, single CI job

---
 .github/workflows/python-app.yml | 8 --------
 requirements.txt                 | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index c15bb1286..76dc643a5 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -16,12 +16,6 @@ jobs:
   build:
 
     runs-on: ubuntu-latest
-    timeout-minutes: 45
-
-    strategy:
-      fail-fast: false
-      matrix:
-        transformers-version: ["<5", ">=5"]
 
     steps:
     - uses: actions/checkout@v3
@@ -33,10 +27,8 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        pip install "transformers${{ matrix.transformers-version }}"
         pip install .
     - name: Lint
-      if: matrix.transformers-version == '<5'
       uses: py-actions/flake8@v2
       with:
         args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401"
diff --git a/requirements.txt b/requirements.txt
index 03f294b85..b4d67e210 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ matplotlib>=3.6
 pandas>=1.3.5
 torch>=2.6.0
 bs4
-transformers>=4.50.0
+transformers>=4.50.0,<5
 nltk>=3.6.5
 sacrebleu>=1.5.0
 sentencepiece>=0.2.1

From d226bc4b74dec10e8ab4ee87c9339a552ead4ad4 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 12:44:57 +0400
Subject: [PATCH 28/29] Remove transformers<5 pin, move comet to optional
 extras

- Remove transformers upper bound (compat code handles both 4.x and 5.x)
- Move unbabel-comet to [comet] extra in pyproject.toml
- Update README with two install paths (with/without comet)
- Fix black formatting in generation_metrics __init__
---
 README.md                                       | 7 ++++++-
 pyproject.toml                                  | 3 +++
 requirements.txt                                | 3 +--
 src/lm_polygraph/generation_metrics/__init__.py | 1 +
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7661e8d72..3483d484e 100644
--- a/README.md
+++ b/README.md
@@ -55,8 +55,13 @@ pip install lm-polygraph
 
 Some features require additional packages that are not installed by default:
 
-- **COMET metric** (translation evaluation): `unbabel-comet` has constrained `transformers` version requirements, so it is installed separately:
+- **COMET metric** (translation evaluation): `unbabel-comet` pins `numpy<2.0`, which may conflict with packages like vLLM. Install via extras:
   ```shell
+  pip install lm-polygraph[comet]
+  ```
+  If you need numpy 2.x (e.g., for vLLM), install without the extra and add comet manually:
+  ```shell
+  pip install lm-polygraph
   pip install unbabel-comet --no-deps
   ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 503aad05d..7f895a357 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,9 @@ classifiers = [
 "Repository" = "https://github.com/IINemo/lm-polygraph"
 "Documentation" = "https://lm-polygraph.readthedocs.io"
 
+[project.optional-dependencies]
+comet = ["unbabel-comet<3"]
+
 [tool.setuptools]
 script-files = [
   "scripts/polygraph_eval",
diff --git a/requirements.txt b/requirements.txt
index b4d67e210..5411d9864 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ matplotlib>=3.6
 pandas>=1.3.5
 torch>=2.6.0
 bs4
-transformers>=4.50.0,<5
+transformers>=4.50.0
 nltk>=3.6.5
 sacrebleu>=1.5.0
 sentencepiece>=0.2.1
@@ -29,7 +29,6 @@ openai>=1.52.0
 wget
 sentence-transformers
 bert-score>=0.3.13
-# unbabel-comet<3  # Optional, see README for installation instructions
 nltk>=3.7,<4
 evaluate>=0.4.2
 spacy>=3.4.0
diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py
index 063ce18e0..da3529cfd 100644
--- a/src/lm_polygraph/generation_metrics/__init__.py
+++ b/src/lm_polygraph/generation_metrics/__init__.py
@@ -8,6 +8,7 @@
     from .comet import Comet
 except ImportError:
     Comet = None
+
 from .alignscore import AlignScore
 from .openai_fact_check import OpenAIFactCheck
 from .bert_score import BertScoreMetric

From daae87522d629f2aa560d23d49791cb6dfc64267 Mon Sep 17 00:00:00 2001
From: Vlad Smirnov <smirnovlad03@gmail.com>
Date: Thu, 9 Apr 2026 12:46:11 +0400
Subject: [PATCH 29/29] Install lm-polygraph[comet] in CI

---
 .github/workflows/python-app.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 76dc643a5..c6ab5a3ca 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -27,7 +27,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest hydra-core==1.3.2
-        pip install .
+        pip install ".[comet]"
     - name: Lint
       uses: py-actions/flake8@v2
       with: