From a2e1eca5f15ae2cc04ae8125125d099a075b0781 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:14:48 +0400 Subject: [PATCH 01/29] Relax spacy upper bound, make unbabel-comet optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove spacy<3.8.0 upper bound: spacy 3.8+ uses thinc 8.3+/9.x which is compatible with numpy 2.x (required by vLLM and other modern ML packages). The old pin forced thinc 8.2.x → numpy 1.x, creating unresolvable conflicts with vLLM/torch/cupy. - Make unbabel-comet optional: comment out from requirements.txt and guard the import in generation_metrics/__init__.py. The Comet metric class is only used for translation evaluation and is not needed by most users. Users who need it can install separately with `pip install unbabel-comet --no-deps`. - Move `from evaluate import load` to lazy import inside Comet.__init__ so the module can be imported without unbabel-comet installed. --- requirements.txt | 4 ++-- src/lm_polygraph/generation_metrics/__init__.py | 6 +++++- src/lm_polygraph/generation_metrics/comet.py | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 520bc1750..fba670f3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,10 +29,10 @@ openai>=1.52.0 wget sentence-transformers bert-score>=0.3.13 -unbabel-comet<3 +# unbabel-comet<3 # Optional: install separately with `pip install unbabel-comet --no-deps` nltk>=3.7,<4 evaluate>=0.4.2 -spacy>=3.4.0,<3.8.0 +spacy>=3.4.0 fastchat diskcache>=5.6.3 boostedprob diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py index d9d66c958..063ce18e0 100644 --- a/src/lm_polygraph/generation_metrics/__init__.py +++ b/src/lm_polygraph/generation_metrics/__init__.py @@ -3,7 +3,11 @@ from .model_score import ModelScoreSeqMetric, ModelScoreTokenwiseMetric from .bart_score import BartScoreSeqMetric from .accuracy import AccuracyMetric -from .comet import Comet + +try: + from .comet import Comet +except ImportError: + Comet = None from .alignscore import AlignScore from .openai_fact_check import OpenAIFactCheck from .bert_score import BertScoreMetric diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py index 0fcd9b3e2..5942b2b3f 100644 --- a/src/lm_polygraph/generation_metrics/comet.py +++ b/src/lm_polygraph/generation_metrics/comet.py @@ -1,6 +1,5 @@ import re import numpy as np -from evaluate import load from typing import List, Dict from .generation_metric import GenerationMetric @@ -14,6 +13,8 @@ class Comet(GenerationMetric): def __init__(self, source_ignore_regex=None, lang="en"): super().__init__(["greedy_texts", "input_texts"], "sequence") + from evaluate import load + self.scorer = load("comet") self.source_ignore_regex = ( re.compile(source_ignore_regex) if source_ignore_regex else None From a408e7ef61ad08872d3e6536570cc1bcb67499ad Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:17:59 +0400 Subject: [PATCH 02/29] Add optional dependencies section to README for unbabel-comet --- README.md | 9 +++++++++ requirements.txt | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f9d65132..7661e8d72 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,15 @@ The latest tagged version is also available via PyPI: pip install lm-polygraph ``` +### Optional dependencies + +Some features require additional packages that are not installed by default: + +- **COMET metric** (translation evaluation): `unbabel-comet` has constrained `transformers` version requirements, so it is installed separately: + ```shell + pip install unbabel-comet --no-deps + ``` + ## Basic usage 1. Initialize the base model (encoder-decoder or decoder-only) and tokenizer from HuggingFace or a local file, and use them to initialize the WhiteboxModel for evaluation: ```python diff --git a/requirements.txt b/requirements.txt index fba670f3d..03f294b85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ openai>=1.52.0 wget sentence-transformers bert-score>=0.3.13 -# unbabel-comet<3 # Optional: install separately with `pip install unbabel-comet --no-deps` +# unbabel-comet<3 # Optional, see README for installation instructions nltk>=3.7,<4 evaluate>=0.4.2 spacy>=3.4.0 From fd22daf54ae22607b35e1e34887345e3fab3ba92 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:20:28 +0400 Subject: [PATCH 03/29] Add transformers 5.x to CI test matrix Run tests against both default transformers (from requirements.txt) and transformers 5.x to catch compatibility issues early. Lint runs only once (on default version). Relates to #445. --- .github/workflows/python-app.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 76dc643a5..0abc8d718 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -17,6 +17,11 @@ jobs: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + transformers-version: ["default", "5.*"] + steps: - uses: actions/checkout@v3 - name: Set up Python 3.12 @@ -28,7 +33,12 @@ jobs: python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 pip install . + - name: Override transformers version + if: matrix.transformers-version != 'default' + run: | + pip install "transformers==${{ matrix.transformers-version }}" - name: Lint + if: matrix.transformers-version == 'default' uses: py-actions/flake8@v2 with: args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" From 7d631ba4e532f8562403cfcf19bb1365e92e6587 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:31:06 +0400 Subject: [PATCH 04/29] Fix transformers 5.0 compatibility for ensemble utils and visual model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transformers 5.0 removed/renamed several classes: - beam_search submodule removed: BeamScorer no longer exists - Output classes renamed: - BeamSearchOutput → GenerateBeamEncoderDecoderOutput - BeamSearchDecoderOnlyOutput → GenerateBeamDecoderOnlyOutput - SampleOutput → GenerateNonBeamOutput - SampleDecoderOnlyOutput → GenerateDecoderOnlyOutput - GreedySearchOutput → GenerateNonBeamOutput - GreedySearchDecoderOnlyOutput → GenerateDecoderOnlyOutput - AutoModelForVision2Seq removed All imports now use try/except with aliases to support both 4.x and 5.x. Relates to #445. --- .../model_adapters/visual_whitebox_model.py | 7 ++++- .../utils/ensemble_utils/ensemble_beam.py | 26 ++++++++++++++----- .../utils/ensemble_utils/ensemble_greedy.py | 18 +++++++++---- .../utils/ensemble_utils/ensemble_sample.py | 18 +++++++++---- 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/lm_polygraph/model_adapters/visual_whitebox_model.py b/src/lm_polygraph/model_adapters/visual_whitebox_model.py index 9601bca27..3675294f5 100644 --- a/src/lm_polygraph/model_adapters/visual_whitebox_model.py +++ b/src/lm_polygraph/model_adapters/visual_whitebox_model.py @@ -8,12 +8,17 @@ import torch from PIL import Image from transformers import ( - AutoModelForVision2Seq, AutoProcessor, GenerationConfig, LogitsProcessorList, ) +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # transformers >= 5.0 removed AutoModelForVision2Seq + AutoModelForVision2Seq = None + from lm_polygraph.utils.generation_parameters import GenerationParameters from lm_polygraph.utils.dataset import Dataset from lm_polygraph.utils.model import Model diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py index 94f180daa..2ed5d2731 100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py @@ -8,7 +8,13 @@ from torch import nn from transformers import GenerationMixin -from transformers.generation.beam_search import BeamScorer + +try: + from transformers.generation.beam_search import BeamScorer +except ImportError: + # transformers >= 5.0 removed BeamScorer entirely + BeamScorer = None + from transformers.generation.logits_process import ( LogitsProcessorList, ) @@ -16,11 +22,19 @@ StoppingCriteriaList, validate_stopping_criteria, ) -from transformers.generation.utils import ( - BeamSearchOutput, - BeamSearchDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + BeamSearchOutput, + BeamSearchDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateBeamEncoderDecoderOutput as BeamSearchOutput, + GenerateBeamDecoderOnlyOutput as BeamSearchDecoderOnlyOutput, + ) class EnsembleBeamSearchMixin(GenerationMixin): diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py index db7b31cf1..027d9eec9 100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py @@ -15,11 +15,19 @@ validate_stopping_criteria, ) from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import ( - GreedySearchOutput, - GreedySearchDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + GreedySearchOutput, + GreedySearchDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateNonBeamOutput as GreedySearchOutput, + GenerateDecoderOnlyOutput as GreedySearchDecoderOnlyOutput, + ) class EnsembleGreedyMixin(GenerationMixin): diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py index 07b142dc8..1106171e6 100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py @@ -16,11 +16,19 @@ validate_stopping_criteria, ) from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import ( - SampleOutput, - SampleDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + SampleOutput, + SampleDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateNonBeamOutput as SampleOutput, + GenerateDecoderOnlyOutput as SampleDecoderOnlyOutput, + ) class EnsembleSampleMixin(GenerationMixin): From 781bb2115d1614c4c7d0e03297902c9d4977ed3c Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:38:24 +0400 Subject: [PATCH 05/29] Use AutoModelForImageTextToText as fallback for AutoModelForVision2Seq --- src/lm_polygraph/model_adapters/visual_whitebox_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/model_adapters/visual_whitebox_model.py b/src/lm_polygraph/model_adapters/visual_whitebox_model.py index 3675294f5..0b674712f 100644 --- a/src/lm_polygraph/model_adapters/visual_whitebox_model.py +++ b/src/lm_polygraph/model_adapters/visual_whitebox_model.py @@ -16,8 +16,8 @@ try: from transformers import AutoModelForVision2Seq except ImportError: - # transformers >= 5.0 removed AutoModelForVision2Seq - AutoModelForVision2Seq = None + # transformers >= 5.0 renamed AutoModelForVision2Seq → AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq from lm_polygraph.utils.generation_parameters import GenerationParameters from lm_polygraph.utils.dataset import Dataset From 448485c8d881460a88a317956070104ed4316722 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:46:24 +0400 Subject: [PATCH 06/29] Replace batch_encode_plus with direct tokenizer call batch_encode_plus was removed from newer transformers tokenizers. The direct __call__ (tokenizer(...)) is equivalent and works on all versions. --- src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py | 2 +- src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py | 4 ++-- src/lm_polygraph/stat_calculators/semantic_matrix.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py index bb0cfe3b2..187dd62f6 100644 --- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py +++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py @@ -22,7 +22,7 @@ def eval_nli_model( with torch.no_grad(): for k in range(0, len(nli_set), deberta.batch_size): batch = nli_set[k : k + deberta.batch_size] - encoded = deberta.deberta_tokenizer.batch_encode_plus( + encoded = deberta.deberta_tokenizer( batch, padding=True, return_tensors="pt" ).to(deberta.device) logits = deberta.deberta(**encoded).logits.detach() diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index 07d458d5a..5843416b6 100644 --- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -47,14 +47,14 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs): probs_b = [] for first_texts, second_texts in tqdm(dl): batch = list(zip(first_texts, second_texts)) - encoded = tokenizer.batch_encode_plus( + encoded = tokenizer( batch, padding=True, return_tensors="pt" ).to(deberta.device) logits = deberta.deberta(**encoded).logits probs_f.append(softmax(logits)) batch = list(zip(second_texts, first_texts)) - encoded = tokenizer.batch_encode_plus( + encoded = tokenizer( batch, padding=True, return_tensors="pt" ).to(deberta.device) logits = deberta.deberta(**encoded).logits diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 37bb969a5..14a9fb5ce 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -102,7 +102,7 @@ def __call__( logits_all = [] for first_texts, second_texts in dl: batch = list(zip(first_texts, second_texts)) - encoded = tokenizer.batch_encode_plus( + encoded = tokenizer( batch, padding=True, return_tensors="pt" ).to(device) logits = deberta.deberta(**encoded).logits From 0630902fc52622f010b09aef4ef99bae035a9494 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 16:52:37 +0400 Subject: [PATCH 07/29] Fix black formatting for tokenizer calls --- .../stat_calculators/greedy_semantic_matrix.py | 12 ++++++------ src/lm_polygraph/stat_calculators/semantic_matrix.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index 5843416b6..1f3ccbe96 100644 --- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -47,16 +47,16 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs): probs_b = [] for first_texts, second_texts in tqdm(dl): batch = list(zip(first_texts, second_texts)) - encoded = tokenizer( - batch, padding=True, return_tensors="pt" - ).to(deberta.device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + deberta.device + ) logits = deberta.deberta(**encoded).logits probs_f.append(softmax(logits)) batch = list(zip(second_texts, first_texts)) - encoded = tokenizer( - batch, padding=True, return_tensors="pt" - ).to(deberta.device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + deberta.device + ) logits = deberta.deberta(**encoded).logits probs_b.append(softmax(logits)) diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 14a9fb5ce..b03d4967b 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -102,9 +102,9 @@ def __call__( logits_all = [] for first_texts, second_texts in dl: batch = list(zip(first_texts, second_texts)) - encoded = tokenizer( - batch, padding=True, return_tensors="pt" - ).to(device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + device + ) logits = deberta.deberta(**encoded).logits probs.append(softmax(logits)) logits_all.append(logits) From eac1c35077083f634c6733aeebea21d2dd1a04ad Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 17:15:15 +0400 Subject: [PATCH 08/29] Set seed=42 in test_polygraph_eval_seq_ue config The test_all_seq_ue test uses do_sample=True for sampling-based estimators (MonteCarloSequenceEntropy, PTrueSampling, etc.). With seed=null, torch.multinomial occasionally fails with "probability tensor contains inf, nan or element < 0" due to non-deterministic logit values from bloomz-560m on CPU. Setting a fixed seed makes the test deterministic and reproducible. --- test/configs/test_polygraph_eval_seq_ue.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml index b259f071f..cd15fb68e 100644 --- a/test/configs/test_polygraph_eval_seq_ue.yaml +++ b/test/configs/test_polygraph_eval_seq_ue.yaml @@ -31,6 +31,6 @@ stat_calculators: subsample_eval_dataset: 10 batch_size: 2 -seed: null +seed: 42 device: null max_new_tokens: 256 From d9076fcc5cab6c84a2cb39fca2bf53eec29b1ed6 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 17:26:41 +0400 Subject: [PATCH 09/29] Fix seed format: must be a list for polygraph_eval --- test/configs/test_polygraph_eval_seq_ue.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml index cd15fb68e..97158bad6 100644 --- a/test/configs/test_polygraph_eval_seq_ue.yaml +++ b/test/configs/test_polygraph_eval_seq_ue.yaml @@ -31,6 +31,7 @@ stat_calculators: subsample_eval_dataset: 10 batch_size: 2 -seed: 42 +seed: + - 42 device: null max_new_tokens: 256 From c00891799748bad1466c4a76777309569bb85778 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 17:44:18 +0400 Subject: [PATCH 10/29] Set temperature=0.7 in seq_ue test to stabilize sampling --- test/configs/test_polygraph_eval_seq_ue.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml index 97158bad6..53fd97726 100644 --- a/test/configs/test_polygraph_eval_seq_ue.yaml +++ b/test/configs/test_polygraph_eval_seq_ue.yaml @@ -22,6 +22,9 @@ instruct: false prompt: "" ignore_exceptions: false + +generation_params: + temperature: 0.7 generation_metrics: - name: RougeMetric args: ["rouge1"] From a84359f7f8ac099c4d36f91c6b318871e1a962d2 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 18:01:58 +0400 Subject: [PATCH 11/29] Add renormalize_logits=True to sampling to prevent inf/nan in multinomial --- src/lm_polygraph/stat_calculators/sample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index 05b2b854b..d129b2799 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -185,6 +185,7 @@ def __call__( max_new_tokens=max_new_tokens, min_new_tokens=2, do_sample=True, + renormalize_logits=True, num_beams=1, num_return_sequences=1, suppress_tokens=( From aea48e2b781e9a615ef3a5d9eede93c951f75253 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 18:28:36 +0400 Subject: [PATCH 12/29] Add logits sanitizer to prevent inf/nan crash in sampling Different numpy versions (1.x vs 2.x) can cause bloomz-560m to produce inf/nan logits on CPU, crashing torch.multinomial. Add _SanitizeLogitsProcessor that clamps inf/nan to finite values before scoring and sampling. Runs first in the logits processor chain. --- src/lm_polygraph/stat_calculators/sample.py | 1 - src/lm_polygraph/utils/model.py | 14 +++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index d129b2799..05b2b854b 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -185,7 +185,6 @@ def __call__( max_new_tokens=max_new_tokens, min_new_tokens=2, do_sample=True, - renormalize_logits=True, num_beams=1, num_return_sequences=1, suppress_tokens=( diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index aa06a3053..6d34198b7 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -455,6 +455,12 @@ def __call__(self, input_ids=None, scores=None): self.scores.append(scores.log_softmax(-1)) return scores + class _SanitizeLogitsProcessor: + # Replaces inf/nan in logits with large finite values to prevent + # RuntimeError in torch.multinomial during sampling + def __call__(self, input_ids=None, scores=None): + return torch.nan_to_num(scores, nan=0.0, posinf=1e4, neginf=-1e4) + def generate(self, **args): """ Generates the model output with scores from batch formed by HF Tokenizer. @@ -466,14 +472,16 @@ def generate(self, **args): """ default_params = asdict(self.generation_parameters) - # add ScoresProcessor to collect original scores + # add ScoresProcessor to collect original scores, and SanitizeLogitsProcessor + # to prevent inf/nan from crashing torch.multinomial during sampling processor = self._ScoresProcessor() + sanitizer = self._SanitizeLogitsProcessor() if "logits_processor" in args.keys(): logits_processor = LogitsProcessorList( - [processor, args["logits_processor"]] + [sanitizer, processor, args["logits_processor"]] ) else: - logits_processor = LogitsProcessorList([processor]) + logits_processor = LogitsProcessorList([sanitizer, processor]) args["logits_processor"] = logits_processor # update default parameters with passed arguments From 4d061b5cf608231af8017f9286b2d6c2ff0ae58c Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 19:48:51 +0400 Subject: [PATCH 13/29] Increase CI timeout to 45 minutes per job --- .github/workflows/python-app.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 0abc8d718..fe2109ba7 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,6 +16,7 @@ jobs: build: runs-on: ubuntu-latest + timeout-minutes: 45 strategy: fail-fast: false From 37f1b1a5761b6705c7f0e9fb32c1f0ae8bdbf4a3 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 22:59:52 +0400 Subject: [PATCH 14/29] Fix logits sanitizer to use per-row max instead of fixed 1e4 The previous sanitizer replaced +inf with 1e4, which completely dominated softmax and caused the model to generate the same token repeatedly, never hitting stop_strings. This made test_just_works take 22+ minutes on CI (vs 3 min on main) because generations ran to max_new_tokens instead of stopping at "\n". Now replaces inf values with the max/min finite value from the same row, preserving the original distribution shape. Also add per-step timeout to pytest to prevent future hangs. --- .github/workflows/python-app.yml | 1 + src/lm_polygraph/utils/model.py | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index fe2109ba7..767c1a356 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -49,5 +49,6 @@ jobs: run: rm -rf $HOME/.cache # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again - name: Test with pytest + timeout-minutes: 30 run: | pytest --ignore=test/local diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index 6d34198b7..e10a7a905 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -456,10 +456,30 @@ def __call__(self, input_ids=None, scores=None): return scores class _SanitizeLogitsProcessor: - # Replaces inf/nan in logits with large finite values to prevent - # RuntimeError in torch.multinomial during sampling + # Replaces inf/nan in logits with finite values to prevent + # RuntimeError in torch.multinomial during sampling. + # Uses per-row max/min of finite values to avoid dominating softmax. def __call__(self, input_ids=None, scores=None): - return torch.nan_to_num(scores, nan=0.0, posinf=1e4, neginf=-1e4) + if torch.isfinite(scores).all(): + return scores + finite_mask = torch.isfinite(scores) + # Compute per-row max/min of finite values + masked = scores.clone() + masked[~finite_mask] = float("-inf") + row_max = masked.max(dim=-1, keepdim=True).values + masked[~finite_mask] = float("inf") + row_min = masked.min(dim=-1, keepdim=True).values + # Fallback if entire row is non-finite + row_max = torch.where( + torch.isfinite(row_max), row_max, torch.zeros_like(row_max) + ) + row_min = torch.where( + torch.isfinite(row_min), row_min, torch.zeros_like(row_min) + ) + scores = torch.where(torch.isposinf(scores), row_max, scores) + scores = torch.where(torch.isneginf(scores), row_min, scores) + scores = torch.nan_to_num(scores, nan=0.0) + return scores def generate(self, **args): """ From 112b4a4d0d64c4dd584e19cac31d45683e1f81a3 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:26:07 +0400 Subject: [PATCH 15/29] Add timing logs to diagnose slow CI tests Stream subprocess output in tests and add per-stage timing to polygraph_eval to identify which step is slow on CI. --- .github/workflows/python-app.yml | 5 ++++- scripts/polygraph_eval | 13 ++++++++++--- test/test_lm_polygraph.py | 15 ++++++++++++++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 767c1a356..1fa94026c 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -33,6 +33,9 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 + # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU, + # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU). + pip install "numpy<2" pip install . - name: Override transformers version if: matrix.transformers-version != 'default' @@ -51,4 +54,4 @@ jobs: - name: Test with pytest timeout-minutes: 30 run: | - pytest --ignore=test/local + pytest --ignore=test/local -s -v diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index de8b80a48..6caf2c848 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -2,6 +2,7 @@ import hydra import os +import time import transformers from pathlib import Path from omegaconf import OmegaConf @@ -98,13 +99,15 @@ def main(args): log.info("=" * 100) log.info(f"SEED: {seed}") + t_step = time.time() log.info(f"Loading model {args.model.path}...") transformers.set_seed(seed) model = get_model(args) - log.info("Done with loading model.") + log.info(f"Done with loading model. ({time.time() - t_step:.1f}s)") + t_step = time.time() log.info(f"Loading dataset {args.dataset}...") dataset = Dataset.load( args.dataset, @@ -125,13 +128,14 @@ def main(args): **cache_kwargs, ) # images=dataset.images - log.info("Done with loading eval data.") + log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)") log.info("=" * 100) + t_step = time.time() log.info("Initializing UE estimators...") estimators = [] estimators += get_ue_methods(args, model) - log.info("Done loading UE estimators") + log.info(f"Done loading UE estimators ({time.time() - t_step:.1f}s)") if args.subsample_eval_dataset != -1: dataset.subsample(args.subsample_eval_dataset, seed=seed) @@ -160,12 +164,15 @@ def main(args): log_time=getattr(args, "log_time", False), ) + t_step = time.time() + log.info("Starting UEManager evaluation...") try: man() except Exception as e: man.state = "failed" raise e finally: + log.info(f"UEManager evaluation finished. ({time.time() - t_step:.1f}s)") man.save(save_path + f"/ue_manager_seed{seed}") if hasattr(args, "report_to_wandb") and args.report_to_wandb: diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py index da82128c2..b438e90bd 100644 --- a/test/test_lm_polygraph.py +++ b/test/test_lm_polygraph.py @@ -1,5 +1,7 @@ import subprocess import pathlib +import time +import sys from lm_polygraph.utils.manager import UEManager @@ -14,7 +16,18 @@ def exec_bash(s): - return subprocess.run(s, shell=True) + print(f"\n[TIMER] Starting command: {s}", flush=True) + t0 = time.time() + proc = subprocess.Popen( + s, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + ) + for line in proc.stdout: + sys.stdout.write(line) + sys.stdout.flush() + proc.wait() + elapsed = time.time() - t0 + print(f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True) + return proc def pwd(): From 7d8d5aa983780b1260918009d98953b0cb7d9412 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:29:32 +0400 Subject: [PATCH 16/29] Fix black formatting for timing logs --- scripts/polygraph_eval | 98 ++++++++++++++++++++++----------------- test/test_lm_polygraph.py | 4 +- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 6caf2c848..8f0728d37 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -23,7 +23,10 @@ from lm_polygraph.generation_metrics import * from lm_polygraph.estimators import * from lm_polygraph.ue_metrics import * from lm_polygraph.utils.common import load_external_module, load_processor, load_image -from lm_polygraph.utils.generation_parameters import GenerationParameters, GenerationParametersFactory +from lm_polygraph.utils.generation_parameters import ( + GenerationParameters, + GenerationParametersFactory, +) from lm_polygraph.defaults.register_default_stat_calculators import ( register_default_stat_calculators, ) @@ -32,7 +35,8 @@ from lm_polygraph.utils.builder_enviroment_stat_calculator import ( ) from lm_polygraph.utils.factory_estimator import FactoryEstimator from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer -#from transformers import AutoProcessor, AutoModelForVision2Seq + +# from transformers import AutoProcessor, AutoModelForVision2Seq hydra_config = Path(os.environ.get("HYDRA_CONFIG", "")) @@ -84,7 +88,7 @@ def main(args): project = os.environ["WANDB_PROJECT"] wandb.init(project=project, dir=save_path, config=wandb_cfg) wandb_save_directory(Path(save_path) / ".hydra") - + save_path = args.save_path if "save_path" in args else save_path log.info(f"Main directory: {save_path}") @@ -127,7 +131,7 @@ def main(args): trust_remote_code=getattr(args, "trust_remote_code", False), **cache_kwargs, ) -# images=dataset.images + # images=dataset.images log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)") log.info("=" * 100) @@ -160,7 +164,7 @@ def main(args): ], ignore_exceptions=args.ignore_exceptions, max_new_tokens=args.max_new_tokens, - save_stats=getattr(args, 'save_stats', []), + save_stats=getattr(args, "save_stats", []), log_time=getattr(args, "log_time", False), ) @@ -176,14 +180,12 @@ def main(args): man.save(save_path + f"/ue_manager_seed{seed}") if hasattr(args, "report_to_wandb") and args.report_to_wandb: - wandb.log({str(k) : v for k, v in man.gen_metrics}) - wandb.log({str(k) : v for k, v in man.metrics.items()}) + wandb.log({str(k): v for k, v in man.gen_metrics}) + wandb.log({str(k): v for k, v in man.metrics.items()}) wandb.save(save_path + f"/ue_manager_seed{seed}") - if hasattr(args, "report_to_wandb") and args.report_to_wandb: wandb.finish() - def get_ue_metrics(args): @@ -204,13 +206,17 @@ def get_ue_metrics(args): def get_stat_calculator_names(config): model_type_raw = getattr(config.model, "type", "Whitebox") model_type = ( - "Blackbox" if model_type_raw == "Blackbox" - else "VisualLM" if model_type_raw == "VisualLM" - else "Whitebox" + "Blackbox" + if model_type_raw == "Blackbox" + else "VisualLM" if model_type_raw == "VisualLM" else "Whitebox" ) language = getattr(config, "language", "en") - output_attentions = getattr(config, "output_attentions", True) and (getattr(config.model, "type", "Whitebox") != "vLLMCausalLM") - output_hidden_states = False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True + output_attentions = getattr(config, "output_attentions", True) and ( + getattr(config.model, "type", "Whitebox") != "vLLMCausalLM" + ) + output_hidden_states = ( + False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True + ) hf_cache = getattr(config, "hf_cache", None) deberta_batch_size = getattr(config, "deberta_batch_size", 10) blackbox_supports_logprobs = model_type == "Blackbox" and getattr( @@ -223,7 +229,7 @@ def get_stat_calculator_names(config): model_type, language, hf_cache, - output_attentions=output_attentions, + output_attentions=output_attentions, output_hidden_states=output_hidden_states, blackbox_supports_logprobs=blackbox_supports_logprobs, deberta_batch_size=deberta_batch_size, @@ -275,7 +281,13 @@ def get_generation_metrics(args): ), ] if args.task == "ats": - result += [AlignScore(target_is_claims=False, source_ignore_regex=ignore_regex, source_as_target=True)] + result += [ + AlignScore( + target_is_claims=False, + source_ignore_regex=ignore_regex, + source_as_target=True, + ) + ] else: result += [AlignScore(target_is_claims=True)] if getattr(args.model, "type", "Whitebox") != "Blackbox": @@ -412,7 +424,7 @@ def get_whitebox_model(args, cache_kwargs={}): generation_params = GenerationParametersFactory.from_params( yaml_config=getattr(args, "generation_params", {}), - native_config=base_model.generation_config.to_dict() + native_config=base_model.generation_config.to_dict(), ) model = WhiteboxModel( @@ -421,7 +433,7 @@ def get_whitebox_model(args, cache_kwargs={}): args.model.path, args.model.type, generation_params, - instruct=getattr(args, "instruct", False) + instruct=getattr(args, "instruct", False), ) return model @@ -438,59 +450,59 @@ def get_visual_model(args, cache_kwargs={}): getattr(args, "generation_params", {}), device_map=args.model.load_model_args.device_map, add_bos_token=getattr(args.model, "add_bos_token", True), - **cache_kwargs + **cache_kwargs, ) - path_to_load_script = get_abs_path_from_hydra_config( - args.model.path_to_load_script - ) + path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script) load_module = load_external_module(path_to_load_script) - load_model_args = {'model_path': args.model.path} + load_model_args = {"model_path": args.model.path} load_model_args.update(args.model.load_model_args) base_model = load_module.load_model(**load_model_args) - load_tok_args = {'model_path': args.model.path} + load_tok_args = {"model_path": args.model.path} load_tok_args.update(args.model.load_tokenizer_args) tokenizer = load_module.load_tokenizer(**load_tok_args) - load_proc_args = {'model_path': args.model.path} + load_proc_args = {"model_path": args.model.path} load_proc_args.update(getattr(args.model, "load_processor_args", {})) processor = load_processor(**load_proc_args) generation_params = GenerationParametersFactory.from_params( yaml_config=getattr(args, "generation_params", {}), - native_config=base_model.generation_config.to_dict() + native_config=base_model.generation_config.to_dict(), ) - model = VisualWhiteboxModel(base_model, - processor, - args.model.path, - args.model.type, - generation_params) + model = VisualWhiteboxModel( + base_model, processor, args.model.path, args.model.type, generation_params + ) return model def get_vllm_model(args): - path_to_load_script = get_abs_path_from_hydra_config( - args.model.path_to_load_script - ) + path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script) load_module = load_external_module(path_to_load_script) - load_model_args = {'model_path': args.model.path, - 'max_new_tokens': args.max_new_tokens, - 'logprobs': args.model.logprobs} + load_model_args = { + "model_path": args.model.path, + "max_new_tokens": args.max_new_tokens, + "logprobs": args.model.logprobs, + } load_model_args.update(args.model.load_model_args) base_model, sampling_params = load_module.load_model(**load_model_args) - generation_parameters = GenerationParameters(**getattr(args, "generation_params", {})) + generation_parameters = GenerationParameters( + **getattr(args, "generation_params", {}) + ) - model = WhiteboxModelvLLM(model=base_model, - sampling_params=sampling_params, - generation_parameters=generation_parameters, - device=args.model.device, - instruct= getattr(args.model, "instruct", False)) + model = WhiteboxModelvLLM( + model=base_model, + sampling_params=sampling_params, + generation_parameters=generation_parameters, + device=args.model.device, + instruct=getattr(args.model, "instruct", False), + ) return model diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py index b438e90bd..6ce533e3b 100644 --- a/test/test_lm_polygraph.py +++ b/test/test_lm_polygraph.py @@ -26,7 +26,9 @@ def exec_bash(s): sys.stdout.flush() proc.wait() elapsed = time.time() - t0 - print(f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True) + print( + f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True + ) return proc From 19c5748bec300ea17f6c2223f13240773fd48578 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:42:40 +0400 Subject: [PATCH 17/29] Add per-calculator timing logs to UEManager --- src/lm_polygraph/utils/manager.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 3c7c4e24d..461f8ebe9 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -252,16 +252,14 @@ def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> di """ for stat_calculator in calculators: try: - if self.log_time: - start_time = time.time() - log.info(f"Calculating {stat_calculator}...") + start_time = time.time() + log.info(f"[CALC] Starting {stat_calculator.__class__.__name__}...") new_stats = stat_calculator( batch_stats, inp_texts, self.model, self.max_new_tokens ) - if self.log_time: - log.info( - f"Done calculating {stat_calculator} in {round(time.time() - start_time, 2)} secs" - ) + log.info( + f"[CALC] Done {stat_calculator.__class__.__name__} in {round(time.time() - start_time, 2)}s" + ) for stat, stat_value in new_stats.items(): if stat in batch_stats.keys(): continue From 8e3a894bec8654462cccd4312c2b5709b0aa47f8 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:44:49 +0400 Subject: [PATCH 18/29] Temporarily run only test_all_seq_ue for debugging --- .github/workflows/python-app.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 1fa94026c..212cff702 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,10 +18,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 + # TODO: restore matrix after debugging strategy: fail-fast: false matrix: - transformers-version: ["default", "5.*"] + transformers-version: ["default"] steps: - uses: actions/checkout@v3 @@ -41,17 +42,18 @@ jobs: if: matrix.transformers-version != 'default' run: | pip install "transformers==${{ matrix.transformers-version }}" - - name: Lint - if: matrix.transformers-version == 'default' - uses: py-actions/flake8@v2 - with: - args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" - path: "." - plugins: "flake8-black" + # TODO: restore lint after debugging + # - name: Lint + # if: matrix.transformers-version == 'default' + # uses: py-actions/flake8@v2 + # with: + # args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" + # path: "." + # plugins: "flake8-black" - name: Remove cachedir in order to save up on disk run: rm -rf $HOME/.cache # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again - name: Test with pytest timeout-minutes: 30 run: | - pytest --ignore=test/local -s -v + pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v From d634148f4b3661a464e236752e1c7ec57230ef55 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:54:15 +0400 Subject: [PATCH 19/29] Add NLI calculator progress logging --- .../stat_calculators/greedy_alternatives_nli.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py index 187dd62f6..af1d9c7b9 100644 --- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py +++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py @@ -1,4 +1,6 @@ +import logging import numpy as np +import time import torch from typing import Dict, List, Tuple @@ -10,6 +12,8 @@ import torch.nn as nn import string +log = logging.getLogger("lm_polygraph") + def eval_nli_model( nli_queue: List[Tuple[str, str]], @@ -69,9 +73,15 @@ def __call__( **kwargs, ) -> Dict[str, np.ndarray]: greedy_alternatives = dependencies["greedy_tokens_alternatives"] + total_samples = len(greedy_alternatives) + log.info( + f"[NLI] Processing {total_samples} samples, " + f"tokens per sample: {[len(s) for s in greedy_alternatives]}" + ) greedy_alternatives_nli = [] - for sample_alternatives in greedy_alternatives: + for sample_idx, sample_alternatives in enumerate(greedy_alternatives): nli_matrixes = [] + t_sample = time.time() for w_number, word_alternatives in enumerate(sample_alternatives): nli_queue = [] nli_matrix = [ @@ -107,6 +117,10 @@ def __call__( nli_matrix[i][j] = nli_class[wi, wj] nli_matrixes.append(nli_matrix) + log.info( + f"[NLI] Sample {sample_idx + 1}/{total_samples}: " + f"{len(sample_alternatives)} tokens in {time.time() - t_sample:.1f}s" + ) greedy_alternatives_nli.append(nli_matrixes) return {"greedy_tokens_alternatives_nli": greedy_alternatives_nli} From 5e27656deba141406b9625cf264077164338a6bd Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Wed, 8 Apr 2026 23:59:09 +0400 Subject: [PATCH 20/29] Pass generation params via GenerationConfig for transformers 5.x In transformers 5.x, parameters like temperature, top_k, top_p must be passed via GenerationConfig object, not as loose kwargs to model.generate(). Without this, temperature was silently ignored, causing the model to generate with default temperature=1.0 instead of the configured value. This made generations much longer (never hitting stop conditions early), causing NLI calculators to process far more tokens and making CI tests take 25+ minutes instead of ~7. --- .github/workflows/python-app.yml | 23 +++++++++-------------- src/lm_polygraph/utils/model.py | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 212cff702..56f35fd41 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,11 +18,10 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 - # TODO: restore matrix after debugging strategy: fail-fast: false matrix: - transformers-version: ["default"] + transformers-version: ["default", "5.*"] steps: - uses: actions/checkout@v3 @@ -34,26 +33,22 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU, - # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU). - pip install "numpy<2" pip install . - name: Override transformers version if: matrix.transformers-version != 'default' run: | pip install "transformers==${{ matrix.transformers-version }}" - # TODO: restore lint after debugging - # - name: Lint - # if: matrix.transformers-version == 'default' - # uses: py-actions/flake8@v2 - # with: - # args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" - # path: "." - # plugins: "flake8-black" + - name: Lint + if: matrix.transformers-version == 'default' + uses: py-actions/flake8@v2 + with: + args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" + path: "." + plugins: "flake8-black" - name: Remove cachedir in order to save up on disk run: rm -rf $HOME/.cache # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again - name: Test with pytest timeout-minutes: 30 run: | - pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v + pytest --ignore=test/local -s -v diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index e10a7a905..5904eb6b9 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -12,6 +12,7 @@ AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoConfig, + GenerationConfig, LogitsProcessorList, BartForConditionalGeneration, ) @@ -419,9 +420,25 @@ def __init__( self.generation_parameters = generation_parameters self.instruct = instruct + # Parameters that belong in GenerationConfig, not as loose kwargs + _GENERATION_CONFIG_KEYS = { + "temperature", + "top_k", + "top_p", + "do_sample", + "num_beams", + "repetition_penalty", + "max_new_tokens", + "max_length", + "min_length", + "num_return_sequences", + "renormalize_logits", + } + def _validate_args(self, args): """ Validates and adapts arguments for WhiteboxModel generation. + Wraps generation parameters in a GenerationConfig for transformers 5.x compat. Parameters: args (dict): The arguments to validate. @@ -444,6 +461,15 @@ def _validate_args(self, args): for key in keys_to_remove: args_copy.pop(key, None) + # Wrap generation parameters in GenerationConfig for transformers 5.x compat + # (transformers 5.x ignores temperature/top_k/etc. as loose kwargs) + gen_config_kwargs = {} + for key in list(args_copy.keys()): + if key in self._GENERATION_CONFIG_KEYS: + gen_config_kwargs[key] = args_copy.pop(key) + if gen_config_kwargs: + args_copy["generation_config"] = GenerationConfig(**gen_config_kwargs) + return args_copy class _ScoresProcessor: From abd79c0abbc6ac65ad16c0b06b81a79c43791b64 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:11:27 +0400 Subject: [PATCH 21/29] Revert "Pass generation params via GenerationConfig for transformers 5.x" This reverts commit 5e27656deba141406b9625cf264077164338a6bd. --- .github/workflows/python-app.yml | 23 ++++++++++++++--------- src/lm_polygraph/utils/model.py | 26 -------------------------- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 56f35fd41..212cff702 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,10 +18,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 + # TODO: restore matrix after debugging strategy: fail-fast: false matrix: - transformers-version: ["default", "5.*"] + transformers-version: ["default"] steps: - uses: actions/checkout@v3 @@ -33,22 +34,26 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 + # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU, + # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU). + pip install "numpy<2" pip install . - name: Override transformers version if: matrix.transformers-version != 'default' run: | pip install "transformers==${{ matrix.transformers-version }}" - - name: Lint - if: matrix.transformers-version == 'default' - uses: py-actions/flake8@v2 - with: - args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" - path: "." - plugins: "flake8-black" + # TODO: restore lint after debugging + # - name: Lint + # if: matrix.transformers-version == 'default' + # uses: py-actions/flake8@v2 + # with: + # args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" + # path: "." + # plugins: "flake8-black" - name: Remove cachedir in order to save up on disk run: rm -rf $HOME/.cache # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again - name: Test with pytest timeout-minutes: 30 run: | - pytest --ignore=test/local -s -v + pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index 5904eb6b9..e10a7a905 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -12,7 +12,6 @@ AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoConfig, - GenerationConfig, LogitsProcessorList, BartForConditionalGeneration, ) @@ -420,25 +419,9 @@ def __init__( self.generation_parameters = generation_parameters self.instruct = instruct - # Parameters that belong in GenerationConfig, not as loose kwargs - _GENERATION_CONFIG_KEYS = { - "temperature", - "top_k", - "top_p", - "do_sample", - "num_beams", - "repetition_penalty", - "max_new_tokens", - "max_length", - "min_length", - "num_return_sequences", - "renormalize_logits", - } - def _validate_args(self, args): """ Validates and adapts arguments for WhiteboxModel generation. - Wraps generation parameters in a GenerationConfig for transformers 5.x compat. Parameters: args (dict): The arguments to validate. @@ -461,15 +444,6 @@ def _validate_args(self, args): for key in keys_to_remove: args_copy.pop(key, None) - # Wrap generation parameters in GenerationConfig for transformers 5.x compat - # (transformers 5.x ignores temperature/top_k/etc. as loose kwargs) - gen_config_kwargs = {} - for key in list(args_copy.keys()): - if key in self._GENERATION_CONFIG_KEYS: - gen_config_kwargs[key] = args_copy.pop(key) - if gen_config_kwargs: - args_copy["generation_config"] = GenerationConfig(**gen_config_kwargs) - return args_copy class _ScoresProcessor: From 90c3ccf2d3ec0e332b3662c15f06c2908c4b728c Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:32:54 +0400 Subject: [PATCH 22/29] Revert debug changes, clean up PR Revert all debug logging and failed fix attempts (sanitizer, GenerationConfig wrapping, test config changes). Keep only the core changes: relaxed spacy bound, optional comet, transformers 5.x import compat, and batch_encode_plus replacement. CI now tests both transformers <5 and >=5 via matrix strategy. --- .github/workflows/python-app.yml | 29 ++--- scripts/polygraph_eval | 111 ++++++++---------- .../greedy_alternatives_nli.py | 16 +-- src/lm_polygraph/utils/manager.py | 12 +- src/lm_polygraph/utils/model.py | 34 +----- test/configs/test_polygraph_eval_seq_ue.yaml | 6 +- test/test_lm_polygraph.py | 17 +-- 7 files changed, 70 insertions(+), 155 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 212cff702..1de5747c1 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,11 +18,10 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 - # TODO: restore matrix after debugging strategy: fail-fast: false matrix: - transformers-version: ["default"] + transformers-version: ["<5", ">=5"] steps: - uses: actions/checkout@v3 @@ -34,26 +33,20 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - # Pin numpy<2 for CI: bloomz-560m produces inf logits with numpy 2.x on CPU, - # causing degenerate generation. numpy 2.x compat is for vLLM users (GPU). - pip install "numpy<2" pip install . - - name: Override transformers version - if: matrix.transformers-version != 'default' + - name: Pin transformers version run: | - pip install "transformers==${{ matrix.transformers-version }}" - # TODO: restore lint after debugging - # - name: Lint - # if: matrix.transformers-version == 'default' - # uses: py-actions/flake8@v2 - # with: - # args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" - # path: "." - # plugins: "flake8-black" + pip install "transformers${{ matrix.transformers-version }}" + - name: Lint + if: matrix.transformers-version == '<5' + uses: py-actions/flake8@v2 + with: + args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" + path: "." + plugins: "flake8-black" - name: Remove cachedir in order to save up on disk run: rm -rf $HOME/.cache # If we exceed disk space limit again, we can test lm-polygraph tests separately, and delete cachedir again - name: Test with pytest - timeout-minutes: 30 run: | - pytest test/test_lm_polygraph.py::test_all_seq_ue -s -v + pytest --ignore=test/local diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 8f0728d37..de8b80a48 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -2,7 +2,6 @@ import hydra import os -import time import transformers from pathlib import Path from omegaconf import OmegaConf @@ -23,10 +22,7 @@ from lm_polygraph.generation_metrics import * from lm_polygraph.estimators import * from lm_polygraph.ue_metrics import * from lm_polygraph.utils.common import load_external_module, load_processor, load_image -from lm_polygraph.utils.generation_parameters import ( - GenerationParameters, - GenerationParametersFactory, -) +from lm_polygraph.utils.generation_parameters import GenerationParameters, GenerationParametersFactory from lm_polygraph.defaults.register_default_stat_calculators import ( register_default_stat_calculators, ) @@ -35,8 +31,7 @@ from lm_polygraph.utils.builder_enviroment_stat_calculator import ( ) from lm_polygraph.utils.factory_estimator import FactoryEstimator from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer - -# from transformers import AutoProcessor, AutoModelForVision2Seq +#from transformers import AutoProcessor, AutoModelForVision2Seq hydra_config = Path(os.environ.get("HYDRA_CONFIG", "")) @@ -88,7 +83,7 @@ def main(args): project = os.environ["WANDB_PROJECT"] wandb.init(project=project, dir=save_path, config=wandb_cfg) wandb_save_directory(Path(save_path) / ".hydra") - + save_path = args.save_path if "save_path" in args else save_path log.info(f"Main directory: {save_path}") @@ -103,15 +98,13 @@ def main(args): log.info("=" * 100) log.info(f"SEED: {seed}") - t_step = time.time() log.info(f"Loading model {args.model.path}...") transformers.set_seed(seed) model = get_model(args) - log.info(f"Done with loading model. ({time.time() - t_step:.1f}s)") + log.info("Done with loading model.") - t_step = time.time() log.info(f"Loading dataset {args.dataset}...") dataset = Dataset.load( args.dataset, @@ -131,15 +124,14 @@ def main(args): trust_remote_code=getattr(args, "trust_remote_code", False), **cache_kwargs, ) - # images=dataset.images - log.info(f"Done with loading eval data. ({time.time() - t_step:.1f}s)") +# images=dataset.images + log.info("Done with loading eval data.") log.info("=" * 100) - t_step = time.time() log.info("Initializing UE estimators...") estimators = [] estimators += get_ue_methods(args, model) - log.info(f"Done loading UE estimators ({time.time() - t_step:.1f}s)") + log.info("Done loading UE estimators") if args.subsample_eval_dataset != -1: dataset.subsample(args.subsample_eval_dataset, seed=seed) @@ -164,28 +156,27 @@ def main(args): ], ignore_exceptions=args.ignore_exceptions, max_new_tokens=args.max_new_tokens, - save_stats=getattr(args, "save_stats", []), + save_stats=getattr(args, 'save_stats', []), log_time=getattr(args, "log_time", False), ) - t_step = time.time() - log.info("Starting UEManager evaluation...") try: man() except Exception as e: man.state = "failed" raise e finally: - log.info(f"UEManager evaluation finished. ({time.time() - t_step:.1f}s)") man.save(save_path + f"/ue_manager_seed{seed}") if hasattr(args, "report_to_wandb") and args.report_to_wandb: - wandb.log({str(k): v for k, v in man.gen_metrics}) - wandb.log({str(k): v for k, v in man.metrics.items()}) + wandb.log({str(k) : v for k, v in man.gen_metrics}) + wandb.log({str(k) : v for k, v in man.metrics.items()}) wandb.save(save_path + f"/ue_manager_seed{seed}") + if hasattr(args, "report_to_wandb") and args.report_to_wandb: wandb.finish() + def get_ue_metrics(args): @@ -206,17 +197,13 @@ def get_ue_metrics(args): def get_stat_calculator_names(config): model_type_raw = getattr(config.model, "type", "Whitebox") model_type = ( - "Blackbox" - if model_type_raw == "Blackbox" - else "VisualLM" if model_type_raw == "VisualLM" else "Whitebox" + "Blackbox" if model_type_raw == "Blackbox" + else "VisualLM" if model_type_raw == "VisualLM" + else "Whitebox" ) language = getattr(config, "language", "en") - output_attentions = getattr(config, "output_attentions", True) and ( - getattr(config.model, "type", "Whitebox") != "vLLMCausalLM" - ) - output_hidden_states = ( - False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True - ) + output_attentions = getattr(config, "output_attentions", True) and (getattr(config.model, "type", "Whitebox") != "vLLMCausalLM") + output_hidden_states = False if getattr(config.model, "type", "Whitebox") == "vLLMCausalLM" else True hf_cache = getattr(config, "hf_cache", None) deberta_batch_size = getattr(config, "deberta_batch_size", 10) blackbox_supports_logprobs = model_type == "Blackbox" and getattr( @@ -229,7 +216,7 @@ def get_stat_calculator_names(config): model_type, language, hf_cache, - output_attentions=output_attentions, + output_attentions=output_attentions, output_hidden_states=output_hidden_states, blackbox_supports_logprobs=blackbox_supports_logprobs, deberta_batch_size=deberta_batch_size, @@ -281,13 +268,7 @@ def get_generation_metrics(args): ), ] if args.task == "ats": - result += [ - AlignScore( - target_is_claims=False, - source_ignore_regex=ignore_regex, - source_as_target=True, - ) - ] + result += [AlignScore(target_is_claims=False, source_ignore_regex=ignore_regex, source_as_target=True)] else: result += [AlignScore(target_is_claims=True)] if getattr(args.model, "type", "Whitebox") != "Blackbox": @@ -424,7 +405,7 @@ def get_whitebox_model(args, cache_kwargs={}): generation_params = GenerationParametersFactory.from_params( yaml_config=getattr(args, "generation_params", {}), - native_config=base_model.generation_config.to_dict(), + native_config=base_model.generation_config.to_dict() ) model = WhiteboxModel( @@ -433,7 +414,7 @@ def get_whitebox_model(args, cache_kwargs={}): args.model.path, args.model.type, generation_params, - instruct=getattr(args, "instruct", False), + instruct=getattr(args, "instruct", False) ) return model @@ -450,59 +431,59 @@ def get_visual_model(args, cache_kwargs={}): getattr(args, "generation_params", {}), device_map=args.model.load_model_args.device_map, add_bos_token=getattr(args.model, "add_bos_token", True), - **cache_kwargs, + **cache_kwargs ) - path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script) + path_to_load_script = get_abs_path_from_hydra_config( + args.model.path_to_load_script + ) load_module = load_external_module(path_to_load_script) - load_model_args = {"model_path": args.model.path} + load_model_args = {'model_path': args.model.path} load_model_args.update(args.model.load_model_args) base_model = load_module.load_model(**load_model_args) - load_tok_args = {"model_path": args.model.path} + load_tok_args = {'model_path': args.model.path} load_tok_args.update(args.model.load_tokenizer_args) tokenizer = load_module.load_tokenizer(**load_tok_args) - load_proc_args = {"model_path": args.model.path} + load_proc_args = {'model_path': args.model.path} load_proc_args.update(getattr(args.model, "load_processor_args", {})) processor = load_processor(**load_proc_args) generation_params = GenerationParametersFactory.from_params( yaml_config=getattr(args, "generation_params", {}), - native_config=base_model.generation_config.to_dict(), + native_config=base_model.generation_config.to_dict() ) - model = VisualWhiteboxModel( - base_model, processor, args.model.path, args.model.type, generation_params - ) + model = VisualWhiteboxModel(base_model, + processor, + args.model.path, + args.model.type, + generation_params) return model def get_vllm_model(args): - path_to_load_script = get_abs_path_from_hydra_config(args.model.path_to_load_script) + path_to_load_script = get_abs_path_from_hydra_config( + args.model.path_to_load_script + ) load_module = load_external_module(path_to_load_script) - load_model_args = { - "model_path": args.model.path, - "max_new_tokens": args.max_new_tokens, - "logprobs": args.model.logprobs, - } + load_model_args = {'model_path': args.model.path, + 'max_new_tokens': args.max_new_tokens, + 'logprobs': args.model.logprobs} load_model_args.update(args.model.load_model_args) base_model, sampling_params = load_module.load_model(**load_model_args) - generation_parameters = GenerationParameters( - **getattr(args, "generation_params", {}) - ) + generation_parameters = GenerationParameters(**getattr(args, "generation_params", {})) - model = WhiteboxModelvLLM( - model=base_model, - sampling_params=sampling_params, - generation_parameters=generation_parameters, - device=args.model.device, - instruct=getattr(args.model, "instruct", False), - ) + model = WhiteboxModelvLLM(model=base_model, + sampling_params=sampling_params, + generation_parameters=generation_parameters, + device=args.model.device, + instruct= getattr(args.model, "instruct", False)) return model diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py index af1d9c7b9..187dd62f6 100644 --- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py +++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py @@ -1,6 +1,4 @@ -import logging import numpy as np -import time import torch from typing import Dict, List, Tuple @@ -12,8 +10,6 @@ import torch.nn as nn import string -log = logging.getLogger("lm_polygraph") - def eval_nli_model( nli_queue: List[Tuple[str, str]], @@ -73,15 +69,9 @@ def __call__( **kwargs, ) -> Dict[str, np.ndarray]: greedy_alternatives = dependencies["greedy_tokens_alternatives"] - total_samples = len(greedy_alternatives) - log.info( - f"[NLI] Processing {total_samples} samples, " - f"tokens per sample: {[len(s) for s in greedy_alternatives]}" - ) greedy_alternatives_nli = [] - for sample_idx, sample_alternatives in enumerate(greedy_alternatives): + for sample_alternatives in greedy_alternatives: nli_matrixes = [] - t_sample = time.time() for w_number, word_alternatives in enumerate(sample_alternatives): nli_queue = [] nli_matrix = [ @@ -117,10 +107,6 @@ def __call__( nli_matrix[i][j] = nli_class[wi, wj] nli_matrixes.append(nli_matrix) - log.info( - f"[NLI] Sample {sample_idx + 1}/{total_samples}: " - f"{len(sample_alternatives)} tokens in {time.time() - t_sample:.1f}s" - ) greedy_alternatives_nli.append(nli_matrixes) return {"greedy_tokens_alternatives_nli": greedy_alternatives_nli} diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 461f8ebe9..3c7c4e24d 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -252,14 +252,16 @@ def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> di """ for stat_calculator in calculators: try: - start_time = time.time() - log.info(f"[CALC] Starting {stat_calculator.__class__.__name__}...") + if self.log_time: + start_time = time.time() + log.info(f"Calculating {stat_calculator}...") new_stats = stat_calculator( batch_stats, inp_texts, self.model, self.max_new_tokens ) - log.info( - f"[CALC] Done {stat_calculator.__class__.__name__} in {round(time.time() - start_time, 2)}s" - ) + if self.log_time: + log.info( + f"Done calculating {stat_calculator} in {round(time.time() - start_time, 2)} secs" + ) for stat, stat_value in new_stats.items(): if stat in batch_stats.keys(): continue diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index e10a7a905..aa06a3053 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -455,32 +455,6 @@ def __call__(self, input_ids=None, scores=None): self.scores.append(scores.log_softmax(-1)) return scores - class _SanitizeLogitsProcessor: - # Replaces inf/nan in logits with finite values to prevent - # RuntimeError in torch.multinomial during sampling. - # Uses per-row max/min of finite values to avoid dominating softmax. - def __call__(self, input_ids=None, scores=None): - if torch.isfinite(scores).all(): - return scores - finite_mask = torch.isfinite(scores) - # Compute per-row max/min of finite values - masked = scores.clone() - masked[~finite_mask] = float("-inf") - row_max = masked.max(dim=-1, keepdim=True).values - masked[~finite_mask] = float("inf") - row_min = masked.min(dim=-1, keepdim=True).values - # Fallback if entire row is non-finite - row_max = torch.where( - torch.isfinite(row_max), row_max, torch.zeros_like(row_max) - ) - row_min = torch.where( - torch.isfinite(row_min), row_min, torch.zeros_like(row_min) - ) - scores = torch.where(torch.isposinf(scores), row_max, scores) - scores = torch.where(torch.isneginf(scores), row_min, scores) - scores = torch.nan_to_num(scores, nan=0.0) - return scores - def generate(self, **args): """ Generates the model output with scores from batch formed by HF Tokenizer. @@ -492,16 +466,14 @@ def generate(self, **args): """ default_params = asdict(self.generation_parameters) - # add ScoresProcessor to collect original scores, and SanitizeLogitsProcessor - # to prevent inf/nan from crashing torch.multinomial during sampling + # add ScoresProcessor to collect original scores processor = self._ScoresProcessor() - sanitizer = self._SanitizeLogitsProcessor() if "logits_processor" in args.keys(): logits_processor = LogitsProcessorList( - [sanitizer, processor, args["logits_processor"]] + [processor, args["logits_processor"]] ) else: - logits_processor = LogitsProcessorList([sanitizer, processor]) + logits_processor = LogitsProcessorList([processor]) args["logits_processor"] = logits_processor # update default parameters with passed arguments diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml index 53fd97726..b259f071f 100644 --- a/test/configs/test_polygraph_eval_seq_ue.yaml +++ b/test/configs/test_polygraph_eval_seq_ue.yaml @@ -22,9 +22,6 @@ instruct: false prompt: "" ignore_exceptions: false - -generation_params: - temperature: 0.7 generation_metrics: - name: RougeMetric args: ["rouge1"] @@ -34,7 +31,6 @@ stat_calculators: subsample_eval_dataset: 10 batch_size: 2 -seed: - - 42 +seed: null device: null max_new_tokens: 256 diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py index 6ce533e3b..da82128c2 100644 --- a/test/test_lm_polygraph.py +++ b/test/test_lm_polygraph.py @@ -1,7 +1,5 @@ import subprocess import pathlib -import time -import sys from lm_polygraph.utils.manager import UEManager @@ -16,20 +14,7 @@ def exec_bash(s): - print(f"\n[TIMER] Starting command: {s}", flush=True) - t0 = time.time() - proc = subprocess.Popen( - s, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True - ) - for line in proc.stdout: - sys.stdout.write(line) - sys.stdout.flush() - proc.wait() - elapsed = time.time() - t0 - print( - f"[TIMER] Command finished in {elapsed:.1f}s (rc={proc.returncode})", flush=True - ) - return proc + return subprocess.run(s, shell=True) def pwd(): From fa46e1edb8ebdd68ce50be52419f659ca2f9003b Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:35:10 +0400 Subject: [PATCH 23/29] Pin transformers before installing lm-polygraph --- .github/workflows/python-app.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 1de5747c1..c15bb1286 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -33,10 +33,8 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - pip install . - - name: Pin transformers version - run: | pip install "transformers${{ matrix.transformers-version }}" + pip install . - name: Lint if: matrix.transformers-version == '<5' uses: py-actions/flake8@v2 From 2133b82cfba34ef4ec1d71c29629ce7c40c08c81 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:45:19 +0400 Subject: [PATCH 24/29] Use single CI job (same as main) --- .github/workflows/python-app.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index c15bb1286..76dc643a5 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,12 +16,6 @@ jobs: build: runs-on: ubuntu-latest - timeout-minutes: 45 - - strategy: - fail-fast: false - matrix: - transformers-version: ["<5", ">=5"] steps: - uses: actions/checkout@v3 @@ -33,10 +27,8 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - pip install "transformers${{ matrix.transformers-version }}" pip install . - name: Lint - if: matrix.transformers-version == '<5' uses: py-actions/flake8@v2 with: args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" From 45955e808bb965de355b7bf5907a478ec958da1d Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:53:44 +0400 Subject: [PATCH 25/29] Add logits sanitizer to prevent inf/nan crash in sampling --- src/lm_polygraph/utils/model.py | 38 ++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index aa06a3053..ccd1d11bc 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -455,6 +455,37 @@ def __call__(self, input_ids=None, scores=None): self.scores.append(scores.log_softmax(-1)) return scores + class _SanitizeLogitsProcessor: + # Replaces inf/nan in logits with finite values to prevent + # RuntimeError in torch.multinomial during sampling. + # Uses per-row max/min of finite values to preserve distribution shape. + def __call__(self, input_ids=None, scores=None): + if torch.isfinite(scores).all(): + return scores + finite_mask = torch.isfinite(scores) + scores_for_max = torch.where( + finite_mask, + scores, + torch.tensor(float("-inf"), dtype=scores.dtype, device=scores.device), + ) + scores_for_min = torch.where( + finite_mask, + scores, + torch.tensor(float("inf"), dtype=scores.dtype, device=scores.device), + ) + row_max = scores_for_max.max(dim=-1, keepdim=True).values + row_min = scores_for_min.min(dim=-1, keepdim=True).values + row_max = torch.where( + torch.isfinite(row_max), row_max, torch.zeros_like(row_max) + ) + row_min = torch.where( + torch.isfinite(row_min), row_min, torch.zeros_like(row_min) + ) + scores = torch.where(torch.isposinf(scores), row_max, scores) + scores = torch.where(torch.isneginf(scores), row_min, scores) + scores = torch.nan_to_num(scores, nan=0.0) + return scores + def generate(self, **args): """ Generates the model output with scores from batch formed by HF Tokenizer. @@ -466,14 +497,15 @@ def generate(self, **args): """ default_params = asdict(self.generation_parameters) - # add ScoresProcessor to collect original scores + # add ScoresProcessor and SanitizeLogitsProcessor processor = self._ScoresProcessor() + sanitizer = self._SanitizeLogitsProcessor() if "logits_processor" in args.keys(): logits_processor = LogitsProcessorList( - [processor, args["logits_processor"]] + [sanitizer, processor, args["logits_processor"]] ) else: - logits_processor = LogitsProcessorList([processor]) + logits_processor = LogitsProcessorList([sanitizer, processor]) args["logits_processor"] = logits_processor # update default parameters with passed arguments From 0184ad58810ffcd74aa30dee837df132cd96a34e Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 00:55:36 +0400 Subject: [PATCH 26/29] Add CI matrix: transformers <5 and >=5 --- .github/workflows/python-app.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 76dc643a5..c15bb1286 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,6 +16,12 @@ jobs: build: runs-on: ubuntu-latest + timeout-minutes: 45 + + strategy: + fail-fast: false + matrix: + transformers-version: ["<5", ">=5"] steps: - uses: actions/checkout@v3 @@ -27,8 +33,10 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 + pip install "transformers${{ matrix.transformers-version }}" pip install . - name: Lint + if: matrix.transformers-version == '<5' uses: py-actions/flake8@v2 with: args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" From 28dd6fb469e3ce2c00e41213e82aa02579089cb9 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 01:30:52 +0400 Subject: [PATCH 27/29] Pin transformers<5, single CI job --- .github/workflows/python-app.yml | 8 -------- requirements.txt | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index c15bb1286..76dc643a5 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,12 +16,6 @@ jobs: build: runs-on: ubuntu-latest - timeout-minutes: 45 - - strategy: - fail-fast: false - matrix: - transformers-version: ["<5", ">=5"] steps: - uses: actions/checkout@v3 @@ -33,10 +27,8 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - pip install "transformers${{ matrix.transformers-version }}" pip install . - name: Lint - if: matrix.transformers-version == '<5' uses: py-actions/flake8@v2 with: args: "--extend-ignore E501,F405,F403,E203 --per-file-ignores __init__.py:F401,builder_stat_calculator_simple.py:F401" diff --git a/requirements.txt b/requirements.txt index 03f294b85..b4d67e210 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ matplotlib>=3.6 pandas>=1.3.5 torch>=2.6.0 bs4 -transformers>=4.50.0 +transformers>=4.50.0,<5 nltk>=3.6.5 sacrebleu>=1.5.0 sentencepiece>=0.2.1 From d226bc4b74dec10e8ab4ee87c9339a552ead4ad4 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 12:44:57 +0400 Subject: [PATCH 28/29] Remove transformers<5 pin, move comet to optional extras - Remove transformers upper bound (compat code handles both 4.x and 5.x) - Move unbabel-comet to [comet] extra in pyproject.toml - Update README with two install paths (with/without comet) - Fix black formatting in generation_metrics __init__ --- README.md | 7 ++++++- pyproject.toml | 3 +++ requirements.txt | 3 +-- src/lm_polygraph/generation_metrics/__init__.py | 1 + 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7661e8d72..3483d484e 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,13 @@ pip install lm-polygraph Some features require additional packages that are not installed by default: -- **COMET metric** (translation evaluation): `unbabel-comet` has constrained `transformers` version requirements, so it is installed separately: +- **COMET metric** (translation evaluation): `unbabel-comet` pins `numpy<2.0` which may conflict with packages like vLLM. Install via extras: ```shell + pip install lm-polygraph[comet] + ``` + If you need numpy 2.x (e.g., for vLLM), install without the extra and add comet manually: + ```shell + pip install lm-polygraph pip install unbabel-comet --no-deps ``` diff --git a/pyproject.toml b/pyproject.toml index 503aad05d..7f895a357 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Repository" = "https://github.com/IINemo/lm-polygraph" "Documentation" = "https://lm-polygraph.readthedocs.io" +[project.optional-dependencies] +comet = ["unbabel-comet<3"] + [tool.setuptools] script-files = [ "scripts/polygraph_eval", diff --git a/requirements.txt b/requirements.txt index b4d67e210..5411d9864 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ matplotlib>=3.6 pandas>=1.3.5 torch>=2.6.0 bs4 -transformers>=4.50.0,<5 +transformers>=4.50.0 nltk>=3.6.5 sacrebleu>=1.5.0 sentencepiece>=0.2.1 @@ -29,7 +29,6 @@ openai>=1.52.0 wget sentence-transformers bert-score>=0.3.13 -# unbabel-comet<3 # Optional, see README for installation instructions nltk>=3.7,<4 evaluate>=0.4.2 spacy>=3.4.0 diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py index 063ce18e0..da3529cfd 100644 --- a/src/lm_polygraph/generation_metrics/__init__.py +++ b/src/lm_polygraph/generation_metrics/__init__.py @@ -8,6 +8,7 @@ from .comet import Comet except ImportError: Comet = None + from .alignscore import AlignScore from .openai_fact_check import OpenAIFactCheck from .bert_score import BertScoreMetric From daae87522d629f2aa560d23d49791cb6dfc64267 Mon Sep 17 00:00:00 2001 From: Vlad Smirnov Date: Thu, 9 Apr 2026 12:46:11 +0400 Subject: [PATCH 29/29] Install lm-polygraph[comet] in CI --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 76dc643a5..c6ab5a3ca 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - pip install . + pip install ".[comet]" - name: Lint uses: py-actions/flake8@v2 with: