Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a2e1eca
Relax spacy upper bound, make unbabel-comet optional
smirnovlad Apr 8, 2026
a408e7e
Add optional dependencies section to README for unbabel-comet
smirnovlad Apr 8, 2026
fd22daf
Add transformers 5.x to CI test matrix
smirnovlad Apr 8, 2026
7d631ba
Fix transformers 5.0 compatibility for ensemble utils and visual model
smirnovlad Apr 8, 2026
781bb21
Use AutoModelForImageTextToText as fallback for AutoModelForVision2Seq
smirnovlad Apr 8, 2026
448485c
Replace batch_encode_plus with direct tokenizer call
smirnovlad Apr 8, 2026
0630902
Fix black formatting for tokenizer calls
smirnovlad Apr 8, 2026
eac1c35
Set seed=42 in test_polygraph_eval_seq_ue config
smirnovlad Apr 8, 2026
d9076fc
Fix seed format: must be a list for polygraph_eval
smirnovlad Apr 8, 2026
c008917
Set temperature=0.7 in seq_ue test to stabilize sampling
smirnovlad Apr 8, 2026
a84359f
Add renormalize_logits=True to sampling to prevent inf/nan in multinomial
smirnovlad Apr 8, 2026
aea48e2
Add logits sanitizer to prevent inf/nan crash in sampling
smirnovlad Apr 8, 2026
4d061b5
Increase CI timeout to 45 minutes per job
smirnovlad Apr 8, 2026
37f1b1a
Fix logits sanitizer to use per-row max instead of fixed 1e4
smirnovlad Apr 8, 2026
112b4a4
Add timing logs to diagnose slow CI tests
smirnovlad Apr 8, 2026
7d8d5aa
Fix black formatting for timing logs
smirnovlad Apr 8, 2026
19c5748
Add per-calculator timing logs to UEManager
smirnovlad Apr 8, 2026
8e3a894
Temporarily run only test_all_seq_ue for debugging
smirnovlad Apr 8, 2026
d634148
Add NLI calculator progress logging
smirnovlad Apr 8, 2026
5e27656
Pass generation params via GenerationConfig for transformers 5.x
smirnovlad Apr 8, 2026
abd79c0
Revert "Pass generation params via GenerationConfig for transformers 5.x"
smirnovlad Apr 8, 2026
90c3ccf
Revert debug changes, clean up PR
smirnovlad Apr 8, 2026
fa46e1e
Pin transformers before installing lm-polygraph
smirnovlad Apr 8, 2026
2133b82
Use single CI job (same as main)
smirnovlad Apr 8, 2026
45955e8
Add logits sanitizer to prevent inf/nan crash in sampling
smirnovlad Apr 8, 2026
0184ad5
Add CI matrix: transformers <5 and >=5
smirnovlad Apr 8, 2026
28dd6fb
Pin transformers<5, single CI job
smirnovlad Apr 8, 2026
d226bc4
Remove transformers<5 pin, move comet to optional extras
smirnovlad Apr 9, 2026
daae875
Install lm-polygraph[comet] in CI
smirnovlad Apr 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest hydra-core==1.3.2
pip install .
pip install ".[comet]"
- name: Lint
uses: py-actions/flake8@v2
with:
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ The latest tagged version is also available via PyPI:
pip install lm-polygraph
```

### Optional dependencies

Some features require additional packages that are not installed by default:

- **COMET metric** (translation evaluation): `unbabel-comet` pins `numpy<2.0` which may conflict with packages like vLLM. Install via extras:
```shell
pip install lm-polygraph[comet]
```
If you need numpy 2.x (e.g., for vLLM), install without the extra and add comet manually:
```shell
pip install lm-polygraph
pip install unbabel-comet --no-deps
```

## <a name="basic_usage"></a>Basic usage
1. Initialize the base model (encoder-decoder or decoder-only) and tokenizer from HuggingFace or a local file, and use them to initialize the WhiteboxModel for evaluation:
```python
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ classifiers = [
"Repository" = "https://github.com/IINemo/lm-polygraph"
"Documentation" = "https://lm-polygraph.readthedocs.io"

[project.optional-dependencies]
comet = ["unbabel-comet<3"]

[tool.setuptools]
script-files = [
"scripts/polygraph_eval",
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ openai>=1.52.0
wget
sentence-transformers
bert-score>=0.3.13
unbabel-comet<3
nltk>=3.7,<4
evaluate>=0.4.2
spacy>=3.4.0,<3.8.0
spacy>=3.4.0
fastchat
diskcache>=5.6.3
boostedprob
7 changes: 6 additions & 1 deletion src/lm_polygraph/generation_metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from .model_score import ModelScoreSeqMetric, ModelScoreTokenwiseMetric
from .bart_score import BartScoreSeqMetric
from .accuracy import AccuracyMetric
from .comet import Comet

try:
from .comet import Comet
except ImportError:
Comet = None

from .alignscore import AlignScore
from .openai_fact_check import OpenAIFactCheck
from .bert_score import BertScoreMetric
Expand Down
3 changes: 2 additions & 1 deletion src/lm_polygraph/generation_metrics/comet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re
import numpy as np
from evaluate import load

from typing import List, Dict
from .generation_metric import GenerationMetric
Expand All @@ -14,6 +13,8 @@ class Comet(GenerationMetric):

def __init__(self, source_ignore_regex=None, lang="en"):
super().__init__(["greedy_texts", "input_texts"], "sequence")
from evaluate import load

self.scorer = load("comet")
self.source_ignore_regex = (
re.compile(source_ignore_regex) if source_ignore_regex else None
Expand Down
7 changes: 6 additions & 1 deletion src/lm_polygraph/model_adapters/visual_whitebox_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,17 @@
import torch
from PIL import Image
from transformers import (
AutoModelForVision2Seq,
AutoProcessor,
GenerationConfig,
LogitsProcessorList,
)

try:
from transformers import AutoModelForVision2Seq
except ImportError:
# transformers >= 5.0 renamed AutoModelForVision2Seq → AutoModelForImageTextToText
from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq

from lm_polygraph.utils.generation_parameters import GenerationParameters
from lm_polygraph.utils.dataset import Dataset
from lm_polygraph.utils.model import Model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def eval_nli_model(
with torch.no_grad():
for k in range(0, len(nli_set), deberta.batch_size):
batch = nli_set[k : k + deberta.batch_size]
encoded = deberta.deberta_tokenizer.batch_encode_plus(
encoded = deberta.deberta_tokenizer(
batch, padding=True, return_tensors="pt"
).to(deberta.device)
logits = deberta.deberta(**encoded).logits.detach()
Expand Down
12 changes: 6 additions & 6 deletions src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs):
probs_b = []
for first_texts, second_texts in tqdm(dl):
batch = list(zip(first_texts, second_texts))
encoded = tokenizer.batch_encode_plus(
batch, padding=True, return_tensors="pt"
).to(deberta.device)
encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
deberta.device
)
logits = deberta.deberta(**encoded).logits
probs_f.append(softmax(logits))

batch = list(zip(second_texts, first_texts))
encoded = tokenizer.batch_encode_plus(
batch, padding=True, return_tensors="pt"
).to(deberta.device)
encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
deberta.device
)
logits = deberta.deberta(**encoded).logits
probs_b.append(softmax(logits))

Expand Down
6 changes: 3 additions & 3 deletions src/lm_polygraph/stat_calculators/semantic_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ def __call__(
logits_all = []
for first_texts, second_texts in dl:
batch = list(zip(first_texts, second_texts))
encoded = tokenizer.batch_encode_plus(
batch, padding=True, return_tensors="pt"
).to(device)
encoded = tokenizer(batch, padding=True, return_tensors="pt").to(
device
)
logits = deberta.deberta(**encoded).logits
probs.append(softmax(logits))
logits_all.append(logits)
Expand Down
26 changes: 20 additions & 6 deletions src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,33 @@
from torch import nn

from transformers import GenerationMixin
from transformers.generation.beam_search import BeamScorer

try:
from transformers.generation.beam_search import BeamScorer
except ImportError:
# transformers >= 5.0 removed BeamScorer entirely
BeamScorer = None

from transformers.generation.logits_process import (
LogitsProcessorList,
)
from transformers.generation.stopping_criteria import (
StoppingCriteriaList,
validate_stopping_criteria,
)
from transformers.generation.utils import (
BeamSearchOutput,
BeamSearchDecoderOnlyOutput,
ModelOutput,
)
from transformers.generation.utils import ModelOutput

try:
from transformers.generation.utils import (
BeamSearchOutput,
BeamSearchDecoderOnlyOutput,
)
except ImportError:
# transformers >= 5.0 renamed these classes
from transformers.generation.utils import (
GenerateBeamEncoderDecoderOutput as BeamSearchOutput,
GenerateBeamDecoderOnlyOutput as BeamSearchDecoderOnlyOutput,
)


class EnsembleBeamSearchMixin(GenerationMixin):
Expand Down
18 changes: 13 additions & 5 deletions src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,19 @@
validate_stopping_criteria,
)
from transformers.generation.streamers import BaseStreamer
from transformers.generation.utils import (
GreedySearchOutput,
GreedySearchDecoderOnlyOutput,
ModelOutput,
)
from transformers.generation.utils import ModelOutput

try:
from transformers.generation.utils import (
GreedySearchOutput,
GreedySearchDecoderOnlyOutput,
)
except ImportError:
# transformers >= 5.0 renamed these classes
from transformers.generation.utils import (
GenerateNonBeamOutput as GreedySearchOutput,
GenerateDecoderOnlyOutput as GreedySearchDecoderOnlyOutput,
)


class EnsembleGreedyMixin(GenerationMixin):
Expand Down
18 changes: 13 additions & 5 deletions src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,19 @@
validate_stopping_criteria,
)
from transformers.generation.streamers import BaseStreamer
from transformers.generation.utils import (
SampleOutput,
SampleDecoderOnlyOutput,
ModelOutput,
)
from transformers.generation.utils import ModelOutput

try:
from transformers.generation.utils import (
SampleOutput,
SampleDecoderOnlyOutput,
)
except ImportError:
# transformers >= 5.0 renamed these classes
from transformers.generation.utils import (
GenerateNonBeamOutput as SampleOutput,
GenerateDecoderOnlyOutput as SampleDecoderOnlyOutput,
)


class EnsembleSampleMixin(GenerationMixin):
Expand Down
38 changes: 35 additions & 3 deletions src/lm_polygraph/utils/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,37 @@ def __call__(self, input_ids=None, scores=None):
self.scores.append(scores.log_softmax(-1))
return scores

class _SanitizeLogitsProcessor:
    """Logits processor that maps non-finite logits to finite substitutes.

    torch.multinomial raises a RuntimeError when the probability row it
    samples from contains inf/nan; this processor runs ahead of sampling
    and rewrites each non-finite entry:

    * ``+inf`` -> the row's largest finite logit
    * ``-inf`` -> the row's smallest finite logit
    * ``nan``  -> ``0.0``

    A row with no finite entries at all falls back to ``0.0`` for both
    substitutes. Rows that are already fully finite pass through
    untouched (the input tensor is returned as-is in that case).

    NOTE(review): substituting the row minimum for ``-inf`` gives a
    nonzero probability to entries an upstream processor may have meant
    to ban outright — acceptable here because this runs first in the
    processor list, but worth confirming if the ordering ever changes.
    """

    def __call__(self, input_ids=None, scores=None):
        # Fast path: nothing to fix, hand back the original tensor.
        if torch.isfinite(scores).all():
            return scores

        is_finite = torch.isfinite(scores)
        neg_inf = scores.new_tensor(float("-inf"))
        pos_inf = scores.new_tensor(float("inf"))

        # Per-row extrema computed over finite entries only; masking the
        # non-finite slots with the opposite infinity keeps them from
        # winning the max/min reductions.
        finite_max = torch.where(is_finite, scores, neg_inf).amax(
            dim=-1, keepdim=True
        )
        finite_min = torch.where(is_finite, scores, pos_inf).amin(
            dim=-1, keepdim=True
        )

        # A row with zero finite values yields -inf/+inf extrema; fall
        # back to 0.0 so the substitutes themselves are finite.
        finite_max = finite_max.masked_fill(~torch.isfinite(finite_max), 0.0)
        finite_min = finite_min.masked_fill(~torch.isfinite(finite_min), 0.0)

        sanitized = torch.where(torch.isposinf(scores), finite_max, scores)
        sanitized = torch.where(torch.isneginf(scores), finite_min, sanitized)
        return torch.nan_to_num(sanitized, nan=0.0)

def generate(self, **args):
"""
Generates the model output with scores from batch formed by HF Tokenizer.
Expand All @@ -466,14 +497,15 @@ def generate(self, **args):
"""
default_params = asdict(self.generation_parameters)

# add ScoresProcessor to collect original scores
# add ScoresProcessor and SanitizeLogitsProcessor
processor = self._ScoresProcessor()
sanitizer = self._SanitizeLogitsProcessor()
if "logits_processor" in args.keys():
logits_processor = LogitsProcessorList(
[processor, args["logits_processor"]]
[sanitizer, processor, args["logits_processor"]]
)
else:
logits_processor = LogitsProcessorList([processor])
logits_processor = LogitsProcessorList([sanitizer, processor])
args["logits_processor"] = logits_processor

# update default parameters with passed arguments
Expand Down
Loading