diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 76dc643a5..c6ab5a3ca 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest hydra-core==1.3.2 - pip install . + pip install ".[comet]" - name: Lint uses: py-actions/flake8@v2 with: diff --git a/README.md b/README.md index 2f9d65132..3483d484e 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,20 @@ The latest tagged version is also available via PyPI: pip install lm-polygraph ``` +### Optional dependencies + +Some features require additional packages that are not installed by default: + +- **COMET metric** (translation evaluation): `unbabel-comet` pins `numpy<2.0` which may conflict with packages like vLLM. Install via extras: + ```shell + pip install "lm-polygraph[comet]" + ``` + If you need numpy 2.x (e.g., for vLLM), install without the extra and add comet manually: + ```shell + pip install lm-polygraph + pip install unbabel-comet --no-deps + ``` + ## Basic usage 1. 
Initialize the base model (encoder-decoder or decoder-only) and tokenizer from HuggingFace or a local file, and use them to initialize the WhiteboxModel for evaluation: ```python diff --git a/pyproject.toml b/pyproject.toml index 503aad05d..7f895a357 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Repository" = "https://github.com/IINemo/lm-polygraph" "Documentation" = "https://lm-polygraph.readthedocs.io" +[project.optional-dependencies] +comet = ["unbabel-comet<3"] + [tool.setuptools] script-files = [ "scripts/polygraph_eval", diff --git a/requirements.txt b/requirements.txt index 520bc1750..5411d9864 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,10 +29,9 @@ openai>=1.52.0 wget sentence-transformers bert-score>=0.3.13 -unbabel-comet<3 nltk>=3.7,<4 evaluate>=0.4.2 -spacy>=3.4.0,<3.8.0 +spacy>=3.4.0 fastchat diskcache>=5.6.3 boostedprob diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py index d9d66c958..da3529cfd 100644 --- a/src/lm_polygraph/generation_metrics/__init__.py +++ b/src/lm_polygraph/generation_metrics/__init__.py @@ -3,7 +3,12 @@ from .model_score import ModelScoreSeqMetric, ModelScoreTokenwiseMetric from .bart_score import BartScoreSeqMetric from .accuracy import AccuracyMetric -from .comet import Comet + +try: + from .comet import Comet +except ImportError: + Comet = None + from .alignscore import AlignScore from .openai_fact_check import OpenAIFactCheck from .bert_score import BertScoreMetric diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py index 0fcd9b3e2..5942b2b3f 100644 --- a/src/lm_polygraph/generation_metrics/comet.py +++ b/src/lm_polygraph/generation_metrics/comet.py @@ -1,6 +1,5 @@ import re import numpy as np -from evaluate import load from typing import List, Dict from .generation_metric import GenerationMetric @@ -14,6 +13,8 @@ class Comet(GenerationMetric): def 
__init__(self, source_ignore_regex=None, lang="en"): super().__init__(["greedy_texts", "input_texts"], "sequence") + from evaluate import load + self.scorer = load("comet") self.source_ignore_regex = ( re.compile(source_ignore_regex) if source_ignore_regex else None diff --git a/src/lm_polygraph/model_adapters/visual_whitebox_model.py b/src/lm_polygraph/model_adapters/visual_whitebox_model.py index 9601bca27..0b674712f 100644 --- a/src/lm_polygraph/model_adapters/visual_whitebox_model.py +++ b/src/lm_polygraph/model_adapters/visual_whitebox_model.py @@ -8,12 +8,17 @@ import torch from PIL import Image from transformers import ( - AutoModelForVision2Seq, AutoProcessor, GenerationConfig, LogitsProcessorList, ) +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # transformers >= 5.0 renamed AutoModelForVision2Seq → AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + from lm_polygraph.utils.generation_parameters import GenerationParameters from lm_polygraph.utils.dataset import Dataset from lm_polygraph.utils.model import Model diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py index bb0cfe3b2..187dd62f6 100644 --- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py +++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py @@ -22,7 +22,7 @@ def eval_nli_model( with torch.no_grad(): for k in range(0, len(nli_set), deberta.batch_size): batch = nli_set[k : k + deberta.batch_size] - encoded = deberta.deberta_tokenizer.batch_encode_plus( + encoded = deberta.deberta_tokenizer( batch, padding=True, return_tensors="pt" ).to(deberta.device) logits = deberta.deberta(**encoded).logits.detach() diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index 07d458d5a..1f3ccbe96 100644 --- 
a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -47,16 +47,16 @@ def calculate_semantic_matrix(self, batch_pairs, batch_invs): probs_b = [] for first_texts, second_texts in tqdm(dl): batch = list(zip(first_texts, second_texts)) - encoded = tokenizer.batch_encode_plus( - batch, padding=True, return_tensors="pt" - ).to(deberta.device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + deberta.device + ) logits = deberta.deberta(**encoded).logits probs_f.append(softmax(logits)) batch = list(zip(second_texts, first_texts)) - encoded = tokenizer.batch_encode_plus( - batch, padding=True, return_tensors="pt" - ).to(deberta.device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + deberta.device + ) logits = deberta.deberta(**encoded).logits probs_b.append(softmax(logits)) diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 37bb969a5..b03d4967b 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -102,9 +102,9 @@ def __call__( logits_all = [] for first_texts, second_texts in dl: batch = list(zip(first_texts, second_texts)) - encoded = tokenizer.batch_encode_plus( - batch, padding=True, return_tensors="pt" - ).to(device) + encoded = tokenizer(batch, padding=True, return_tensors="pt").to( + device + ) logits = deberta.deberta(**encoded).logits probs.append(softmax(logits)) logits_all.append(logits) diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py index 94f180daa..2ed5d2731 100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_beam.py @@ -8,7 +8,13 @@ from torch import nn from transformers import GenerationMixin -from transformers.generation.beam_search import 
BeamScorer + +try: + from transformers.generation.beam_search import BeamScorer +except ImportError: + # transformers >= 5.0 removed BeamScorer entirely + BeamScorer = None + from transformers.generation.logits_process import ( LogitsProcessorList, ) @@ -16,11 +22,19 @@ StoppingCriteriaList, validate_stopping_criteria, ) -from transformers.generation.utils import ( - BeamSearchOutput, - BeamSearchDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + BeamSearchOutput, + BeamSearchDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateBeamEncoderDecoderOutput as BeamSearchOutput, + GenerateBeamDecoderOnlyOutput as BeamSearchDecoderOnlyOutput, + ) class EnsembleBeamSearchMixin(GenerationMixin): diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py index db7b31cf1..027d9eec9 100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_greedy.py @@ -15,11 +15,19 @@ validate_stopping_criteria, ) from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import ( - GreedySearchOutput, - GreedySearchDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + GreedySearchOutput, + GreedySearchDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateNonBeamOutput as GreedySearchOutput, + GenerateDecoderOnlyOutput as GreedySearchDecoderOnlyOutput, + ) class EnsembleGreedyMixin(GenerationMixin): diff --git a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py index 07b142dc8..1106171e6 
100644 --- a/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py +++ b/src/lm_polygraph/utils/ensemble_utils/ensemble_sample.py @@ -16,11 +16,19 @@ validate_stopping_criteria, ) from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import ( - SampleOutput, - SampleDecoderOnlyOutput, - ModelOutput, -) +from transformers.generation.utils import ModelOutput + +try: + from transformers.generation.utils import ( + SampleOutput, + SampleDecoderOnlyOutput, + ) +except ImportError: + # transformers >= 5.0 renamed these classes + from transformers.generation.utils import ( + GenerateNonBeamOutput as SampleOutput, + GenerateDecoderOnlyOutput as SampleDecoderOnlyOutput, + ) class EnsembleSampleMixin(GenerationMixin): diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index aa06a3053..ccd1d11bc 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -455,6 +455,37 @@ def __call__(self, input_ids=None, scores=None): self.scores.append(scores.log_softmax(-1)) return scores + class _SanitizeLogitsProcessor: + # Replaces inf/nan in logits with finite values to prevent + # RuntimeError in torch.multinomial during sampling. + # Uses per-row max/min of finite values to preserve distribution shape. 
+ def __call__(self, input_ids=None, scores=None): + if torch.isfinite(scores).all(): + return scores + finite_mask = torch.isfinite(scores) + scores_for_max = torch.where( + finite_mask, + scores, + torch.tensor(float("-inf"), dtype=scores.dtype, device=scores.device), + ) + scores_for_min = torch.where( + finite_mask, + scores, + torch.tensor(float("inf"), dtype=scores.dtype, device=scores.device), + ) + row_max = scores_for_max.max(dim=-1, keepdim=True).values + row_min = scores_for_min.min(dim=-1, keepdim=True).values + row_max = torch.where( + torch.isfinite(row_max), row_max, torch.zeros_like(row_max) + ) + row_min = torch.where( + torch.isfinite(row_min), row_min, torch.zeros_like(row_min) + ) + scores = torch.where(torch.isposinf(scores), row_max, scores) + scores = torch.where(torch.isneginf(scores), row_min, scores) + scores = torch.nan_to_num(scores, nan=0.0) + return scores + def generate(self, **args): """ Generates the model output with scores from batch formed by HF Tokenizer. @@ -466,14 +497,15 @@ def generate(self, **args): """ default_params = asdict(self.generation_parameters) - # add ScoresProcessor to collect original scores + # add ScoresProcessor and SanitizeLogitsProcessor processor = self._ScoresProcessor() + sanitizer = self._SanitizeLogitsProcessor() if "logits_processor" in args.keys(): logits_processor = LogitsProcessorList( - [processor, args["logits_processor"]] + [sanitizer, processor, args["logits_processor"]] ) else: - logits_processor = LogitsProcessorList([processor]) + logits_processor = LogitsProcessorList([sanitizer, processor]) args["logits_processor"] = logits_processor # update default parameters with passed arguments