diff --git a/README.md b/README.md
index 1687e30d..21b2b29e 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative
   - `prompt_tokens`: Average number of tokens for prompts.
   - `output_tokens`: Average number of tokens for outputs.
-  - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`). If not provided, will use the provided tokens value only.
+  - `turns`: Average number of request-response pairs per sample. Values above `1` result in a multi-turn[^1] benchmark.
+  - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`, `turns`). If not provided, only the specified average value is used.
   - `samples`: Number of samples to generate, defaults to 1000.
   - `source`: Source text data for generation, defaults to a local copy of Pride and Prejudice.
@@ -208,3 +209,7 @@ If you find GuideLLM helpful in your research or projects, please consider citin
   howpublished={\url{https://github.com/vllm-project/guidellm}},
 }
 ```
+
+- - -
+
+[^1]: Multi-turn refers to a benchmark where each dataset row represents a series of sequential requests, with each subsequent request building upon the context of the previous ones.
diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py
index 2ef85c3e..31f936af 100644
--- a/src/guidellm/benchmark/entrypoints.py
+++ b/src/guidellm/benchmark/entrypoints.py
@@ -90,11 +90,11 @@ async def benchmark_generative_text(
         ),
         random_seed=random_seed,
     )
-    unique_requests = request_loader.num_unique_items(raise_err=False)
+    unique_samples = request_loader.num_unique_items(raise_err=False)
     console.print_line(
-        f"Created loader with {unique_requests} unique requests from {data}.\n\n"
-        if unique_requests > 0
-        else f"Created loader with unknown number unique requests from {data}.\n\n"
+        f"Created loader with {unique_samples} unique samples from {data}.\n\n"
+        if unique_samples > 0
+        else f"Created loader with an unknown number of unique samples from {data}.\n\n"
     )

     profile = create_profile(rate_type=rate_type, rate=rate)
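To make the README text above concrete, here is a minimal sketch of the new multi-turn knobs. The field names come from the `SyntheticDatasetConfig` hunk below; the values are illustrative, and the snippet assumes a guidellm install that includes this branch:

```python
# Sketch: constructing the synthetic dataset config with the new turns fields
# (illustrative values; assumes this branch of guidellm is installed).
from guidellm.dataset.synthetic import SyntheticDatasetConfig

config = SyntheticDatasetConfig(
    prompt_tokens=256,   # average prompt length per turn
    output_tokens=128,   # average output length per turn
    turns=3,             # average request-response pairs per sample
    turns_stdev=1,       # optional spread around that average
    turns_min=1,
    turns_max=5,
)
```

In the `--data` short form this would presumably read `prompt_tokens=256,output_tokens=128,turns=3,...`, mirroring the README examples; that spelling is inferred from the docs above rather than shown in this patch.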
diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py
index 9868ab52..d7839718 100644
--- a/src/guidellm/dataset/synthetic.py
+++ b/src/guidellm/dataset/synthetic.py
@@ -2,7 +2,7 @@
 import random
 from collections.abc import Iterable, Iterator
 from pathlib import Path
-from typing import Any, Literal, Optional, Union
+from typing import Any, Optional, TypedDict, Union

 import yaml
 from datasets import (
@@ -63,6 +63,26 @@ class SyntheticDatasetConfig(BaseModel):
         gt=0,
         default=None,
     )
+    turns: int = Field(
+        description="The average number of turns in the conversation.",
+        gt=0,
+        default=1,
+    )
+    turns_stdev: Optional[int] = Field(
+        description="The standard deviation of the number of turns.",
+        gt=0,
+        default=None,
+    )
+    turns_min: Optional[int] = Field(
+        description="The minimum number of turns in the conversation.",
+        gt=0,
+        default=None,
+    )
+    turns_max: Optional[int] = Field(
+        description="The maximum number of turns in the conversation.",
+        gt=0,
+        default=None,
+    )
     samples: int = Field(
         description="The number of samples to generate for the dataset.",
         gt=0,
@@ -118,14 +138,13 @@ def parse_config_file(data: Union[str, Path]) -> "SyntheticDatasetConfig":
         return SyntheticDatasetConfig(**config_dict)


-class SyntheticTextItemsGenerator(
-    Iterable[
-        dict[
-            Literal["prompt", "prompt_tokens_count", "output_tokens_count"],
-            Union[str, int],
-        ]
-    ]
-):
+class SyntheticDatasetRow(TypedDict):
+    prompt: list[str]
+    prompt_tokens_count: list[int]
+    output_tokens_count: list[int]
+
+
+class SyntheticTextItemsGenerator(Iterable[SyntheticDatasetRow]):
     def __init__(
         self,
         config: SyntheticDatasetConfig,
@@ -141,12 +160,7 @@ def __init__(

     def __iter__(
         self,
-    ) -> Iterator[
-        dict[
-            Literal["prompt", "prompt_tokens_count", "output_tokens_count"],
-            Union[str, int],
-        ]
-    ]:
+    ) -> Iterator[SyntheticDatasetRow]:
         prompt_tokens_sampler = IntegerRangeSampler(
             average=self.config.prompt_tokens,
             variance=self.config.prompt_tokens_stdev,
@@ -161,20 +175,33 @@ def __iter__(
             max_value=self.config.output_tokens_max,
             random_seed=self.random_seed + 1,  # ensure diff dist from prompts
         )
+        turns_sampler = IntegerRangeSampler(
+            average=self.config.turns,
+            variance=self.config.turns_stdev,
+            min_value=self.config.turns_min,
+            max_value=self.config.turns_max,
+            random_seed=self.random_seed + 7,  # ensure diff dist
+        )
         # ensure diff distribution from output tokens
         rand = random.Random(self.random_seed + 2)  # noqa: S311

-        for _, prompt_tokens, output_tokens in zip(
-            range(self.config.samples),
-            prompt_tokens_sampler,
-            output_tokens_sampler,
-        ):
-            start_index = rand.randint(0, len(self.text_creator.words))
-            yield {
-                "prompt": self._create_prompt(prompt_tokens, start_index),
-                "prompt_tokens_count": prompt_tokens,
-                "output_tokens_count": output_tokens,
+        for _, turns in zip(range(self.config.samples), turns_sampler):
+            row: SyntheticDatasetRow = {
+                "prompt": [],
+                "prompt_tokens_count": [],
+                "output_tokens_count": [],
             }
+            for _, prompt_tokens, output_tokens in zip(
+                range(turns),
+                prompt_tokens_sampler,
+                output_tokens_sampler,
+            ):
+                start_index = rand.randint(0, len(self.text_creator.words))
+                row["prompt"].append(self._create_prompt(prompt_tokens, start_index))
+                row["prompt_tokens_count"].append(prompt_tokens)
+                row["output_tokens_count"].append(output_tokens)
+
+            yield row

     def _create_prompt(self, prompt_tokens: int, start_index: int) -> str:
         if prompt_tokens <= 0:
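The structural change in this file is easiest to see with the row shape written out. A self-contained sketch using a standalone copy of the `SyntheticDatasetRow` TypedDict above; the values are illustrative, not real generator output:

```python
from typing import TypedDict


class SyntheticDatasetRow(TypedDict):
    prompt: list[str]
    prompt_tokens_count: list[int]
    output_tokens_count: list[int]


# A sampled `turns` of 3 yields three aligned entries per list; single-turn
# datasets keep working as one-element lists.
row: SyntheticDatasetRow = {
    "prompt": ["first turn text", "second turn text", "third turn text"],
    "prompt_tokens_count": [256, 250, 261],
    "output_tokens_count": [128, 130, 125],
}
assert len(row["prompt"]) == len(row["output_tokens_count"]) == 3
```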
diff --git a/src/guidellm/preprocess/item.py b/src/guidellm/preprocess/item.py
new file mode 100644
index 00000000..91801de8
--- /dev/null
+++ b/src/guidellm/preprocess/item.py
@@ -0,0 +1,47 @@
+from collections.abc import Sequence
+from typing import Generic, Optional, TypeVar
+
+from pydantic import Field
+
+from guidellm.objects.pydantic import StandardBaseModel
+
+PromptT = TypeVar("PromptT")
+
+
+class Item(StandardBaseModel, Generic[PromptT]):
+    """
+    Represents a single item in a dataset,
+    containing a prompt and its associated metadata.
+    """
+
+    value: PromptT = Field(
+        description="The prompt text or data for the item.",
+        examples=[
+            "What is the capital of France?",
+            "Explain quantum computing in simple terms.",
+        ],
+    )
+    prompt_tokens: Optional[int] = Field(
+        default=None, gt=0, description="Number of tokens in the prompt"
+    )
+    output_tokens: Optional[int] = Field(
+        default=None, gt=0, description="Number of tokens in the output"
+    )
+
+
+class ItemList(Sequence[Item[PromptT]]):
+    """
+    Represents a list of items, each containing a prompt and its metadata.
+    """
+
+    shared_prefix: Optional[PromptT]
+
+    def __init__(self, *items: Item[PromptT], shared_prefix: Optional[PromptT] = None):
+        self.shared_prefix = shared_prefix
+        self._items = list(items)
+
+    def __getitem__(self, key):
+        return self._items[key]
+
+    def __len__(self) -> int:
+        return len(self._items)
diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py
index 452e4733..082b8697 100644
--- a/src/guidellm/request/loader.py
+++ b/src/guidellm/request/loader.py
@@ -11,10 +11,9 @@
 from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from transformers import PreTrainedTokenizerBase  # type: ignore[import]

-from guidellm.config import settings
 from guidellm.dataset import ColumnInputTypes, load_dataset
 from guidellm.objects import StandardBaseModel
-from guidellm.request.request import GenerationRequest
+from guidellm.preprocess.item import Item, ItemList
 from guidellm.request.session import GenerativeRequestSession

 __all__ = [
@@ -113,7 +112,7 @@ def __iter__(self) -> Iterator[GenerativeRequestSession]:
             scope_create_count += 1

             for item in dataset_iter:
-                yield GenerativeRequestSession(self._create_request(item))
+                yield GenerativeRequestSession(self._create_items(item))

             self._preserved_iter = None

@@ -261,7 +260,8 @@ def _get_dataset_iter(

         return dataset_iter

-    def _create_request(self, item: dict[str, Any]) -> GenerationRequest:
+    def _create_items(self, item: dict[str, Any]) -> ItemList:
+        prompts = item[self.column_mappings["prompt_column"]]
         prompt_tokens = (
             item[self.column_mappings["prompt_tokens_count_column"]]
             if "prompt_tokens_count_column" in self.column_mappings
@@ -273,13 +273,15 @@ def _create_request(self, item: dict[str, Any]) -> GenerationRequest:
             else None
         )

-        return GenerationRequest(
-            request_type=settings.preferred_route,
-            content=item[self.column_mappings["prompt_column"]],
-            stats=(
-                {"prompt_tokens": prompt_tokens} if prompt_tokens is not None else {}
-            ),
-            constraints=(
-                {"output_tokens": output_tokens} if output_tokens is not None else {}
-            ),
+        prompts = prompts if isinstance(prompts, list) else [prompts]
+        # Broadcast scalar token counts across every prompt so that zip()
+        # does not truncate multi-turn rows that lack per-turn counts
+        if not isinstance(prompt_tokens, list):
+            prompt_tokens = [prompt_tokens] * len(prompts)
+        if not isinstance(output_tokens, list):
+            output_tokens = [output_tokens] * len(prompts)
+        items = (
+            Item(value=prompt, output_tokens=out_t, prompt_tokens=in_t)
+            for prompt, in_t, out_t in zip(prompts, prompt_tokens, output_tokens)
         )
+        return ItemList(*items)
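For reference, a minimal sketch of the `ItemList` that `_create_items` now produces and hands to each session. Prompts and token counts are illustrative, and the snippet assumes this branch is installed:

```python
# Sketch: the shape GenerativeRequestLoader yields per dataset row.
from guidellm.preprocess.item import Item, ItemList

items = ItemList(
    Item(value="What is the capital of France?", prompt_tokens=8, output_tokens=32),
    Item(value="And what is its population?", prompt_tokens=7, output_tokens=32),
)
assert len(items) == 2  # Sequence protocol via __len__ ...
first = items[0]        # ... and __getitem__
```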
diff --git a/src/guidellm/request/session.py b/src/guidellm/request/session.py
index 6ea7633b..e52795c3 100644
--- a/src/guidellm/request/session.py
+++ b/src/guidellm/request/session.py
@@ -1,15 +1,18 @@
+import itertools
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
+from typing import TYPE_CHECKING, Generic
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence

 from guidellm.backend.response import ResponseSummary
+from guidellm.config import settings
+from guidellm.preprocess.item import Item, ItemList
 from guidellm.request.request import GenerationRequest
+from guidellm.request.types import RequestT, ResponseT

 __all__ = ["GenerativeRequestSession", "RequestSession"]

-# TODO: Replace with specific types that implement needed features
-RequestT = TypeVar("RequestT")
-ResponseT = TypeVar("ResponseT")
-

 class RequestSession(ABC, Generic[RequestT, ResponseT]):
     @abstractmethod
@@ -29,24 +32,60 @@ def push_response(self, response: ResponseT) -> None: ...
     def complete(self) -> bool: ...


-# TODO: Implement multiturn support
 class GenerativeRequestSession(RequestSession[GenerationRequest, ResponseSummary]):
-    def __init__(self, request: GenerationRequest) -> None:
-        self.request = request
-        self._complete = False
+    def __init__(self, items: ItemList) -> None:
+        if len(items) < 1:
+            raise ValueError("Prompts cannot be empty")
+
+        self.prompts: Sequence[Item] = items
+        self.responses: list[Item] = []

     def __len__(self) -> int:
-        return 1
+        return len(self.prompts)

     def get_next_request(self) -> GenerationRequest:
-        return self.request
+        completed_responses = len(self.responses)
+
+        # FIXME: Can only handle string requests
+        content = "".join(
+            itertools.chain.from_iterable(
+                (x.value, y.value)
+                for x, y in zip(self.prompts, self.responses + [Item(value="")])
+            )
+        )
+
+        prev_prompt_tokens = sum(
+            (x.prompt_tokens or 0) + (x.output_tokens or 0) for x in self.responses
+        )
+        prompt_tokens = (
+            self.prompts[completed_responses].prompt_tokens or 0
+        ) + prev_prompt_tokens
+
+        output_tokens = self.prompts[completed_responses].output_tokens
+
+        return GenerationRequest(
+            request_type=settings.preferred_route,
+            content=content,
+            stats=({"prompt_tokens": prompt_tokens} if prompt_tokens else {}),
+            constraints=({"output_tokens": output_tokens} if output_tokens else {}),
+        )

     def get_next_delay(self) -> float:
         return 0.0

-    def push_response(self, response: ResponseSummary) -> None:  # noqa: ARG002
-        self._complete = True
+    def push_response(self, response: ResponseSummary) -> None:
+        if len(self.responses) < len(self.prompts):
+            resp = Item(
+                value=response.value,
+                prompt_tokens=response.response_prompt_tokens
+                or response.request_prompt_tokens,
+                output_tokens=response.response_output_tokens
+                or response.request_output_tokens,
+            )
+            self.responses.append(resp)
+        else:
+            raise ValueError("Response list full")

     @property
     def complete(self) -> bool:
-        return self._complete
+        return len(self.responses) >= len(self.prompts)
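The concatenation in `get_next_request` above is the core of the multi-turn behavior, so here is a self-contained illustration of the same accumulation logic, with plain strings standing in for `Item.value`:

```python
import itertools

# After one completed turn: three prompts queued, one response recorded.
prompts = ["Q1. ", "Q2. ", "Q3. "]
responses = ["A1. "]

# zip() stops at the shorter iterable, so padding responses with one empty
# string makes the next unanswered prompt the final chunk of the content.
content = "".join(
    itertools.chain.from_iterable(zip(prompts, responses + [""]))
)
assert content == "Q1. A1. Q2. "
```

Each subsequent request therefore carries the full prior context, and `complete` flips to true once every queued prompt has a matching response.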