diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index a8c7754..02e56a6 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -95,15 +95,17 @@ def get_token_throughput_latencies(
     ):
         iter += 1

-        default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
-        default_sampling_params.update(additional_sampling_params)
-        request_config = RequestConfig(
-            model=model,
-            prompt=prompts.pop(),
-            sampling_params=default_sampling_params,
-            llm_api=llm_api,
-        )
-        req_launcher.launch_requests(request_config)
+        if iter <= max_num_completed_requests:
+            default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
+            default_sampling_params.update(additional_sampling_params)
+            request_config = RequestConfig(
+                model=model,
+                prompt=prompts.pop(),
+                sampling_params=default_sampling_params,
+                llm_api=llm_api,
+            )
+            req_launcher.launch_requests(request_config)
+
         # Retrieving results less frequently allows for more concurrent requests
         # to be launched. This will overall reduce the amount of time it takes
         # for the test to run.
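
For context, a minimal runnable sketch of the loop this hunk patches. This is not llmperf's actual code: launch_fn and poll_fn are hypothetical stand-ins for req_launcher.launch_requests and its result retrieval, and the polling interval of 10 is illustrative. The outer while keeps spinning until enough requests have completed, so without the new `if iter <= max_num_completed_requests:` guard it would keep launching fresh requests while waiting on in-flight ones and overshoot the requested total.

import time

def run_benchmark(prompts, max_num_completed_requests, test_timeout_s,
                  launch_fn, poll_fn):
    """Sketch of the patched launch loop; names and polling cadence are
    assumptions, not llmperf's exact implementation."""
    start_time = time.monotonic()
    iter = 0  # mirrors the patched code, which also shadows the builtin
    completed = []
    while (time.monotonic() - start_time < test_timeout_s
           and len(completed) < max_num_completed_requests):
        iter += 1
        # The guard added by the patch: stop launching once the requested
        # total is reached, while the loop keeps running to collect results.
        if iter <= max_num_completed_requests:
            launch_fn(prompts.pop())
        # Retrieve results only every 10th iteration so more launches can
        # happen back to back, per the rationale in the patched comment.
        if not iter % 10:
            completed.extend(poll_fn())
    completed.extend(poll_fn())  # drain whatever finished after the loop
    return completed

# Toy usage: "requests" complete instantly, so the cap is easy to observe.
pending = []
results = run_benchmark(
    prompts=[f"p{i}" for i in range(100)],
    max_num_completed_requests=5,
    test_timeout_s=1.0,
    launch_fn=pending.append,
    poll_fn=lambda: [pending.pop() for _ in range(len(pending))],
)
assert len(results) == 5  # launches are capped at the requested total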