diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py
index a438fe9..a62df98 100644
--- a/aiu_fms_testing_utils/scripts/drive_paged_programs.py
+++ b/aiu_fms_testing_utils/scripts/drive_paged_programs.py
@@ -421,6 +421,18 @@ def __load_validation_info(
     **extra_kwargs,
 )
 
+# Run one extra, throwaway inference call to work around an issue on z/OS where
+# the first inference result is always incorrect when running multi-AIU (issue 173).
+extract_validation_information(  # NOTE(review): not gated on z/OS or multi-AIU; return value is discarded
+    model,
+    input_ids,
+    max_new_tokens,
+    post_iteration_hook=None,  # no per-iteration hook for this throwaway run
+    last_n_tokens=64,
+    prefill_chunk_size=args.prefill_chunk_size,
+    **extra_kwargs,
+)
+
 if USE_DISTRIBUTED:
     # wait for rank0 to be finished as it is the only one generating the criteria json
     # this is needed since otherwise we may run into a race condition