 
 # TODO: Use `pytest_addoption` instead?
 # Keep all results in one place to allow recovering them for debugging in case of failure.
-TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests"))
+TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve()
 FORCE_REUSE_RESULTS = int(os.environ.get("FORCE_REUSE_RESULTS", 0)) != 0
 REUSE_RESULTS = FORCE_REUSE_RESULTS or int(os.environ.get("REUSE_RESULTS", 0)) != 0
 _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13))
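Side note on this hunk: resolving the path once at module level makes `TEST_RESULTS_PATH` absolute for every consumer, which is why the hunk below can drop the per-call `.resolve()` when building `path = TEST_RESULTS_PATH / name`. A minimal illustration of the assumed `pathlib` behavior, not part of the change:

import os
import pathlib

# Hypothetical override: any writable directory works; a relative path is made absolute once here.
os.environ["TEST_RESULTS_PATH"] = "local_test_results"
base = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve()
print(base.is_absolute())  # True, so children like `base / "some_test"` are absolute as well.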
@@ -350,78 +350,84 @@ def get_test_concatenated_memmap_dataset(
     index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)])
 
 
-def run_test_script(
-    name: str,
-    script: list[str],
-    num_gpus: int = 1,
-    *,
-    model_type: str = TEST_MODEL_TYPE,
-    is_megatron: bool = False,
-    compare: str | None = None,
-    config: CompareConfig | None = None,
-    prepare_fn=None,
-    compare_fn=None,
-    do_compare: bool = True,
-):
-    if torch.cuda.device_count() < num_gpus:
-        pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})")
-    env = os.environ.copy()
-    if is_megatron:
-        # Prevent Megatron from complaining.
-        env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-        env["NVTE_FLASH_ATTN"] = "0"
-    path = TEST_RESULTS_PATH.resolve() / name
-    skip = False
-    artifact_path = path / ARTIFACT_PATH
-    if path.exists():
-        assert path.is_dir()
-        # TODO: Better way to check if the previous attempt succeeded.
-        if (
-            REUSE_RESULTS
-            and artifact_path.is_dir()
-            and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3)
-        ):
-            skip = True
+@pytest.fixture(scope="session")
+def run_test_script(worker_resources):
+    def do_run_test_script(
+        name: str,
+        script: list[str],
+        num_gpus: int = 1,
+        *,
+        model_type: str = TEST_MODEL_TYPE,
+        is_megatron: bool = False,
+        compare: str | None = None,
+        config: CompareConfig | None = None,
+        prepare_fn=None,
+        compare_fn=None,
+        do_compare: bool = True,
+    ):
+        if torch.cuda.device_count() < num_gpus:
+            pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})")
+        env = os.environ.copy()
+        if is_megatron:
+            # Prevent Megatron from complaining.
+            env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+            env["NVTE_FLASH_ATTN"] = "0"
+        path = TEST_RESULTS_PATH / name
+        skip = False
+        artifact_path = path / ARTIFACT_PATH
+        if path.exists():
+            assert path.is_dir()
+            # TODO: Better way to check if the previous attempt succeeded.
+            if (
+                REUSE_RESULTS
+                and artifact_path.is_dir()
+                and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3)
+            ):
+                skip = True
+            elif FORCE_REUSE_RESULTS:
+                raise RuntimeError(artifact_path)
+            else:
+                shutil.rmtree(path)
         elif FORCE_REUSE_RESULTS:
-            raise RuntimeError(artifact_path)
+            raise RuntimeError(path)
+        if prepare_fn is not None:
+            skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip)
+        if is_megatron:
+            script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"]
         else:
-            shutil.rmtree(path)
-    elif FORCE_REUSE_RESULTS:
-        raise RuntimeError(path)
-    if prepare_fn is not None:
-        skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip)
-    if is_megatron:
-        script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"]
-    else:
-        script = [model_type, *script, f"run.experiment_dir={path}"]
-    header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"]
-    command = [
-        "python",
-        "-m",
-        "torch.distributed.run",
-        f"--nproc-per-node={num_gpus}",
-        *header,
-        *script,
-    ]
-    print(" ".join(command))
-    if skip:
-        print("Reusing existing run.")
-    else:
-        get_test_dataset()
-        if num_gpus == 1 and not is_megatron:
-            CliTrainingConfig.parse_and_run(script)
+            script = [model_type, *script, f"run.experiment_dir={path}"]
+        header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"]
+        command = [
+            "python",
+            "-m",
+            "torch.distributed.run",
+            f"--nproc-per-node={num_gpus}",
+            f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}",
+            f"--master-port={worker_resources.torchrun_port}",
+            *header,
+            *script,
+        ]
+        print(" ".join(command))
+        if skip:
+            print("Reusing existing run.")
         else:
-            completed_proc = subprocess.run(command, env=env, timeout=60)
-            if completed_proc.returncode:
-                raise RuntimeError(f"Process failed with return code {completed_proc.returncode}")
-    if compare and do_compare:
-        if compare_fn is not None:
-            compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare)
-        compare_tensor_logs(
-            TEST_RESULTS_PATH / compare / ARTIFACT_PATH,
-            TEST_RESULTS_PATH / name / ARTIFACT_PATH,
-            config,
-        )
+            get_test_dataset()
+            if num_gpus == 1 and not is_megatron:
+                CliTrainingConfig.parse_and_run(script)
+            else:
+                completed_proc = subprocess.run(command, env=env, timeout=60)
+                if completed_proc.returncode:
+                    raise RuntimeError(f"Process failed with return code {completed_proc.returncode}")
+        if compare and do_compare:
+            if compare_fn is not None:
+                compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare)
+            compare_tensor_logs(
+                TEST_RESULTS_PATH / compare / ARTIFACT_PATH,
+                TEST_RESULTS_PATH / name / ARTIFACT_PATH,
+                config,
+            )
+
+    return do_run_test_script
 
 
 def materialize_meta_tensors(model, tensor_space):
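The new fixture takes a `worker_resources` argument that is not defined in this hunk. Judging by the `--rdzv-endpoint` and `--master-port` flags added to the torchrun command, it presumably gives each pytest-xdist worker its own ports so concurrent `torch.distributed.run` launches do not collide. A minimal sketch of what such a fixture could look like, assuming pytest-xdist's `worker_id` fixture and the attribute names used above; the real implementation lives elsewhere in this PR:

import dataclasses

import pytest


@dataclasses.dataclass
class WorkerResources:
    rendezvous_port: int
    torchrun_port: int


@pytest.fixture(scope="session")
def worker_resources(worker_id):
    # pytest-xdist sets `worker_id` to "master" when run without -n, else "gw0", "gw1", ...
    index = 0 if worker_id == "master" else int(worker_id.removeprefix("gw"))
    # Hand each worker a disjoint pair of ports so parallel rendezvous don't clash.
    return WorkerResources(rendezvous_port=20000 + 2 * index, torchrun_port=20001 + 2 * index)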
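Since `run_test_script` is now a session-scoped fixture returning `do_run_test_script` rather than a plain module-level helper, tests request it as an argument instead of importing it. A hypothetical call site, where `CONFIG_COMMON` stands in for whatever argument list the real tests build:

def test_model_basic(run_test_script):
    # The fixture returns `do_run_test_script`, so the call signature is unchanged.
    run_test_script("test_model_basic", CONFIG_COMMON, num_gpus=1)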