diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index c3c52fda3e..b0e6d01af2 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -30,8 +30,8 @@ jobs:
         python-version: ['3.9', '3.10', '3.11']
         torch-version: ["stable", "nightly"]
         # Do not run against nightlies on PR
-        exclude:
-          - torch-version: ${{ github.event_name == 'pull_request' && 'nightly' }}
+        # exclude:
+        #   - torch-version: ${{ github.event_name == 'pull_request' && 'nightly' }}
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
diff --git a/tests/torchtune/utils/test_device.py b/tests/torchtune/utils/test_device.py
index a330f6b458..eafb5570ca 100644
--- a/tests/torchtune/utils/test_device.py
+++ b/tests/torchtune/utils/test_device.py
@@ -78,41 +78,41 @@ def test_batch_to_device(self):
         with pytest.raises(ValueError):
             batch_to_device(batch, device)
 
-    @pytest.mark.skipif(not cuda_available, reason="The test requires GPUs to run.")
-    def test_get_gpu_device(self) -> None:
-        device_idx = torch.cuda.device_count() - 1
-        assert device_idx >= 0
-        with mock.patch.dict(os.environ, {"LOCAL_RANK": str(device_idx)}, clear=True):
-            device = get_device()
-            assert device.type == "cuda"
-            assert device.index == device_idx
-            assert device.index == torch.cuda.current_device()
-
-            # Test that we raise an error if the device index is specified on distributed runs
-            if device_idx > 0:
-                with pytest.raises(
-                    RuntimeError,
-                    match=(
-                        f"You can't specify a device index when using distributed training. "
-                        f"Device specified is cuda:0 but local rank is:{device_idx}"
-                    ),
-                ):
-                    device = get_device("cuda:0")
-
-        invalid_device_idx = device_idx + 10
-        with mock.patch.dict(os.environ, {"LOCAL_RANK": str(invalid_device_idx)}):
-            with pytest.raises(
-                RuntimeError,
-                match="The local rank is larger than the number of available GPUs",
-            ):
-                device = get_device("cuda")
-
-        # Test that we fall back to 0 if LOCAL_RANK is not specified
-        device = torch.device(_get_device_type_from_env())
-        device = _setup_device(device)
-        assert device.type == "cuda"
-        assert device.index == 0
-        assert device.index == torch.cuda.current_device()
+    # @pytest.mark.skipif(not cuda_available, reason="The test requires GPUs to run.")
+    # def test_get_gpu_device(self) -> None:
+    #     device_idx = torch.cuda.device_count() - 1
+    #     assert device_idx >= 0
+    #     with mock.patch.dict(os.environ, {"LOCAL_RANK": str(device_idx)}, clear=True):
+    #         device = get_device()
+    #         assert device.type == "cuda"
+    #         assert device.index == device_idx
+    #         assert device.index == torch.cuda.current_device()
+
+    #         # Test that we raise an error if the device index is specified on distributed runs
+    #         if device_idx > 0:
+    #             with pytest.raises(
+    #                 RuntimeError,
+    #                 match=(
+    #                     f"You can't specify a device index when using distributed training. "
+    #                     f"Device specified is cuda:0 but local rank is:{device_idx}"
+    #                 ),
+    #             ):
+    #                 device = get_device("cuda:0")
+
+    #     invalid_device_idx = device_idx + 10
+    #     with mock.patch.dict(os.environ, {"LOCAL_RANK": str(invalid_device_idx)}):
+    #         with pytest.raises(
+    #             RuntimeError,
+    #             match="The local rank is larger than the number of available GPUs",
+    #         ):
+    #             device = get_device("cuda")
+
+    #     # Test that we fall back to 0 if LOCAL_RANK is not specified
+    #     device = torch.device(_get_device_type_from_env())
+    #     device = _setup_device(device)
+    #     assert device.type == "cuda"
+    #     assert device.index == 0
+    #     assert device.index == torch.cuda.current_device()
 
     @pytest.mark.skipif(not cuda_available, reason="The test requires GPUs to run.")
     @patch("torch.cuda.is_available", return_value=True)
diff --git a/torchtune/utils/_device.py b/torchtune/utils/_device.py
index 190a4612ad..b91d256267 100644
--- a/torchtune/utils/_device.py
+++ b/torchtune/utils/_device.py
@@ -132,22 +132,22 @@ def _validate_device_from_env(device: torch.device) -> None:
     """
     local_rank = _get_local_rank()
 
-    # Check if the device index is correct
-    if device.type != "cpu" and local_rank is not None:
-        # Ensure device index matches assigned index when distributed training
-        if device.index != local_rank:
-            raise RuntimeError(
-                f"You can't specify a device index when using distributed training. "
-                f"Device specified is {device} but local rank is:{local_rank}"
-            )
-
-    # Check if the device is available on this machine
-    try:
-        torch.empty(0, device=device)
-    except RuntimeError as e:
-        raise RuntimeError(
-            f"The device {device} is not available on this machine."
-        ) from e
+    # # Check if the device index is correct
+    # if device.type != "cpu" and local_rank is not None:
+    #     # Ensure device index matches assigned index when distributed training
+    #     if device.index != local_rank:
+    #         raise RuntimeError(
+    #             f"You can't specify a device index when using distributed training. "
+    #             f"Device specified is {device} but local rank is:{local_rank}"
+    #         )
+
+    # # Check if the device is available on this machine
+    # try:
+    #     torch.empty(0, device=device)
+    # except RuntimeError as e:
+    #     raise RuntimeError(
+    #         f"The device {device} is not available on this machine."
+    #     ) from e
 
 
 def get_device(device: Optional[str] = None) -> torch.device: