basic mac support (#158)

d4l3k · web-flow · commit 9cc565cf9fec · 2025-04-08T10:08:00.000-07:00
diff --git a/.github/workflows/unittest-mac.yaml b/.github/workflows/unittest-mac.yaml
@@ -0,0 +1,70 @@
+name: Unit Tests
+# Triggered by pushes to the main branch and by pull requests.
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  unittest-mac:
+    runs-on: macos-m2-15
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: "3.12"  # quoted so YAML keeps it a string (an unquoted 3.10 would parse as 3.1)
+
+      - name: Install Rust
+        run: |
+          set -exo pipefail
+          # Restrict curl to HTTPS/TLS 1.2+ as rustup's install docs do; pipefail makes a failed download fail the step.
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain=stable --profile=default -y
+          . "$HOME/.cargo/env"
+
+      - name: Install Dependencies
+        run: |
+          set -ex
+
+          if [[ -n "$CONDA_ENV" ]]; then
+            # Use binaries under conda environment
+            export PATH="$CONDA_ENV/bin":$PATH
+          fi
+          . "$HOME/.cargo/env"
+
+          conda install libprotobuf -y
+
+          python -m pip install --upgrade pip
+
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          # Quote the extras spec so bash never treats ".[dev]" as a glob pattern.
+          pip install -e '.[dev]' -v
+
+      - name: Run Python Tests
+        run: |
+          set -ex
+
+          if [[ -n "$CONDA_ENV" ]]; then
+            # Put the conda environment's bin directory first on PATH when CONDA_ENV is set
+            export PATH="$CONDA_ENV/bin":$PATH
+          fi
+
+          # Run the Python test suite with verbose per-test output
+          pytest -v
+
+      - name: Run Rust Tests
+        run: |
+          set -ex
+
+          if [[ -n "$CONDA_ENV" ]]; then
+            # Use binaries under conda environment
+            export PATH="$CONDA_ENV/bin":$PATH
+          fi
+          . "$HOME/.cargo/env"
+          # Pass "-undefined dynamic_lookup" to the linker so undefined symbols are left for load-time lookup (presumably Python C-API symbols; confirm).
+          export RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup"
+
+          cargo test -v
diff --git a/torchft/checkpointing/pg_transport_test.py b/torchft/checkpointing/pg_transport_test.py
@@ -1,5 +1,6 @@
+import sys
 from datetime import timedelta
-from unittest import TestCase, skipUnless
+from unittest import TestCase, skipIf, skipUnless
 
 import torch
 from torch.distributed import TCPStore
@@ -14,6 +15,8 @@
 
 
 class PGTransportTest(TestCase):
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipIf(sys.platform == "darwin", "not passing on mac")
     def test_pg_transport_gloo(self) -> None:
         store: TCPStore = TCPStore(
             host_name="localhost", port=0, is_master=True, wait_for_workers=False
diff --git a/torchft/multiprocessing_test.py b/torchft/multiprocessing_test.py
@@ -25,7 +25,10 @@ def test_monitored_queue_put(self) -> None:
 
         mq = _MonitoredPipe(local)
         mq.send(1)
-        with self.assertRaisesRegex(ConnectionResetError, "Connection reset by peer"):
+        with self.assertRaisesRegex(
+            (ConnectionResetError, BrokenPipeError),
+            "(Connection reset by peer|Broken pipe)",
+        ):
             while True:
                 mq.send(1)
 
diff --git a/torchft/process_group.py b/torchft/process_group.py
@@ -40,14 +40,12 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
-# pyre-fixme[21]: no attribute ProcessGroupNCCL
 # pyre-fixme[21]: no attribute ProcessGroupGloo
 from torch.distributed import (
     DeviceMesh,
     PrefixStore,
     ProcessGroup as BaseProcessGroup,
     ProcessGroupGloo as BaseProcessGroupGloo,
-    ProcessGroupNCCL as BaseProcessGroupNCCL,
     Store,
     TCPStore,
 )
@@ -687,6 +685,9 @@ def _wrap_work(self, work: Work, opts: object) -> Work:
         return _WorkCUDATimeout(self, work, timeout)
 
     def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
+        # pyre-fixme[21]: no attribute ProcessGroupNCCL
+        from torch.distributed import ProcessGroupNCCL as BaseProcessGroupNCCL
+
         self._errored = None
 
         pg = BaseProcessGroup(store, rank, world_size)
@@ -1717,6 +1718,8 @@ class ProcessGroupBabyNCCL(ProcessGroupBaby):
 
     @classmethod
     def _create_pg(cls, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
+        from torch.distributed import ProcessGroupNCCL as BaseProcessGroupNCCL
+
         pg = BaseProcessGroup(store, rank, world_size)
         pg._set_default_backend(ProcessGroup.BackendType.NCCL)
         # pyre-fixme[16]: no attribute ProcessGroupNCCL
diff --git a/torchft/process_group_test.py b/torchft/process_group_test.py
@@ -6,10 +6,11 @@
 
 import gc
 import os
+import sys
 from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
 from datetime import timedelta
 from typing import Any, Callable, Dict, List, cast
-from unittest import TestCase, skipUnless
+from unittest import TestCase, skipIf, skipUnless
 from unittest.mock import Mock
 
 import torch
@@ -949,7 +950,7 @@ def worker(pg: ProcessGroup, rank: int, dev: str) -> str:
             # nccl: Tensor-likes are not equal/not close (due to abort)
             with self.assertRaisesRegex(
                 Exception,
-                r"(Connection closed by peer|Timed out waiting|no error|Read error|not equal|not close)",
+                r"(Connection closed by peer|timed out after|Timed out waiting|no error|Read error|not equal|not close)",
             ):
                 test(pg, rank, t1.clone())
                 raise RuntimeError("no error")
@@ -992,6 +993,7 @@ def test_collective_with_resiliency(self, collective: str) -> None:
         self._run_with_resiliency(collective, device="cpu")
 
 
+@skipIf(sys.platform == "darwin", "not reliable on mac")
 class BabyGlooMultiPgTest(MultiPgBaseTest):
     BACKEND = "baby_gloo"
     WORLD_SIZE = 3