     Partition
 )
 
+
 class SUMMAMatrixMult(MPILinearOperator):
     def __init__(
         self,
-        A: NDArray, #I am going to have to assume that the partitioning has been done correctly
+        A: NDArray,
         N: int,
         base_comm: MPI.Comm = MPI.COMM_WORLD,
         dtype: DTypeLike = "float64",
     ) -> None:
-        rank = base_comm.Get_rank()
-        nProcs = base_comm.Get_size()
-        self._P_prime = int(math.ceil(math.sqrt(nProcs)))
-        self._C = int(math.ceil(nProcs / self._P_prime))
-        assert self._P_prime * self._C >= nProcs
+        rank = base_comm.Get_rank()
+        size = base_comm.Get_size()
+
+        # Determine grid dimensions (P_prime × C) such that P_prime * C ≥ size
+        self._P_prime = int(math.ceil(math.sqrt(size)))
+        self._C = int(math.ceil(size / self._P_prime))
+        assert self._P_prime * self._C >= size
+
+        # Compute this process's group and layer indices
+        self._group_id = rank % self._P_prime
+        self._layer_id = rank // self._P_prime
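+        # (Illustration only, not enforced by the code: with size=6 this gives
+        #  P_prime=3 and C=2, so rank 4 lands in group 1, layer 1 of a 3×2 grid)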
+
+        # Split communicators by layer (rows) and by group (columns)
+        self.base_comm = base_comm
+        self._layer_comm = base_comm.Split(color=self._layer_id, key=self._group_id)
+        self._group_comm = base_comm.Split(color=self._group_id, key=self._layer_id)
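+        # layer_comm links the up-to-P_prime ranks that share a layer;
+        # group_comm links the up-to-C ranks that share a group across layers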
 
-        self.N = N
         self.A = A
-        self._my_group = rank % self._P_prime
-        self._my_layer = rank // self._P_prime
-        self._layer_comm = base_comm.Split(color=self._my_layer, key=self._my_group)
-        self._group_comm = base_comm.Split(color=self._my_group, key=self._my_layer)
-        K_global = A.shape[1]
-
-        blk_cols = int(math.ceil(self.N / self._P_prime))
-        col_start = self._my_group * blk_cols
-        col_end = min(self.N, col_start + blk_cols)
-        my_own_cols = col_end - col_start
-        total_cols = base_comm.allreduce(my_own_cols, op=MPI.SUM)
-        self.dims = (K_global, total_cols)
 
-        super().__init__(shape=(1, int(np.prod(self.dims))), dtype=np.dtype(dtype), base_comm=base_comm)
+        self.M = self._layer_comm.allreduce(self.A.shape[0], op=MPI.SUM)
+        self.K = A.shape[1]
+        self.N = N
 
+        # Determine how many columns each group holds
+        block_cols = int(math.ceil(self.N / self._P_prime))
+        local_col_start = self._group_id * block_cols
+        local_col_end = min(self.N, local_col_start + block_cols)
+        local_ncols = local_col_end - local_col_start
+
+        # Sum up the total number of input columns across all processes
+        total_ncols = base_comm.allreduce(local_ncols, op=MPI.SUM)
+        self.dims = (self.K, total_ncols)
+
+        # Recompute how many output columns each layer holds
+        layer_col_start = self._layer_id * block_cols
+        layer_col_end = min(self.N, layer_col_start + block_cols)
+        layer_ncols = layer_col_end - layer_col_start
+        total_layer_cols = self.base_comm.allreduce(layer_ncols, op=MPI.SUM)
+
+        self.dimsd = (self.M, total_layer_cols)
+        shape = (int(np.prod(self.dimsd)), int(np.prod(self.dims)))
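+        # (Illustration: M=K=N=8 on 4 ranks gives a 2×2 grid; every rank holds
+        #  an (8, 4) column block of X, each group's columns are counted once
+        #  per layer, so dims=dimsd=(8, 16) and the flat shape is (128, 128))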
+
+        super().__init__(shape=shape, dtype=np.dtype(dtype), base_comm=base_comm)
+
     def _matvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
         if x.partition != Partition.SCATTER:
-            raise ValueError(f"x should have partition={Partition.SCATTER} Got {x.partition} instead...")
+            raise ValueError(f"x should have partition={Partition.SCATTER}. Got {x.partition} instead.")
-        blk_cols = int(math.ceil(self.N / self._P_prime))
-        col_start = self._my_group * blk_cols
-        col_end = min(self.N, col_start + blk_cols)
-        my_own_cols = col_end - col_start
+        blk_cols = int(math.ceil(self.N / self._P_prime))
+        col_start = self._group_id * blk_cols
+        col_end = min(self.N, col_start + blk_cols)
+        my_own_cols = col_end - col_start
         x = x.local_array.reshape((self.dims[0], my_own_cols))
         C_local = None
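+        # SUMMA-style loop: at step t, group t's rank broadcasts its block of
+        # x within the responsible layer, which multiplies it by its local A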
         for t in range(self._P_prime):
             responsible_layer = t % self._C
-            if self._my_layer == responsible_layer:
-                B_block = self._layer_comm.bcast(x if self._my_group == t else None, root=t)
-                if t == self._my_layer: C_local = ncp.matmul(self.A, B_block)
-        self.base_comm.Barrier()
-        my_C_rows = ncp.hstack(self._group_comm.allgather(C_local))
-
-        mask = [i % self._P_prime for i in range(self.size)]
-        row_lens = self.base_comm.allgather(self.A.shape[0])
-        tot_row_lens = np.add.reduce(row_lens, 0)
-        y = DistributedArray(global_shape=(tot_row_lens, self.N),
-                             local_shapes=[(r, self.N) for r in row_lens],
-                             mask = mask,
-                             partition=Partition.SCATTER)
-        y[:] = my_C_rows
+            if self._layer_id == responsible_layer:
+                B_block = self._layer_comm.bcast(x if self._group_id == t else None, root=t)
+                if t == self._layer_id:
+                    C_local = ncp.vstack(
+                        self._layer_comm.allgather(
+                            ncp.matmul(self.A, B_block, dtype=self.dtype)
+                        )
+                    )
+
+        layer_col_start = self._layer_id * blk_cols
+        layer_col_end = min(self.N, layer_col_start + blk_cols)
+        layer_ncols = layer_col_end - layer_col_start
+        layer_col_lens = self.base_comm.allgather(layer_ncols)
+        mask = [i // self._P_prime for i in range(self.size)]
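+        # All ranks of a layer hold the same copy of the result; giving them a
+        # shared mask value lets global reductions count each copy only once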
+
+        y = DistributedArray(global_shape=(self.M * self.dimsd[1]),
+                             local_shapes=[(self.M * c) for c in layer_col_lens],
+                             mask=mask,
+                             # axis=1,
+                             partition=Partition.SCATTER,
+                             dtype=self.dtype)
+        y[:] = C_local.flatten()
+        return y
+
+    def _rmatvec(self, x: DistributedArray) -> DistributedArray:
+        ncp = get_module(x.engine)
+        if x.partition != Partition.SCATTER:
+            raise ValueError(f"x should have partition={Partition.SCATTER}. Got {x.partition} instead.")
+
+        # Determine local column block for this layer
+        blk_cols = int(math.ceil(self.N / self._P_prime))
+        layer_col_start = self._layer_id * blk_cols
+        layer_col_end = min(self.N, layer_col_start + blk_cols)
+        layer_ncols = layer_col_end - layer_col_start
+        layer_col_lens = self.base_comm.allgather(layer_ncols)
+        x = x.local_array.reshape((self.M, layer_ncols))
+
+        # Determine local row block for this process group
+        blk_rows = int(math.ceil(self.M / self._P_prime))
+        row_start = self._group_id * blk_rows
+        row_end = min(self.M, row_start + blk_rows)
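+        # Each group takes the slice of x's rows matching its share of A's
+        # rows; the layer-wide reduction below then completes A^H @ X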
+
+        B_tile = x[row_start:row_end, :]
+        A_local = self.A.T.conj()
+
+        # Pad A_local so its first dimension is divisible by _P_prime, then
+        # batch it (ncp.zeros, not np.zeros, to stay on the input's engine)
+        m, b = A_local.shape
+        r = math.ceil(m / self._P_prime)
+        A_pad = ncp.zeros((r * self._P_prime, b), dtype=self.dtype)
+        A_pad[:m, :] = A_local
+        A_batch = A_pad.reshape(self._P_prime, r, b)
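+        # Each of the P_prime row chunks multiplies the same B_tile, so the
+        # batched product below is a blocked equivalent of A_local @ B_tile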
+
+        # Perform local matmul and unpad
+        Y_batch = ncp.matmul(A_batch, B_tile)
+        Y_pad = Y_batch.reshape(r * self._P_prime, -1)
+        y_local = Y_pad[:m, :]
+        y_layer = self._layer_comm.allreduce(y_local, op=MPI.SUM)
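+        # Every group contributed a partial A_g^H @ X_g; summing across the
+        # layer yields the full adjoint product for this layer's columns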
+
+        mask = [i // self._P_prime for i in range(self.size)]
+        y = DistributedArray(
+            global_shape=(self.K * self.dimsd[1]),
+            local_shapes=[self.K * c for c in layer_col_lens],
+            mask=mask,
+            # axis=1,
+            partition=Partition.SCATTER,
+            dtype=self.dtype,
+        )
+        y[:] = y_layer.flatten()
         return y
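For reviewers, a minimal driver sketch for the new operator. This is illustrative only: the sizes, the 4-rank grid, the list-of-ints local_shapes, and the mask all mirror the constructor's own partitioning logic above, and `@` is assumed to dispatch to matvec as in pylops. Run with mpiexec -n 4.

import math
import numpy as np
from mpi4py import MPI
from pylops_mpi import DistributedArray, Partition

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
P_prime = int(math.ceil(math.sqrt(size)))          # 2 for size=4
group_id = rank % P_prime

M = K = N = 8                                      # hypothetical global sizes
A_local = np.ones((M // P_prime, K))               # this rank's row block of A
Aop = SUMMAMatrixMult(A_local, N, base_comm=comm)

# Build a scattered x holding this group's (K, ncols) column block, flattened
blk_cols = int(math.ceil(N / P_prime))
ncols = min(N, (group_id + 1) * blk_cols) - group_id * blk_cols
x = DistributedArray(global_shape=Aop.shape[1],
                     local_shapes=comm.allgather(K * ncols),
                     mask=[i % P_prime for i in range(size)],
                     partition=Partition.SCATTER)
x[:] = np.ones(K * ncols)

y = Aop @ x                                        # distributed A @ X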