
Commit fbf892e

Merge pull request #100 from PyLops/cupy
Enable use of cupy arrays
2 parents f628bd3 + 45186ef commit fbf892e

File tree

11 files changed, +216 -65 lines changed

docs/source/gpu.rst

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
.. _gpu:

GPU Support
===========

Overview
--------
PyLops-mpi supports computations on GPUs by leveraging the GPU backend of PyLops. Under the hood,
`CuPy <https://cupy.dev/>`_ (``cupy-cudaXX>=v13.0.0``) is used to perform all of the operations.
This library must be installed *before* PyLops-mpi is installed.

.. note::

   Set the environment variable ``CUPY_PYLOPS=0`` to force PyLops to ignore the ``cupy`` backend.
   This can also be used if a previous (or faulty) version of ``cupy`` is installed in your system;
   otherwise you will get an error when importing PyLops.
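
As a quick illustration (a minimal sketch, not part of the file added in this commit), the
variable can be set from Python before ``pylops`` is first imported, assuming it is indeed
read at import time as the note above implies:

.. code-block:: python

   # disable the cupy backend by setting CUPY_PYLOPS before the first import of pylops
   import os
   os.environ["CUPY_PYLOPS"] = "0"

   import pylops      # falls back to the numpy backend
   import pylops_mpi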

The :class:`pylops_mpi.DistributedArray` and :class:`pylops_mpi.StackedDistributedArray` objects can be
generated using both ``numpy``- and ``cupy``-based local arrays, and all of the operators and solvers in PyLops-mpi
can handle both scenarios. Note that, since most operators in PyLops-mpi are thin wrappers around PyLops operators,
PyLops operators that lack a GPU implementation cannot be used in PyLops-mpi either when working with
``cupy`` arrays.
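
For instance (a minimal sketch based on the ``to_dist`` change in this commit, assuming ``cupy``
and a GPU are available), the engine is inferred automatically when a distributed array is
created from an existing local array:

.. code-block:: python

   import cupy as cp
   import pylops_mpi

   # the engine is inferred from the module of the input array
   x = cp.ones(400, dtype="float32")
   x_dist = pylops_mpi.DistributedArray.to_dist(x=x)
   print(x_dist.engine)   # -> "cupy"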

Example
-------
Finally, let's briefly look at an example. First we write a code snippet using
``numpy`` arrays, which PyLops-mpi will run on your CPU:

.. code-block:: python

   import numpy as np
   import pylops
   import pylops_mpi
   from mpi4py import MPI

   # MPI helpers
   comm = MPI.COMM_WORLD
   rank = MPI.COMM_WORLD.Get_rank()
   size = MPI.COMM_WORLD.Get_size()

   # Create distributed data (broadcast)
   nxl, nt = 20, 20
   dtype = np.float32
   d_dist = pylops_mpi.DistributedArray(global_shape=nxl * nt,
                                        partition=pylops_mpi.Partition.BROADCAST,
                                        engine="numpy", dtype=dtype)
   d_dist[:] = np.ones(d_dist.local_shape, dtype=dtype)

   # Create and apply VStack operator
   Sop = pylops.MatrixMult(np.ones((nxl, nxl)), otherdims=(nt, ))
   HOp = pylops_mpi.MPIVStack(ops=[Sop, ])
   y_dist = HOp @ d_dist
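
As usual for MPI programs, a snippet like this is saved to a script and launched over multiple
ranks, for example with ``mpiexec -n 2 python example.py`` (the script name here is just a
placeholder).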

Now we write a code snippet using ``cupy`` arrays, which PyLops-mpi will run on
your GPU:

.. code-block:: python

   import cupy as cp
   import numpy as np
   import pylops
   import pylops_mpi
   from mpi4py import MPI

   # MPI helpers
   comm = MPI.COMM_WORLD
   rank = MPI.COMM_WORLD.Get_rank()
   size = MPI.COMM_WORLD.Get_size()

   # Define the GPU to use on each rank
   cp.cuda.Device(device=rank).use()

   # Create distributed data (broadcast)
   nxl, nt = 20, 20
   dtype = np.float32
   d_dist = pylops_mpi.DistributedArray(global_shape=nxl * nt,
                                        partition=pylops_mpi.Partition.BROADCAST,
                                        engine="cupy", dtype=dtype)
   d_dist[:] = cp.ones(d_dist.local_shape, dtype=dtype)

   # Create and apply VStack operator
   Sop = pylops.MatrixMult(cp.ones((nxl, nxl)), otherdims=(nt, ))
   HOp = pylops_mpi.MPIVStack(ops=[Sop, ])
   y_dist = HOp @ d_dist

The code is almost unchanged apart from the fact that we now use ``cupy`` arrays;
PyLops-mpi will figure this out!
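
As a follow-up (a minimal sketch, not part of the file added in this commit, assuming the
``DistributedArray.asarray`` helper that gathers the global array), the result can be copied
back to host memory when ``numpy`` arrays are needed downstream:

.. code-block:: python

   y = y_dist.asarray()    # gather the global result (still a cupy array)
   y_host = cp.asnumpy(y)  # copy it back to the host as a numpy array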

.. note::

   The CuPy backend is in active development, with many examples not yet in the docs.
   You can find many `other examples <https://github.com/PyLops/pylops_notebooks/tree/master/developement-mpi/Cupy_MPI>`_ in the `PyLops Notebooks repository <https://github.com/PyLops/pylops_notebooks>`_.

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ class and implementing the ``_matvec`` and ``_rmatvec``.

   self
   installation.rst
+  gpu.rst

.. toctree::
   :maxdepth: 2

examples/plot_stacked_array.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
"""
Stacked Array
-=========================
+=============
This example shows how to use the :py:class:`pylops_mpi.StackedDistributedArray`.
This class provides a way to combine and act on multiple :py:class:`pylops_mpi.DistributedArray`
within the same program. This is very useful in scenarios where an array can be logically

pylops_mpi/DistributedArray.py

Lines changed: 24 additions & 1 deletion
@@ -5,6 +5,7 @@
from enum import Enum

from pylops.utils import DTypeLike, NDArray
+from pylops.utils.backend import get_module, get_array_module, get_module_name


class Partition(Enum):
@@ -78,6 +79,8 @@ class DistributedArray:
        Axis along which distribution occurs. Defaults to ``0``.
    local_shapes : :obj:`list`, optional
        List of tuples representing local shapes at each rank.
+    engine : :obj:`str`, optional
+        Engine used to store array (``numpy`` or ``cupy``)
    dtype : :obj:`str`, optional
        Type of elements in input array. Defaults to ``numpy.float64``.
    """
@@ -86,6 +89,7 @@ def __init__(self, global_shape: Union[Tuple, Integral],
                 base_comm: Optional[MPI.Comm] = MPI.COMM_WORLD,
                 partition: Partition = Partition.SCATTER, axis: int = 0,
                 local_shapes: Optional[List[Tuple]] = None,
+                 engine: Optional[str] = "numpy",
                 dtype: Optional[DTypeLike] = np.float64):
        if isinstance(global_shape, Integral):
            global_shape = (global_shape,)
@@ -103,7 +107,8 @@ def __init__(self, global_shape: Union[Tuple, Integral],
        self._check_local_shapes(local_shapes)
        self._local_shape = local_shapes[base_comm.rank] if local_shapes else local_split(global_shape, base_comm,
                                                                                          partition, axis)
-        self._local_array = np.empty(shape=self.local_shape, dtype=self.dtype)
+        self._engine = engine
+        self._local_array = get_module(engine).empty(shape=self.local_shape, dtype=self.dtype)

    def __getitem__(self, index):
        return self.local_array[index]
@@ -160,6 +165,16 @@ def local_shape(self):
        """
        return self._local_shape

+    @property
+    def engine(self):
+        """Engine of the Distributed array
+
+        Returns
+        -------
+        engine : :obj:`str`
+        """
+        return self._engine
+
    @property
    def local_array(self):
        """View of the Local Array
@@ -269,6 +284,7 @@ def to_dist(cls, x: NDArray,
            Axis of Distribution
        local_shapes : :obj:`list`, optional
            Local Shapes at each rank.
+
        Returns
        ----------
        dist_array : :obj:`DistributedArray`
@@ -279,6 +295,7 @@ def to_dist(cls, x: NDArray,
                                      partition=partition,
                                      axis=axis,
                                      local_shapes=local_shapes,
+                                      engine=get_module_name(get_array_module(x)),
                                      dtype=x.dtype)
        if partition == Partition.BROADCAST:
            dist_array[:] = x
@@ -334,6 +351,7 @@ def __neg__(self):
                               partition=self.partition,
                               axis=self.axis,
                               local_shapes=self.local_shapes,
+                               engine=self.engine,
                               dtype=self.dtype)
        arr[:] = -self.local_array
        return arr
@@ -365,6 +383,7 @@ def add(self, dist_array):
                                    dtype=self.dtype,
                                    partition=self.partition,
                                    local_shapes=self.local_shapes,
+                                    engine=self.engine,
                                    axis=self.axis)
        SumArray[:] = self.local_array + dist_array.local_array
        return SumArray
@@ -387,6 +406,7 @@ def multiply(self, dist_array):
                                     dtype=self.dtype,
                                     partition=self.partition,
                                     local_shapes=self.local_shapes,
+                                     engine=self.engine,
                                     axis=self.axis)
        if isinstance(dist_array, DistributedArray):
            # multiply two DistributedArray
@@ -480,6 +500,7 @@ def conj(self):
                                partition=self.partition,
                                axis=self.axis,
                                local_shapes=self.local_shapes,
+                                engine=self.engine,
                                dtype=self.dtype)
        conj[:] = self.local_array.conj()
        return conj
@@ -492,6 +513,7 @@ def copy(self):
                               partition=self.partition,
                               axis=self.axis,
                               local_shapes=self.local_shapes,
+                               engine=self.engine,
                               dtype=self.dtype)
        arr[:] = self.local_array
        return arr
@@ -514,6 +536,7 @@ def ravel(self, order: Optional[str] = "C"):
        arr = DistributedArray(global_shape=np.prod(self.global_shape),
                               local_shapes=local_shapes,
                               partition=self.partition,
+                               engine=self.engine,
                               dtype=self.dtype)
        local_array = np.ravel(self.local_array, order=order)
        x = local_array.copy()
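
To make the role of the newly imported backend helpers concrete, here is a minimal sketch
(not part of this commit, assuming a working ``pylops`` installation; a ``cupy`` array would
be handled in exactly the same way when ``cupy`` is installed):

.. code-block:: python

   import numpy as np
   from pylops.utils.backend import get_module, get_array_module, get_module_name

   # get_module maps an engine name to the corresponding array module
   ncp = get_module("numpy")                     # -> the numpy module
   x = ncp.empty(10, dtype=np.float32)           # same call used in __init__ above

   # get_array_module / get_module_name recover the engine name from an existing
   # array, which is how to_dist infers engine="numpy" or engine="cupy"
   print(get_module_name(get_array_module(x)))   # -> "numpy"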

pylops_mpi/LinearOperator.py

Lines changed: 2 additions & 0 deletions
@@ -81,6 +81,7 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
                             base_comm=self.base_comm,
                             partition=x.partition,
                             axis=x.axis,
+                             engine=x.engine,
                             dtype=self.dtype)
        y[:] = self.Op._matvec(x.local_array)
        return y
@@ -117,6 +118,7 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
                             base_comm=self.base_comm,
                             partition=x.partition,
                             axis=x.axis,
+                             engine=x.engine,
                             dtype=self.dtype)
        y[:] = self.Op._rmatvec(x.local_array)
        return y
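
The effect of these two one-line changes is that the output of a wrapped operator now lives on
the same backend as its input. A minimal sketch of the intended behaviour (not part of this
commit, illustrated with the MPIVStack operator that the new gpu.rst example above relies on;
run it under ``mpiexec`` with any number of ranks):

.. code-block:: python

   import numpy as np
   import pylops
   import pylops_mpi

   n = 10
   x = pylops_mpi.DistributedArray(global_shape=n,
                                   partition=pylops_mpi.Partition.BROADCAST,
                                   engine="numpy", dtype=np.float32)
   x[:] = np.ones(x.local_shape, dtype=np.float32)

   # the output DistributedArray inherits the engine of the input
   Op = pylops_mpi.MPIVStack(ops=[pylops.MatrixMult(np.ones((n, n), dtype=np.float32))])
   y = Op @ x
   print(y.engine)   # -> "numpy"; with engine="cupy" inputs this would be "cupy"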

pylops_mpi/basicoperators/BlockDiag.py

Lines changed: 10 additions & 5 deletions
@@ -3,8 +3,9 @@
from mpi4py import MPI
from typing import Optional, Sequence

-from pylops.utils import DTypeLike
from pylops import LinearOperator
+from pylops.utils import DTypeLike
+from pylops.utils.backend import get_module

from pylops_mpi import MPILinearOperator, MPIStackedLinearOperator
from pylops_mpi import DistributedArray, StackedDistributedArray
@@ -113,22 +114,26 @@ def __init__(self, ops: Sequence[LinearOperator],

    @reshaped(forward=True, stacking=True)
    def _matvec(self, x: DistributedArray) -> DistributedArray:
-        y = DistributedArray(global_shape=self.shape[0], local_shapes=self.local_shapes_n, dtype=self.dtype)
+        ncp = get_module(x.engine)
+        y = DistributedArray(global_shape=self.shape[0], local_shapes=self.local_shapes_n,
+                             engine=x.engine, dtype=self.dtype)
        y1 = []
        for iop, oper in enumerate(self.ops):
            y1.append(oper.matvec(x.local_array[self.mmops[iop]:
                                                self.mmops[iop + 1]]))
-        y[:] = np.concatenate(y1)
+        y[:] = ncp.concatenate(y1)
        return y

    @reshaped(forward=False, stacking=True)
    def _rmatvec(self, x: DistributedArray) -> DistributedArray:
-        y = DistributedArray(global_shape=self.shape[1], local_shapes=self.local_shapes_m, dtype=self.dtype)
+        ncp = get_module(x.engine)
+        y = DistributedArray(global_shape=self.shape[1], local_shapes=self.local_shapes_m,
+                             engine=x.engine, dtype=self.dtype)
        y1 = []
        for iop, oper in enumerate(self.ops):
            y1.append(oper.rmatvec(x.local_array[self.nnops[iop]:
                                                 self.nnops[iop + 1]]))
-        y[:] = np.concatenate(y1)
+        y[:] = ncp.concatenate(y1)
        return y

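For completeness, a minimal usage sketch of the block-diagonal operator after this change
(not part of this commit, assuming the class in this file is exposed as
``pylops_mpi.MPIBlockDiag`` and using ``numpy`` local arrays; the same code runs with
``engine="cupy"`` and ``cupy`` inputs once a GPU is selected on each rank):

.. code-block:: python

   import numpy as np
   import pylops
   import pylops_mpi
   from mpi4py import MPI

   rank = MPI.COMM_WORLD.Get_rank()
   size = MPI.COMM_WORLD.Get_size()

   # each rank owns one diagonal block acting on its local chunk of x
   n = 5
   x = pylops_mpi.DistributedArray(global_shape=n * size,
                                   engine="numpy", dtype=np.float32)
   x[:] = np.ones(x.local_shape, dtype=np.float32)

   BDiag = pylops_mpi.MPIBlockDiag(ops=[pylops.Diagonal((rank + 1) * np.ones(n, dtype=np.float32))])
   y = BDiag @ x
   print(rank, y.local_array)   # rank r prints an array of (r + 1)'s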