diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d1e691..d515d1e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ sofa_find_package(Sofa.GL QUIET)
 set(PLUGIN_SPH_SRC_DIR src/SofaSphFluid)
 set(HEADER_FILES
     ${PLUGIN_SPH_SRC_DIR}/config.h.in
+    ${PLUGIN_SPH_SRC_DIR}/initSPHFluid.h
     ${PLUGIN_SPH_SRC_DIR}/ParticleSink.h
     ${PLUGIN_SPH_SRC_DIR}/ParticleSink.inl
     ${PLUGIN_SPH_SRC_DIR}/ParticleSource.h
@@ -74,3 +75,5 @@ sofa_create_package_with_targets(
     INCLUDE_INSTALL_DIR ${PROJECT_NAME}
     RELOCATABLE "plugins"
     )
+
+sofa_add_subdirectory(plugin extensions/CUDA SofaSphFluid.CUDA)
diff --git a/extensions/CUDA/CMakeLists.txt b/extensions/CUDA/CMakeLists.txt
new file mode 100644
index 0000000..8acc8bd
--- /dev/null
+++ b/extensions/CUDA/CMakeLists.txt
@@ -0,0 +1,48 @@
+cmake_minimum_required(VERSION 3.12)
+project(SofaSphFluid.CUDA CUDA CXX)
+
+set(HEADER_FILES
+    src/SofaSphFluid/CUDA/init.h
+    src/SofaSphFluid/CUDA/config.h.in
+
+    src/SofaSphFluid/CUDA/CudaParticleSource.h
+    src/SofaSphFluid/CUDA/CudaParticleSource.inl
+    src/SofaSphFluid/CUDA/CudaSPHFluidForceField.h
+    src/SofaSphFluid/CUDA/CudaSPHFluidForceField.inl
+    src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.h
+    src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.inl
+    src/SofaSphFluid/CUDA/CudaSpatialGridContainer.h
+    src/SofaSphFluid/CUDA/CudaSpatialGridContainer.inl
+)
+
+set(SOURCE_FILES
+    src/SofaSphFluid/CUDA/init.cpp
+
+    src/SofaSphFluid/CUDA/CudaParticleSource.cpp
+    src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cpp
+    src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cpp
+    src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cpp
+)
+
+set(CUDA_SOURCES
+    src/SofaSphFluid/CUDA/CudaParticleSource.cu
+    src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cu
+    src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cu
+    src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cu
+)
+
+sofa_find_package(SofaSphFluid REQUIRED)
+sofa_find_package(SofaCUDA REQUIRED)
+
+add_library(${PROJECT_NAME} SHARED ${HEADER_FILES} ${SOURCE_FILES} ${CUDA_SOURCES})
+target_link_libraries(${PROJECT_NAME} SofaSphFluid)
+target_link_libraries(${PROJECT_NAME} SofaCUDA)
+
+sofa_create_package_with_targets(
+    PACKAGE_NAME ${PROJECT_NAME}
+    PACKAGE_VERSION ${Sofa_VERSION}
+    TARGETS ${PROJECT_NAME} AUTO_SET_TARGET_PROPERTIES
+    INCLUDE_SOURCE_DIR "src"
+    INCLUDE_INSTALL_DIR "${PROJECT_NAME}"
+    RELOCATABLE "plugins"
+)
diff --git a/extensions/CUDA/SofaSphFluid.CUDAConfig.cmake.in b/extensions/CUDA/SofaSphFluid.CUDAConfig.cmake.in
new file mode 100644
index 0000000..67b3bbe
--- /dev/null
+++ b/extensions/CUDA/SofaSphFluid.CUDAConfig.cmake.in
@@ -0,0 +1,9 @@
+# CMake package configuration file for the SofaSphFluid.CUDA library
+
+@PACKAGE_GUARD@
+@PACKAGE_INIT@
+
+find_package(SofaSphFluid QUIET REQUIRED)
+find_package(SofaCUDA QUIET REQUIRED)
+
+check_required_components(SofaSphFluid.CUDA)
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cpp b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cpp
new file mode 100644
index 0000000..a7d1714
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cpp
@@ -0,0 +1,56 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include "CudaParticleSource.inl"
+#include <sofa/core/ObjectFactory.h>
+#include <sofa/core/behavior/ProjectiveConstraintSet.h>
+#include <SofaSphFluid/ParticleSource.inl>
+
+namespace sofa
+{
+
+
+namespace core::behavior
+{
+template class ProjectiveConstraintSet<gpu::cuda::CudaVec3fTypes>;
+#ifdef SOFA_GPU_CUDA_DOUBLE
+template class ProjectiveConstraintSet<gpu::cuda::CudaVec3dTypes>;
+#endif
+} // namespace core::behavior
+
+
+namespace gpu::cuda
+{
+
+
+
+int ParticleSourceCudaClass = core::RegisterObject("Supports GPU-side computations using CUDA")
+        .add< component::misc::ParticleSource<CudaVec3fTypes> >()
+#ifdef SOFA_GPU_CUDA_DOUBLE
+        .add< component::misc::ParticleSource<CudaVec3dTypes> >()
+#endif // SOFA_GPU_CUDA_DOUBLE
+        ;
+
+} // namespace gpu::cuda
+
+
+} // namespace sofa
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cu b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cu
new file mode 100644
index 0000000..bca3121
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.cu
@@ -0,0 +1,132 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaCommon.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaMath.h>
+#include "cuda.h"
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+namespace sofa
+{
+namespace gpu
+{
+namespace cuda
+{
+#endif
+
+extern "C"
+{
+
+    void ParticleSourceCuda3f_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, float fx, float fy, float fz);
+    void ParticleSourceCuda3f_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, float fx, float fy, float fz);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+    void ParticleSourceCuda3d_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, double fx, double fy, double fz);
+    void ParticleSourceCuda3d_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, double fx, double fy, double fz);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+}
+
+//////////////////////
+// GPU-side methods //
+//////////////////////
+
+
+template<class real>
+__global__ void ParticleSourceCuda3t_fillValues_kernel(unsigned int totalsize, unsigned int subsetsize, real* dest, const unsigned int* indices, real fx, real fy, real fz)
+{
+    unsigned int index0 = blockIdx.x * BSIZE;
+    unsigned int index = index0+threadIdx.x;
+    if (index < subsetsize)
+    {
+        unsigned int dindex = indices[index];
+        if (dindex < totalsize)
+        {
+            unsigned int dindex3 = dindex * 3;
+            dest[dindex3+0] = fx;
+            dest[dindex3+1] = fy;
+            dest[dindex3+2] = fz;
+        }
+    }
+}
+
+template<class real>
+__global__ void ParticleSourceCuda3t_copyValuesWithOffset_kernel(unsigned int totalsize, unsigned int subsetsize, real* dest, const unsigned int* indices, const real* src, real fx, real fy, real fz)
+{
+    unsigned int index0 = blockIdx.x * BSIZE;
+    unsigned int index = index0+threadIdx.x;
+    if (index < subsetsize)
+    {
+        unsigned int dindex = indices[index];
+        if (dindex < totalsize)
+        {
+            unsigned int dindex3 = dindex * 3;
+            unsigned int index3 = index * 3;
+            dest[dindex3+0] = src[index3+0]+fx;
+            dest[dindex3+1] = src[index3+1]+fy;
+            dest[dindex3+2] = src[index3+2]+fz;
+        }
+    }
+}
+
+//////////////////////
+// CPU-side methods //
+//////////////////////
+
+
+void ParticleSourceCuda3f_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, float fx, float fy, float fz)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((subsetsize+BSIZE-1)/BSIZE,1);
+    {ParticleSourceCuda3t_fillValues_kernel<float><<< grid, threads >>>(totalsize, subsetsize, (float*)dest, (const unsigned int*)indices, fx, fy, fz); mycudaDebugError("ParticleSourceCuda3t_fillValues_kernel<float>");}
+}
+
+void ParticleSourceCuda3f_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, float fx, float fy, float fz)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((subsetsize+BSIZE-1)/BSIZE,1);
+    {ParticleSourceCuda3t_copyValuesWithOffset_kernel<float><<< grid, threads >>>(totalsize, subsetsize, (float*)dest, (const unsigned int*)indices, (const float*)src, fx, fy, fz); mycudaDebugError("ParticleSourceCuda3t_copyValuesWithOffset_kernel<float>");}
+}
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+void ParticleSourceCuda3d_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, double fx, double fy, double fz)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((subsetsize+BSIZE-1)/BSIZE,1);
+    {ParticleSourceCuda3t_fillValues_kernel<double><<< grid, threads >>>(totalsize, subsetsize, (double*)dest, (const unsigned int*)indices, fx, fy, fz); mycudaDebugError("ParticleSourceCuda3t_fillValues_kernel<double>");}
+}
+
+void ParticleSourceCuda3d_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, double fx, double fy, double fz)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((subsetsize+BSIZE-1)/BSIZE,1);
+    {ParticleSourceCuda3t_copyValuesWithOffset_kernel<double><<< grid, threads >>>(totalsize, subsetsize, (double*)dest, (const unsigned int*)indices, (const double*)src, fx, fy, fz); mycudaDebugError("ParticleSourceCuda3t_copyValuesWithOffset_kernel<double>");}
+}
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+} // namespace cuda
+} // namespace gpu
+} // namespace sofa
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.h b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.h
new file mode 100644
index 0000000..37847f0
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDAPARTICLESOURCE_H
+#define SOFA_GPU_CUDA_CUDAPARTICLESOURCE_H
+
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.inl b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.inl
new file mode 100644
index 0000000..9a66fdb
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticleSource.inl
@@ -0,0 +1,138 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDAPARTICLESOURCE_INL
+#define SOFA_GPU_CUDA_CUDAPARTICLESOURCE_INL
+
+#include "CudaParticleSource.h"
+#include <SofaCUDA/sofa/gpu/cuda/mycuda.h>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+extern "C"
+{
+
+    void ParticleSourceCuda3f_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, float fx, float fy, float fz);
+    void ParticleSourceCuda3f_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, float fx, float fy, float fz);
+
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+    void ParticleSourceCuda3d_fillValues(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, double fx, double fy, double fz);
+    void ParticleSourceCuda3d_copyValuesWithOffset(unsigned int totalsize, unsigned int subsetsize, void* dest, const void* indices, const void* src, double fx, double fy, double fz);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+}
+
+} // namespace gpu::cuda
+
+
+namespace component::misc
+{
+
+using namespace gpu::cuda;
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3fTypes>::projectResponse(VecDeriv& res)
+// {
+//     if (!this->mstate) return;
+//     const VecIndex& lastparticles = this->lastparticles.getValue();
+//     if (lastparticles.empty()) return;
+//     double time = getContext()->getTime();
+//     if (time < f_start.getValue() || time > f_stop.getValue()) return;
+//     // constraint the last values
+//     //mycudaMemset(((Deriv*)res.deviceWrite())+lastparticles[0], 0, lastparticles.size()*sizeof(Deriv));
+//     ParticleSourceCuda3f_fillValues(res.size(), lastparticles.size(), res.deviceWrite(), lastparticles.deviceRead(), 0, 0, 0);
+// }
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3fTypes>::projectVelocity(VecDeriv& res)
+// {
+//     if (!this->mstate) return;
+//     const VecIndex& lastparticles = this->lastparticles.getValue();
+//     if (lastparticles.empty()) return;
+//     double time = getContext()->getTime();
+//     if (time < f_start.getValue() || time > f_stop.getValue()) return;
+//     // constraint the last values
+//     Deriv vel = f_velocity.getValue();
+// #if 1
+//     //mycudaMemset(((Deriv*)res.deviceWrite())+lastparticles[0], 0, lastparticles.size()*sizeof(Coord));
+//     ParticleSourceCuda3f_fillValues(res.size(), lastparticles.size(), res.deviceWrite(), lastparticles.deviceRead(), vel[0], vel[1], vel[2]);
+// #else
+//     for (unsigned int s=0; s<lastparticles.size(); s++)
+//         if ( lastparticles[s] < res.size() )
+//             res[lastparticles[s]] = vel;
+// #endif
+// }
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3fTypes>::projectPosition(VecCoord& res)
+// {
+//     if (!this->mstate) return;
+//     const VecIndex& lastparticles = this->lastparticles.getValue();
+//     if (lastparticles.empty()) return;
+//     double time = getContext()->getTime();
+//     if (time < f_start.getValue() || time > f_stop.getValue()) return;
+//     // constraint the last values
+//     Deriv vel = f_velocity.getValue();
+//     vel *= (time-lasttime);
+//     //mycudaMemset(((Deriv*)res.deviceWrite())+lastparticles[0], 0, lastparticles.size()*sizeof(Coord));
+//     ParticleSourceCuda3f_copyValuesWithOffset(res.size(), lastparticles.size(), res.deviceWrite(), lastparticles.deviceRead(), lastpos.deviceRead(), vel[0], vel[1], vel[2]);
+// }
+
+
+// #ifdef SOFA_GPU_CUDA_DOUBLE
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3dTypes>::projectResponse(VecDeriv& res)
+// {
+//     if (!this->mstate) return;
+//     const VecIndex& lastparticles = this->lastparticles.getValue();
+//     if (lastparticles.empty()) return;
+//     double time = getContext()->getTime();
+//     if (time < f_start.getValue() || time > f_stop.getValue()) return;
+//     // constraint the last values
+//     mycudaMemset(((Deriv*)res.deviceWrite())+lastparticles[0], 0, lastparticles.size()*sizeof(Coord));
+// }
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3dTypes>::projectVelocity(VecDeriv& /*res*/)
+// {
+// }
+
+// template <>
+// void ParticleSource<gpu::cuda::CudaVec3dTypes>::projectPosition(VecDeriv& /*res*/)
+// {
+// }
+
+// #endif // SOFA_GPU_CUDA_DOUBLE
+
+} // namespace component::misc
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cpp b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cpp
new file mode 100644
index 0000000..68d2d48
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cpp
@@ -0,0 +1,43 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include "CudaParticlesRepulsionForceField.inl"
+#include <sofa/core/ObjectFactory.h>
+#include <sofa/core/behavior/ForceField.inl>
+#include <SofaSphFluid/ParticlesRepulsionForceField.inl>
+
+
+namespace sofa::gpu::cuda
+{
+
+int ParticlesRepulsionForceFieldCudaClass = core::RegisterObject("Supports GPU-side computations using CUDA")
+        .add< component::forcefield::ParticlesRepulsionForceField<CudaVec3fTypes> >()
+#ifdef SOFA_GPU_CUDA_DOUBLE
+        .add< component::forcefield::ParticlesRepulsionForceField<CudaVec3dTypes> >()
+#endif // SOFA_GPU_CUDA_DOUBLE
+        ;
+
+} // namespace sofa::gpu::cuda
+
+
+
+
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cu b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cu
new file mode 100644
index 0000000..cafd2e8
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.cu
@@ -0,0 +1,320 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaCommon.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaMath.h>
+#include "cuda.h"
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+namespace sofa
+{
+namespace gpu
+{
+namespace cuda
+{
+#endif
+
+template<class real>
+class GPURepulsion
+{
+public:
+    real d;
+    real d2;
+    real stiffness;
+    real damping;
+};
+
+typedef GPURepulsion<float> GPURepulsion3f;
+typedef GPURepulsion<double> GPURepulsion3d;
+
+extern "C"
+{
+    void ParticlesRepulsionForceFieldCuda3f_addForce (unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* f, const void* x, const void* v );
+    void ParticlesRepulsionForceFieldCuda3f_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* f, const void* x, const void* dx);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+    void ParticlesRepulsionForceFieldCuda3d_addForce (unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* f, const void* x, const void* v );
+    void ParticlesRepulsionForceFieldCuda3d_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* f, const void* x, const void* dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+}
+
+//////////////////////
+// GPU-side methods //
+//////////////////////
+
+template<class real>
+__device__ void ParticlesRepulsionCalcForce(CudaVec3<real> x1, CudaVec3<real> v1, CudaVec3<real> x2, CudaVec3<real> v2, CudaVec3<real>& force, GPURepulsion<real>& rep)
+{
+    CudaVec3<real> n = x2-x1;
+    real d2 = norm2(n);
+    if (d2 < rep.d2)
+    {
+        CudaVec3<real> vi = v2-v1;
+        real inv_d = rsqrtf(d2);
+        n *= inv_d;
+        real d = d2*inv_d - rep.d;
+        real forceIntensity = rep.stiffness*d;
+        real dampingIntensity = rep.damping*d;
+        force += n*forceIntensity - vi*dampingIntensity;
+    }
+}
+
+template<class real>
+__device__ void ParticlesRepulsionCalcDForce(CudaVec3<real> x1, CudaVec3<real> dx1, CudaVec3<real> x2, CudaVec3<real> dx2, CudaVec3<real>& dforce, GPURepulsion<real>& rep)
+{
+    CudaVec3<real> n = x2-x1;
+    real d2 = norm2(n);
+    if (d2 < rep.d2)
+    {
+        CudaVec3<real> dxi = dx2-dx1;
+        //real inv_d = rsqrtf(d2);
+        //n *= inv_d;
+        //dforce = n * (rep.stiffness * dot(dxi, n));
+
+        //dforce = (n * (rep.stiffness * dot(dxi, n)))/d2;
+
+        dforce += n * (real)__fdividef((rep.stiffness * dot(dxi, n)),d2);
+    }
+}
+
+template<class real>
+__global__ void ParticlesRepulsionForceFieldCuda3t_addForce_kernel(int size, const int *cells, const int *cellGhost, GPURepulsion<real> repulsion, real* f, const real* x, const real* v)
+{
+    __shared__ int2 range;
+    __shared__ int ghost;
+    __shared__ real temp_x[BSIZE*3];
+    __shared__ real temp_v[BSIZE*3];
+    int tx3 = threadIdx.x * 3;
+    for (int cell = blockIdx.x; cell < size; cell += gridDim.x)
+    {
+        if (!threadIdx.x)
+        {
+            //range = *(const int2*)(cells+cell);
+            range.x = cells[cell];
+            range.y = cells[cell+1];
+            range.y &= ~(1U<<31);
+            ghost = cellGhost[cell];
+        }
+        __syncthreads();
+        if (range.x <= 0) continue; // no actual particle in this cell
+        for (int px0 = range.x; px0 < ghost; px0 += BSIZE)
+        {
+            int px = px0 + threadIdx.x;
+            CudaVec3<real> xi;
+            CudaVec3<real> vi;
+            CudaVec3<real> force;
+            int index;
+            if (px < range.y)
+            {
+                index = cells[px];
+                xi = ((const CudaVec3<real>*)x)[index];
+                temp_x[tx3  ] = xi.x;
+                temp_x[tx3+1] = xi.y;
+                temp_x[tx3+2] = xi.z;
+                vi = ((const CudaVec3<real>*)v)[index];
+                temp_v[tx3  ] = vi.x;
+                temp_v[tx3+1] = vi.y;
+                temp_v[tx3+2] = vi.z;
+            }
+            __syncthreads();
+            if (px < ghost)
+            {
+                // actual particle -> compute interactions
+                force = CudaVec3<real>::make(0,0,0);
+                int np = min(range.y-px0,BSIZE);
+                for (int i=0; i < np; ++i)
+                {
+                    if (i != threadIdx.x)
+                        ParticlesRepulsionCalcForce(xi, vi, ((const CudaVec3<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], force, repulsion);
+                }
+            }
+            __syncthreads();
+            // loop through other groups of particles
+            for (int py0 = range.x; py0 < range.y; py0 += BSIZE)
+            {
+                if (py0 == px0) continue;
+                int py = py0 + threadIdx.x;
+                if (py < range.y)
+                {
+                    int index2 = cells[py];
+                    CudaVec3<real> xj = ((const CudaVec3<real>*)x)[index2];
+                    temp_x[tx3  ] = xj.x;
+                    temp_x[tx3+1] = xj.y;
+                    temp_x[tx3+2] = xj.z;
+                    CudaVec3<real> vj = ((const CudaVec3<real>*)v)[index2];
+                    temp_v[tx3  ] = vj.x;
+                    temp_v[tx3+1] = vj.y;
+                    temp_v[tx3+2] = vj.z;
+                }
+                __syncthreads();
+                if (px < ghost)
+                {
+                    // actual particle -> compute interactions
+                    int np = min(range.y-py0,BSIZE);
+                    for (int i=0; i < np; ++i)
+                    {
+                        ParticlesRepulsionCalcForce(xi, vi, ((const CudaVec3<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], force, repulsion);
+                    }
+                }
+                __syncthreads();
+            }
+            if (px < ghost)
+            {
+                // actual particle -> write computed force
+                ((CudaVec3<real>*)f)[index] += force;
+            }
+        }
+    }
+}
+
+template<class real>
+__global__ void ParticlesRepulsionForceFieldCuda3t_addDForce_kernel(int size, const int *cells, const int *cellGhost, GPURepulsion<real> repulsion, real* df, const real* x, const real* dx)
+{
+    __shared__ int2 range;
+    __shared__ int ghost;
+    __shared__ real temp_x[BSIZE*3];
+    __shared__ real temp_dx[BSIZE*3];
+    int tx3 = threadIdx.x * 3;
+    for (int cell = blockIdx.x; cell < size; cell += gridDim.x)
+    {
+        if (!threadIdx.x)
+        {
+            //range = *(const int2*)(cells+cell);
+            range.x = cells[cell];
+            range.y = cells[cell+1];
+            range.y &= ~(1U<<31);
+            ghost = cellGhost[cell];
+        }
+        __syncthreads();
+        if (range.x <= 0) continue; // no actual particle in this cell
+        for (int px0 = range.x; px0 < ghost; px0 += BSIZE)
+        {
+            int px = px0 + threadIdx.x;
+            CudaVec3<real> xi;
+            CudaVec3<real> dxi;
+            CudaVec3<real> dforce;
+            int index;
+            if (px < range.y)
+            {
+                index = cells[px];
+                xi = ((const CudaVec3<real>*)x)[index];
+                temp_x[tx3  ] = xi.x;
+                temp_x[tx3+1] = xi.y;
+                temp_x[tx3+2] = xi.z;
+                dxi = ((const CudaVec3<real>*)dx)[index];
+                temp_dx[tx3  ] = dxi.x;
+                temp_dx[tx3+1] = dxi.y;
+                temp_dx[tx3+2] = dxi.z;
+            }
+            __syncthreads();
+            if (px < ghost)
+            {
+                // actual particle -> compute interactions
+                dforce = CudaVec3<real>::make(0,0,0);
+                int np = min(range.y-px0,BSIZE);
+                for (int i=0; i < np; ++i)
+                {
+                    if (i != threadIdx.x)
+                        ParticlesRepulsionCalcDForce(xi, dxi, ((const CudaVec3<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_dx)[i], dforce, repulsion);
+                }
+            }
+            __syncthreads();
+            // loop through other groups of particles
+            for (int py0 = range.x; py0 < range.y; py0 += BSIZE)
+            {
+                if (py0 == px0) continue;
+                int py = py0 + threadIdx.x;
+                if (py < range.y)
+                {
+                    int index2 = cells[py];
+                    CudaVec3<real> xj = ((const CudaVec3<real>*)x)[index2];
+                    temp_x[tx3  ] = xj.x;
+                    temp_x[tx3+1] = xj.y;
+                    temp_x[tx3+2] = xj.z;
+                    CudaVec3<real> dxj = ((const CudaVec3<real>*)dx)[index2];
+                    temp_dx[tx3  ] = dxj.x;
+                    temp_dx[tx3+1] = dxj.y;
+                    temp_dx[tx3+2] = dxj.z;
+                }
+                __syncthreads();
+                if (px < ghost)
+                {
+                    // actual particle -> compute interactions
+                    int np = min(range.y-py0,BSIZE);
+                    for (int i=0; i < np; ++i)
+                    {
+                        ParticlesRepulsionCalcDForce(xi, dxi, ((const CudaVec3<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_dx)[i], dforce, repulsion);
+                    }
+                }
+                __syncthreads();
+            }
+            if (px < ghost)
+            {
+                // actual particle -> write computed force
+                ((CudaVec3<real>*)df)[index] += dforce;
+            }
+        }
+    }
+}
+
+//////////////////////
+// CPU-side methods //
+//////////////////////
+
+void ParticlesRepulsionForceFieldCuda3f_addForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* f, const void* x, const void* v)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60,1);
+    {ParticlesRepulsionForceFieldCuda3t_addForce_kernel<float><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *repulsion, (float*)f, (const float*)x, (const float*)v); mycudaDebugError("ParticlesRepulsionForceFieldCuda3t_addForce_kernel<float>");}
+}
+
+void ParticlesRepulsionForceFieldCuda3f_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* df, const void* x, const void* dx)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60/BSIZE,1);
+    {ParticlesRepulsionForceFieldCuda3t_addDForce_kernel<float><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *repulsion, (float*)df, (const float*)x, (const float*)dx); mycudaDebugError("ParticlesRepulsionForceFieldCuda3t_addDForce_kernel<float>");}
+}
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+void ParticlesRepulsionForceFieldCuda3d_addForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* f, const void* x, const void* v)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60/BSIZE,1);
+    {ParticlesRepulsionForceFieldCuda3t_addForce_kernel<double><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *repulsion, (double*)f, (const double*)x, (const double*)v); mycudaDebugError("ParticlesRepulsionForceFieldCuda3t_addForce_kernel<double>");}
+}
+
+void ParticlesRepulsionForceFieldCuda3d_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* df, const void* x, const void* dx)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60/BSIZE,1);
+    {ParticlesRepulsionForceFieldCuda3t_addDForce_kernel<double><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *repulsion, (double*)df, (const double*)x, (const double*)dx); mycudaDebugError("ParticlesRepulsionForceFieldCuda3t_addDForce_kernel<double>");}
+}
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+} // namespace cuda
+} // namespace gpu
+} // namespace sofa
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.h b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.h
new file mode 100644
index 0000000..7631366
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDAPARTICLESREPULSIONFORCEFIELD_H
+#define SOFA_GPU_CUDA_CUDAPARTICLESREPULSIONFORCEFIELD_H
+
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include <SofaSphFluid/ParticlesRepulsionForceField.h>
+#include <SofaSphFluid/CUDA/CudaSpatialGridContainer.h>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+template<class real>
+struct GPURepulsion
+{
+    real d;
+    real d2;
+    real stiffness;
+    real damping;
+};
+
+typedef GPURepulsion<float> GPURepulsion3f;
+typedef GPURepulsion<double> GPURepulsion3d;
+
+} // namespace gpu::cuda
+
+
+namespace component::forcefield
+{
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3fTypes>::addForce(const core::MechanicalParams* mparams, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v);
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3fTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3dTypes>::addForce(const core::MechanicalParams* mparams, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v);
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3dTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+} // namespace component::forcefield
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.inl b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.inl
new file mode 100644
index 0000000..2931e07
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaParticlesRepulsionForceField.inl
@@ -0,0 +1,168 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDAPARTICLESREPULSIONFORCEFIELD_INL
+#define SOFA_GPU_CUDA_CUDAPARTICLESREPULSIONFORCEFIELD_INL
+
+#include "CudaParticlesRepulsionForceField.h"
+#include <SofaSphFluid/ParticlesRepulsionForceField.inl>
+//#include <SofaSphFluid/CUDA/CudaSpatialGridContainer.inl>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+extern "C"
+{
+
+    void ParticlesRepulsionForceFieldCuda3f_addForce (unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* f, const void* x, const void* v );
+    void ParticlesRepulsionForceFieldCuda3f_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3f* repulsion, void* f, const void* x, const void* dx);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+    void ParticlesRepulsionForceFieldCuda3d_addForce (unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* f, const void* x, const void* v );
+    void ParticlesRepulsionForceFieldCuda3d_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPURepulsion3d* repulsion, void* f, const void* x, const void* dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+}
+
+} // namespace gpu::cuda
+
+
+namespace component::forcefield
+{
+
+using namespace gpu::cuda;
+
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3fTypes>::addForce(const core::MechanicalParams* /*mparams*/, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v)
+{
+    if (grid == NULL) return;
+
+    VecDeriv& f = *d_f.beginEdit();
+    const VecCoord& x = d_x.getValue();
+    const VecDeriv& v = d_v.getValue();
+
+    grid->updateGrid(x);
+    GPURepulsion3f repulsion;
+    repulsion.d = distance.getValue();
+    repulsion.d2 = repulsion.d*repulsion.d;
+    repulsion.stiffness = stiffness.getValue();
+    repulsion.damping = damping.getValue();
+    f.resize(x.size());
+    Grid::Grid* g = grid->getGrid();
+    ParticlesRepulsionForceFieldCuda3f_addForce(
+        g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+        &repulsion, f.deviceWrite(), x.deviceRead(), v.deviceRead());
+
+    d_f.endEdit();
+}
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3fTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx)
+{
+    if (grid == NULL) return;
+
+    VecDeriv& df = *d_df.beginEdit();
+    const VecDeriv& dx = d_dx.getValue();
+    Real kFactor = (Real)sofa::core::mechanicalparams::kFactorIncludingRayleighDamping(mparams,this->rayleighStiffness.getValue());
+    Real bFactor = (Real)sofa::core::mechanicalparams::bFactor(mparams);
+
+    const VecCoord& x = this->mstate->read(sofa::core::vec_id::read_access::position)->getValue();
+    GPURepulsion3f repulsion;
+    repulsion.d = distance.getValue();
+    repulsion.d2 = repulsion.d*repulsion.d;
+    repulsion.stiffness = (float)(stiffness.getValue()*kFactor);
+    repulsion.damping = (float)(damping.getValue()*bFactor);
+    df.resize(dx.size());
+    Grid::Grid* g = grid->getGrid();
+    ParticlesRepulsionForceFieldCuda3f_addDForce(
+        g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+        &repulsion, df.deviceWrite(), x.deviceRead(), dx.deviceRead());
+
+    d_df.endEdit();
+}
+
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3dTypes>::addForce(const core::MechanicalParams* /*mparams*/, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v)
+{
+    if (grid == NULL) return;
+
+    VecDeriv& f = *d_f.beginEdit();
+    const VecCoord& x = d_x.getValue();
+    const VecDeriv& v = d_v.getValue();
+
+    grid->updateGrid(x);
+    GPURepulsion3d repulsion;
+    repulsion.d = distance.getValue();
+    repulsion.d2 = repulsion.d*repulsion.d;
+    repulsion.stiffness = stiffness.getValue();
+    repulsion.damping = damping.getValue();
+    f.resize(x.size());
+    Grid::Grid* g = grid->getGrid();
+    ParticlesRepulsionForceFieldCuda3d_addForce(
+        g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+        &repulsion, f.deviceWrite(), x.deviceRead(), v.deviceRead());
+
+    d_f.endEdit();
+}
+
+template <>
+void ParticlesRepulsionForceField<gpu::cuda::CudaVec3dTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx)
+{
+    if (grid == NULL) return;
+
+    VecDeriv& df = *d_df.beginEdit();
+    const VecDeriv& dx = d_dx.getValue();
+    Real kFactor = (Real)sofa::core::mechanicalparams::kFactorIncludingRayleighDamping(mparams,this->rayleighStiffness.getValue());
+    Real bFactor = (Real)sofa::core::mechanicalparams::bFactor(mparams);
+
+    const VecCoord& x = this->mstate->read(sofa::core::vec_id::read_access::position)->getValue();
+    GPURepulsion3d repulsion;
+    repulsion.d = distance.getValue();
+    repulsion.d2 = repulsion.d*repulsion.d;
+    repulsion.stiffness = stiffness.getValue()*kFactor;
+    repulsion.damping = damping.getValue()*bFactor;
+    df.resize(dx.size());
+    Grid::Grid* g = grid->getGrid();
+    ParticlesRepulsionForceFieldCuda3d_addDForce(
+        g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+        &repulsion, df.deviceWrite(), x.deviceRead(), dx.deviceRead());
+
+    d_df.endEdit();
+}
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+
+} // namespace component::forcefield
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cpp b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cpp
new file mode 100644
index 0000000..c6dd42f
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cpp
@@ -0,0 +1,43 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include "CudaSPHFluidForceField.inl"
+#include <sofa/core/ObjectFactory.h>
+#include <sofa/core/behavior/ForceField.inl>
+#include <SofaSphFluid/SPHFluidForceField.inl>
+
+
+namespace sofa::gpu::cuda
+{
+
+int SPHFluidForceFieldCudaClass = core::RegisterObject("Supports GPU-side computations using CUDA")
+        .add< component::forcefield::SPHFluidForceField<CudaVec3fTypes> >()
+#ifdef SOFA_GPU_CUDA_DOUBLE
+        .add< component::forcefield::SPHFluidForceField<CudaVec3dTypes> >()
+#endif // SOFA_GPU_CUDA_DOUBLE
+        ;
+
+} // namespace sofa::gpu::cuda
+
+
+
+
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cu b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cu
new file mode 100644
index 0000000..1ab6b88
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.cu
@@ -0,0 +1,1246 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/sofa/gpu/cuda/CudaCommon.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaMath.h>
+#include "cuda.h"
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+namespace sofa
+{
+namespace gpu
+{
+namespace cuda
+{
+#endif
+
+template<class real>
+class GPUSPHFluid
+{
+public:
+    real h;         ///< particles radius
+    real h2;        ///< particles radius squared
+    real inv_h2;    ///< particles radius squared inverse
+    real stiffness; ///< pressure stiffness
+    real mass;      ///< particles mass
+    real mass2;     ///< particles mass squared
+    real density0;  ///< 1000 kg/m3 for water
+    real viscosity;
+    real surfaceTension;
+
+    // Precomputed constants for smoothing kernels
+    real CWd;          ///< =     constWd(h)
+    //real CgradWd;      ///< = constGradWd(h)
+    real CgradWp;      ///< = constGradWp(h)
+    real ClaplacianWv; ///< =  constLaplacianWv(h)
+};
+
+typedef GPUSPHFluid<float> GPUSPHFluid3f;
+typedef GPUSPHFluid<double> GPUSPHFluid3d;
+
+extern "C"
+{
+
+    void SPHFluidForceFieldCuda3f_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* pos4, const void* x);
+    void SPHFluidForceFieldCuda3f_addForce (int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* f, const void* pos4, const void* v);
+//void SPHFluidForceFieldCuda3f_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* f, const void* pos4, const void* v, const void* dx);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+    void SPHFluidForceFieldCuda3d_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* pos4, const void* x);
+    void SPHFluidForceFieldCuda3d_addForce (int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* f, const void* pos4, const void* v);
+//void SPHFluidForceFieldCuda3d_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* f, const void* pos4, const void* v, const void* dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+}
+
+//////////////////////
+// GPU-side methods //
+//////////////////////
+
+#ifndef R_PI
+#ifdef M_PI
+#define R_PI M_PI
+#else
+#define R_PI 3.141592653589793238462
+#endif
+#endif
+
+template<class real, int kernelType>
+class SPH;
+
+template<class real>
+class SPH<real,1>
+{
+public:
+    typedef real Real;
+    typedef CudaVec3<Real> Deriv;
+    static Real constW(Real h)
+    {
+        return (Real) (48/R_PI)/(h*h*h);
+    }
+
+    static __device__ Real  W0(Real C)
+    {
+        return C*((Real)(1.0/6.0));
+    }
+    static __device__ Real  W0()
+    {
+        return ((Real)(1.0/6.0));
+    }
+    static __device__ Real W(Real r_h, Real C)
+    {
+        Real s = 1-r_h;
+        if (r_h < (Real)0.5)   return C*((Real)(1.0/6.0) - r_h*r_h*s);
+        else                   return C*((Real)(1.0/3.0) * (s*s*s));
+    }
+    static __device__ Real W2(Real r2_h2, Real C)
+    {
+        Real r_h = sqrtf(r2_h2);
+        Real s = 1-r_h;
+        if (r2_h2 < (Real)0.25) return C*((Real)(1.0/6.0) - r2_h2*s);
+        else                    return C*((Real)(1.0/3.0) * (s*s*s));
+    }
+    static __device__ Real W2(Real r2_h2)
+    {
+        Real r_h = sqrtf(r2_h2);
+        Real s = 1-r_h;
+        if (r2_h2 < (Real)0.25) return   ((Real)(1.0/6.0) - r2_h2*s);
+        else                    return   ((Real)(1.0/3.0) * (s*s*s));
+    }
+    static Real constGradW(Real h)
+    {
+        return constW(h)/(h*h);
+    }
+    static __device__ Deriv gradW(const Deriv& d, Real r_h, Real C)
+    {
+        Real g;
+        if (r_h < (Real)0.5)    g = 3*r_h - 2;
+        else  { Real s = 1-r_h; g = -s*s/r_h; }
+        return d*(C*g);
+    }
+    static Real  constLaplacianW(Real h)
+    {
+        return constW(h)/(h*h);
+    }
+    static __device__ Real  laplacianW(Real r_h, Real C)
+    {
+        if (r_h < (Real)0.5)    return C*(12*r_h-6);
+        else if (r_h < (Real)1) return C*(6-4*r_h-2/r_h);
+        else return 0;
+    }
+    /// Density Smoothing Kernel
+    static Real  constWd(Real h)
+    {
+        return constW(h);
+    }
+    static __device__ Real  Wd(Real r_h, Real C)
+    {
+        return W(r_h,C);
+    }
+    static __device__ Real  Wd2(Real r2_h2, Real C)
+    {
+        return W2(r2_h2,C);
+    }
+    static __device__ Real  Wd2(Real r2_h2)
+    {
+        return W2(r2_h2);
+    }
+    static __device__ Real  Wd0()
+    {
+        return W0();
+    }
+    static Real  constGradWd(Real h)
+    {
+        return constGradW(h);
+    }
+    static __device__ Deriv gradWd(const Deriv& d, Real r_h, Real C)
+    {
+        return gradW(d,r_h,C);
+    }
+    static __device__ Deriv gradWd2(const Deriv& d, Real r2_h2, Real C)
+    {
+        return gradW(d,sqrtf(r2_h2),C);
+    }
+    static Real  constLaplacianWd(Real h)
+    {
+        return constLaplacianW(h);
+    }
+    static __device__ Real  laplacianWd(Real r_h, Real C)
+    {
+        return laplacianW(r_h,C);
+    }
+    static __device__ Real  laplacianWd2(Real r2_h2, Real C)
+    {
+        return laplacianW(sqrtf(r2_h2),C);
+    }
+
+    /// Pressure Smoothing Kernel
+    static Real  constWp(Real h)
+    {
+        return constW(h);
+    }
+    static __device__ Real  Wp(Real r_h, Real C)
+    {
+        return W(r_h,C);
+    }
+    static Real  constGradWp(Real h)
+    {
+        return constGradW(h);
+    }
+    static __device__ Deriv gradWp(const Deriv& d, Real r_h, Real C)
+    {
+        return gradW(d,r_h,C);
+    }
+
+    /// Viscosity Smoothing Kernel
+    static Real  constWv(Real h)
+    {
+        return constW(h);
+    }
+    static __device__ Real  Wv(Real r_h, Real C)
+    {
+        return W(r_h,C);
+    }
+    static __device__ Real  Wv2(Real r2_h2, Real r_h, Real C)
+    {
+        return W2(r2_h2,r_h,C);
+    }
+    static Real  constGradWv(Real h)
+    {
+        return constGradW(h);
+    }
+    static __device__ Deriv gradWv(const Deriv& d, Real r_h, Real C)
+    {
+        return gradW(d,r_h,C);
+    }
+    static __device__ Deriv gradWv2(const Deriv& d, Real r2_h2, Real r_h, Real C)
+    {
+        return gradW(d,r_h,C);
+    }
+    static Real  constLaplacianWv(Real h)
+    {
+        return constLaplacianW(h);
+    }
+
+    static __device__ Real  laplacianWv(Real r_h, Real C)
+    {
+        return laplacianW(r_h,C);
+    }
+};
+
+template<class real>
+class SPH<real,0>
+{
+public:
+    typedef real Real;
+    typedef CudaVec3<Real> Deriv;
+    /// Density Smoothing Kernel:  W = 315 / 64pih9 * (h2 - r2)3 = 315 / 64pih3 * (1 - (r/h)2)3
+    static Real  constWd(Real h)
+    {
+        return (Real)(315 / (64*R_PI*h*h*h));
+    }
+    static __device__ Real  Wd(Real r_h, Real C)
+    {
+        Real a = (1-r_h*r_h);
+        return  C*a*a*a;
+    }
+    static __device__ Real  Wd2(Real r2_h2, Real C)
+    {
+        Real a = (1-r2_h2);
+        return  C*a*a*a;
+    }
+    static __device__ Real  Wd2(Real r2_h2)
+    {
+        Real a = (1-r2_h2);
+        return  a*a*a;
+    }
+    static __device__ Real  Wd0(Real C)
+    {
+        return C;
+    }
+    static __device__ Real  Wd0()
+    {
+        return 1;
+    }
+    static Real  constGradWd(Real h)
+    {
+        return -6*constWd(h)/(h*h);
+    }
+    static __device__ Deriv gradWd(const Deriv& d, Real r_h, Real C)
+    {
+        Real a = (1-r_h*r_h);
+        return d*(C*a*a);
+    }
+    static __device__ Deriv gradWd2(const Deriv& d, Real r2_h2, Real C)
+    {
+        Real a = (1-r2_h2);
+        return d*(C*a*a);
+    }
+    static Real  constLaplacianWd(Real h)
+    {
+        return -6*constWd(h)/(h*h);
+    }
+    static __device__ Real  laplacianWd(Real r_h, Real C)
+    {
+        Real r2_h2 = r_h*r_h;
+        return C*((1-r2_h2)*(3-7*r2_h2));
+    }
+    static __device__ Real  laplacianWd2(Real r2_h2, Real C)
+    {
+        return C*((1-r2_h2)*(3-7*r2_h2));
+    }
+
+    /// Pressure Smoothing Kernel:  W = 15 / pih6 (h - r)3 = 15 / pih3 (1 - r/h)3
+    static Real  constWp(Real h)
+    {
+        return (Real)(15 / (R_PI*h*h*h));
+    }
+    static __device__ Real  Wp(Real r_h, Real C)
+    {
+        Real a = (1-r_h);
+        return  C*a*a*a;
+    }
+    static Real  constGradWp(Real h)
+    {
+        return (-3*constWp(h)) / (h*h);
+    }
+    static __device__ Deriv gradWp(const Deriv& d, Real r_h, Real C)
+    {
+        Real a = (1-r_h);
+        return d * (C*a*a/r_h);
+    }
+
+    /// Viscosity Smoothing Kernel:  W = 15/(2pih3) (-r3/2h3 + r2/h2 + h/2r - 1)
+    static Real  constWv(Real h)
+    {
+        return (Real)(15/(2*R_PI*h*h*h));
+    }
+    static __device__ Real  Wv(Real r_h, Real C)
+    {
+        Real r2_h2 = r_h*r_h;
+        Real r3_h3 = r2_h2*r_h;
+        return C*(-0.5f*r3_h3 + r2_h2 + 0.5f/r_h - 1);
+    }
+    static __device__ Real  Wv2(Real r2_h2, Real r_h, Real C)
+    {
+        Real r3_h3 = r2_h2*r_h;
+        return C*(-0.5f*r3_h3 + r2_h2 + 0.5f/r_h - 1);
+    }
+    static Real  constGradWv(Real h)
+    {
+        return constWv(h)/(h*h);
+    }
+    static __device__ Deriv gradWv(const Deriv& d, Real r_h, Real C)
+    {
+        Real r3_h3 = r_h*r_h*r_h;
+        return d * (C*(2.0f -3.0f*r_h  - 0.5f/r3_h3));
+    }
+    static __device__ Deriv gradWv2(const Deriv& d, Real r2_h2, Real r_h, Real C)
+    {
+        Real r3_h3 = r2_h2*r_h;
+        return d * (C*(-3*r_h  + 4 - 1/r3_h3));
+    }
+    static Real  constLaplacianWv(Real h)
+    {
+        return 6*constWv(h)/(h*h);
+    }
+
+    static __device__ Real  laplacianWv(Real r_h, Real C)
+    {
+        return C*(1-r_h);
+    }
+};
+
+template<class real, int kernelType, int pressureType>
+__device__ void SPHFluidInitDensity(CudaVec3<real> /*x1*/, real& density, GPUSPHFluid<real>& params)
+{
+    density = 0; //params.mass * params.CWd; //SPH<real,kernelType>::Wd(0,params.CWd);
+}
+
+template<class real, int kernelType, int pressureType>
+__device__ void SPHFluidCalcDensity(CudaVec3<real> x1, CudaVec3<real> x2, real& density, GPUSPHFluid<real>& params)
+{
+    CudaVec3<real> n = x2-x1;
+    real d2 = norm2(n);
+    if (sqrtf(d2) < params.h)
+        //if (d2 < params.h2)
+    {
+        //real r_h = rsqrtf(params.h2/d2);
+        //real r_h = sqrtf(d2/params.h2);
+        real r2_h2 = (d2/params.h2);
+        real d = SPH<real,kernelType>::Wd2(r2_h2);
+        density += d;
+    }
+}
+
+template<class real, int kernelType, int pressureType>
+__device__ void SPHFluidFinalDensity(CudaVec3<real> /*x1*/, real& density, GPUSPHFluid<real>& params)
+{
+    density = (SPH<real,kernelType>::Wd0()+density) * params.CWd * params.mass; //SPH<real,kernelType>::Wd(0,params.CWd);
+}
+
+template<class real, int kernelType, int pressureType, int viscosityType, int surfaceTensionType>
+__device__ void SPHFluidCalcForce(CudaVec4<real> x1, CudaVec3<real> v1, CudaVec4<real> x2, CudaVec3<real> v2, CudaVec3<real>& force, GPUSPHFluid<real>& params)
+{
+    CudaVec3<real> n = CudaVec3<real>::make(x2.x-x1.x,x2.y-x1.y,x2.z-x1.z);
+    real d2 = norm2(n);
+    if (d2 < params.h2)
+    {
+        //real r_h = rsqrtf(params.h2/d2);
+        //real r_h = sqrtf(d2/params.h2);
+        real r2_h2 = (d2/params.h2);
+        real r_h = sqrtf(r2_h2);
+        real density1 = x1.w;
+        real density2 = x2.w;
+
+        // Pressure
+        real pressure1 = params.stiffness * (density1 - params.density0);
+        real pressure2 = params.stiffness * (density2 - params.density0);
+        real pressureFV = params.mass2 * (pressure1 / (density1*density1) + pressure2 / (density2*density2) );
+
+        // Viscosity
+        if (viscosityType == 1)
+        {
+            force += ( v2 - v1 ) * ( params.mass2 * params.viscosity * SPH<real,kernelType>::laplacianWv(r_h,params.ClaplacianWv) / (density1 * density2) );
+        }
+        else if (viscosityType == 2)
+        {
+            real vx = dot(n,v2-v1);
+            if (vx < 0)
+                pressureFV += - (vx * params.viscosity * params.h * params.mass / ((r2_h2 + 0.01f*params.h2)*(density1+density2)*0.5f));
+        }
+
+        force += SPH<real,kernelType>::gradWp(n, r_h, params.CgradWp) * pressureFV;
+    }
+}
+/*
+template<class real, int kernelType, int pressureType, int viscosityType, int surfaceTensionType>
+__device__ void SPHFluidCalcDForce(CudaVec4<real> x1, CudaVec3<real> v1, CudaVec3<real> dx1, CudaVec4<real> x2, CudaVec3<real> v2,  CudaVec3<real> dx2, CudaVec3<real>& dforce, GPUSPHFluid<real>& params)
+{
+    CudaVec3<real> n = CudaVec3<real>::make(x2.x-x1.x,x2.y-x1.y,x2.z-x1.z);
+    real d2 = norm2(n);
+    if (d2 < params.h2)
+    {
+        CudaVec3<real> dxi = dx2-dx1;
+
+        real inv_d = rsqrtf(d2);
+
+        real r2_h2 = (d2/params.h2);
+        real r_h = sqrtf(r2_h2);
+
+        real density1 = x1.w;
+        real density2 = x2.w;
+
+        // Pressure
+        real pressure1 = params.stiffness * (density1 - params.density0);
+        real pressure2 = params.stiffness * (density2 - params.density0);
+
+        real fpressure = params.mass2 * (pressure1 / (density1*density1) + pressure2 / (density2*density2) );
+
+        // Derivatives
+
+        CudaVec3<real> dn = dxi * inv_d;
+
+        real dr_h = dot(dn,n)/params.h;
+        //real dr2_h2 = 2*r_h*dr_h;
+
+        real ddensity = dot(dxi,SPH<real,kernelType>::gradWd2(n, r2_h2, params.CgradWd));
+        real dpressure = params.stiffness*ddensity;
+        // d(a/b^2) = (d(a)*b-2*a*d(b))/b^3
+        real dfpressure = params.mass2 * ( (dpressure*density1-2*pressure1*ddensity)/(density1*density1*density1)
+                                          +(dpressure*density2-2*pressure2*ddensity)/(density2*density2*density2) );
+        real a = (1-r_h);
+        real dWp = params.CgradWp*a*a;
+        real ddWp = -2*params.CgradWp*a*dr_h;
+
+        // f = n * dWp * fpressure
+        // df = dn * (dWp * fpressure) + n * (ddWp * fpressure + dWp * dfpressure);
+
+        dforce += dn * (dWp  * fpressure) + n * (ddWp * fpressure + dWp * dfpressure);
+
+        // Viscosity
+        // force += ( v2 - v1 ) * ( params.mass2 * params.viscosity * params.ClaplacianWv * (1-r_h) / (density1 * density2) );
+        real d1d2 = density1*density2;
+        dforce += dxi * ( params.mass2 * params.viscosity * params.ClaplacianWv * (1-r_h) / (density1 * density2) )
+            + (v2-v1) * ( params.mass2 * params.viscosity * params.ClaplacianWv * (-dr_h*d1d2 - (1-r_h)*ddensity*(density1+density2))/(d1d2*d1d2));
+
+    }
+}
+*/
+template<class real, int kernelType, int pressureType>
+__global__ void SPHFluidForceFieldCuda3t_computeDensity_kernel(int size, const int *cells, const int *cellGhost, GPUSPHFluid<real> params, real* pos4, const real* x)
+{
+    __shared__ int2 range;
+    __shared__ int ghost;
+    __shared__ real temp_x[BSIZE*3];
+    int tx3 = threadIdx.x * 3;
+    for (int cell = blockIdx.x; cell < size; cell += gridDim.x)
+    {
+        __syncthreads();
+        if (!threadIdx.x)
+        {
+            //range = *(const int2*)(cells+cell);
+            range.x = cells[cell];
+            range.y = cells[cell+1];
+            range.y &= ~(1U<<31);
+            ghost = cellGhost[cell];
+        }
+        __syncthreads();
+        if (range.x <= 0) continue; // no actual particle in this cell
+        for (int px0 = range.x; px0 < ghost; px0 += BSIZE)
+        {
+            int px = px0 + threadIdx.x;
+            CudaVec3<real> xi;
+            real density;
+            int index;
+            if (px < range.y)
+            {
+                index = cells[px];
+                xi = ((const CudaVec3<real>*)x)[index];
+                temp_x[tx3  ] = xi.x;
+                temp_x[tx3+1] = xi.y;
+                temp_x[tx3+2] = xi.z;
+            }
+            __syncthreads();
+            if (px < ghost)
+            {
+                // actual particle -> compute interactions
+                SPHFluidInitDensity<real,kernelType,pressureType>(xi, density, params);
+
+                int np = min(range.y-px0,BSIZE);
+                for (int i=0; i < np; ++i)
+                {
+                    if (i != threadIdx.x)
+                        SPHFluidCalcDensity<real,kernelType,pressureType>(xi, ((const CudaVec3<real>*)temp_x)[i], density, params);
+                }
+            }
+            __syncthreads();
+            // loop through other groups of particles
+            for (int py0 = range.x; py0 < range.y; py0 += BSIZE)
+            {
+                if (py0 == px0) continue;
+                int py = py0 + threadIdx.x;
+                if (py < range.y)
+                {
+                    int index2 = cells[py];
+                    CudaVec3<real> xj = ((const CudaVec3<real>*)x)[index2];
+                    temp_x[tx3  ] = xj.x;
+                    temp_x[tx3+1] = xj.y;
+                    temp_x[tx3+2] = xj.z;
+                }
+                __syncthreads();
+                if (px < ghost)
+                {
+                    // actual particle -> compute interactions
+                    int np = min(range.y-py0,BSIZE);
+                    for (int i=0; i < np; ++i)
+                    {
+                        SPHFluidCalcDensity<real,kernelType,pressureType>(xi, ((const CudaVec3<real>*)temp_x)[i], density, params);
+                    }
+                }
+                __syncthreads();
+            }
+            if (px < ghost)
+            {
+                // actual particle -> write computed density
+                SPHFluidFinalDensity<real,kernelType,pressureType>(xi, density, params);
+                CudaVec4<real> res = CudaVec4<real>::make(xi.x,xi.y,xi.z,density);
+                ((CudaVec4<real>*)pos4)[index] = res;
+            }
+        }
+    }
+}
+
+template<class real, int kernelType, int pressureType, int viscosityType, int surfaceTensionType>
+__global__ void SPHFluidForceFieldCuda3t_addForce_kernel(int size, const int *cells, const int *cellGhost, GPUSPHFluid<real> params, real* f, const real* pos4, const real* v)
+{
+    __shared__ int2 range;
+    __shared__ int ghost;
+    __shared__ real temp_x[BSIZE*4];
+    __shared__ real temp_v[BSIZE*3];
+    int tx3 = threadIdx.x * 3;
+    int tx4 = threadIdx.x << 2;
+    for (int cell = blockIdx.x; cell < size; cell += gridDim.x)
+    {
+        if (!threadIdx.x)
+        {
+            //range = *(const int2*)(cells+cell);
+            range.x = cells[cell];
+            range.y = cells[cell+1];
+            range.y &= ~(1U<<31);
+            ghost = cellGhost[cell];
+        }
+        __syncthreads();
+        if (range.x <= 0) continue; // no actual particle in this cell
+        for (int px0 = range.x; px0 < ghost; px0 += BSIZE)
+        {
+            int px = px0 + threadIdx.x;
+            CudaVec4<real> xi;
+            CudaVec3<real> vi;
+            CudaVec3<real> force;
+            int index;
+            if (px < range.y)
+            {
+                index = cells[px];
+                xi = ((const CudaVec4<real>*)pos4)[index];
+                temp_x[tx4  ] = xi.x;
+                temp_x[tx4+1] = xi.y;
+                temp_x[tx4+2] = xi.z;
+                temp_x[tx4+3] = xi.w;
+                vi = ((const CudaVec3<real>*)v)[index];
+                temp_v[tx3  ] = vi.x;
+                temp_v[tx3+1] = vi.y;
+                temp_v[tx3+2] = vi.z;
+            }
+            __syncthreads();
+            if (px < ghost)
+            {
+                // actual particle -> compute interactions
+                force = CudaVec3<real>::make(0,0,0);
+                int np = min(range.y-px0,BSIZE);
+                for (int i=0; i < np; ++i)
+                {
+                    if (i != threadIdx.x)
+                        SPHFluidCalcForce<real,kernelType,pressureType,viscosityType,surfaceTensionType>(xi, vi, ((const CudaVec4<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], force, params);
+                }
+            }
+            __syncthreads();
+            // loop through other groups of particles
+            for (int py0 = range.x; py0 < range.y; py0 += BSIZE)
+            {
+                if (py0 == px0) continue;
+                int py = py0 + threadIdx.x;
+                if (py < range.y)
+                {
+                    int index2 = cells[py];
+                    CudaVec4<real> xj = ((const CudaVec4<real>*)pos4)[index2];
+                    temp_x[tx4  ] = xj.x;
+                    temp_x[tx4+1] = xj.y;
+                    temp_x[tx4+2] = xj.z;
+                    temp_x[tx4+3] = xj.w;
+                    CudaVec3<real> vj = ((const CudaVec3<real>*)v)[index2];
+                    temp_v[tx3  ] = vj.x;
+                    temp_v[tx3+1] = vj.y;
+                    temp_v[tx3+2] = vj.z;
+                }
+                __syncthreads();
+                if (px < ghost)
+                {
+                    // actual particle -> compute interactions
+                    int np = min(range.y-py0,BSIZE);
+                    for (int i=0; i < np; ++i)
+                    {
+                        SPHFluidCalcForce<real,kernelType,pressureType,viscosityType,surfaceTensionType>(xi, vi, ((const CudaVec4<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], force, params);
+                    }
+                }
+                __syncthreads();
+            }
+            if (px < ghost)
+            {
+                // actual particle -> write computed force
+                ((CudaVec3<real>*)f)[index] += force;
+            }
+        }
+    }
+}
+/*
+template<class real, int kernelType, int pressureType, int viscosityType, int surfaceTensionType>
+__global__ void SPHFluidForceFieldCuda3t_addDForce_kernel(int size, const int *cells, const int *cellGhost, GPUSPHFluid<real> params, real* df, const real* pos4, const real* v, const real* dx)
+{
+    __shared__ int2 range;
+    __shared__ int ghost;
+    __shared__ real temp_x[BSIZE*4];
+    __shared__ real temp_v[BSIZE*3];
+    __shared__ real temp_dx[BSIZE*3];
+    int tx3 = threadIdx.x * 3;
+    int tx4 = threadIdx.x << 2;
+    for (int cell = blockIdx.x; cell < size; cell += gridDim.x)
+    {
+        if (!threadIdx.x)
+        {
+            //range = *(const int2*)(cells+cell);
+            range.x = cells[cell];
+            range.y = cells[cell+1];
+            range.y &= ~(1U<<31);
+            ghost = cellGhost[cell];
+        }
+        __syncthreads();
+        if (range.x <= 0) continue; // no actual particle in this cell
+        for (int px0 = range.x; px0 < ghost; px0 += BSIZE)
+        {
+            int px = px0 + threadIdx.x;
+            CudaVec4<real> xi;
+            CudaVec3<real> vi;
+            CudaVec3<real> dxi;
+            CudaVec3<real> dforce;
+            int index;
+            if (px < range.y)
+            {
+                index = cells[px];
+                xi = ((const CudaVec4<real>*)pos4)[index];
+                temp_x[tx4  ] = xi.x;
+                temp_x[tx4+1] = xi.y;
+                temp_x[tx4+2] = xi.z;
+                temp_x[tx4+3] = xi.w;
+                vi = ((const CudaVec3<real>*)v)[index];
+                temp_v[tx3  ] = vi.x;
+                temp_v[tx3+1] = vi.y;
+                temp_v[tx3+2] = vi.z;
+                dxi = ((const CudaVec3<real>*)dx)[index];
+                temp_dx[tx3  ] = dxi.x;
+                temp_dx[tx3+1] = dxi.y;
+                temp_dx[tx3+2] = dxi.z;
+            }
+            __syncthreads();
+            if (px < ghost)
+            { // actual particle -> compute interactions
+                dforce = CudaVec3<real>::make(0,0,0);
+                int np = min(range.y-px0,BSIZE);
+                for (int i=0; i < np; ++i)
+                {
+                    if (i != threadIdx.x)
+                        SPHFluidCalcDForce(xi, vi, dxi, ((const CudaVec4<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], ((const CudaVec3<real>*)temp_dx)[i], dforce, params);
+                }
+            }
+            __syncthreads();
+            // loop through other groups of particles
+            for (int py0 = range.x; py0 < range.y; py0 += BSIZE)
+            {
+                if (py0 == px0) continue;
+                int py = py0 + threadIdx.x;
+                if (py < range.y)
+                {
+                    int index2 = cells[py];
+                    CudaVec4<real> xj = ((const CudaVec4<real>*)pos4)[index2];
+                    temp_x[tx4  ] = xj.x;
+                    temp_x[tx4+1] = xj.y;
+                    temp_x[tx4+2] = xj.z;
+                    temp_x[tx4+3] = xj.w;
+                    CudaVec3<real> vj = ((const CudaVec3<real>*)v)[index2];
+                    temp_v[tx3  ] = vj.x;
+                    temp_v[tx3+1] = vj.y;
+                    temp_v[tx3+2] = vj.z;
+                    CudaVec3<real> dxj = ((const CudaVec3<real>*)dx)[index2];
+                    temp_dx[tx3  ] = dxj.x;
+                    temp_dx[tx3+1] = dxj.y;
+                    temp_dx[tx3+2] = dxj.z;
+                }
+                __syncthreads();
+                if (px < ghost)
+                { // actual particle -> compute interactions
+                    int np = min(range.y-py0,BSIZE);
+                    for (int i=0; i < np; ++i)
+                    {
+                        SPHFluidCalcDForce(xi, vi, dxi, ((const CudaVec4<real>*)temp_x)[i], ((const CudaVec3<real>*)temp_v)[i], ((const CudaVec3<real>*)temp_dx)[i], dforce, params);
+                    }
+                }
+                __syncthreads();
+            }
+            if (px < ghost)
+            { // actual particle -> write computed force
+                ((CudaVec3<real>*)df)[index] += dforce;
+            }
+        }
+    }
+}
+*/
+//////////////////////
+// CPU-side methods //
+//////////////////////
+
+void SPHFluidForceFieldCuda3f_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* pos4, const void* x)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60,1);
+    switch(kernelType)
+    {
+    case 0:
+        switch(pressureType)
+        {
+        case 0: //SPHFluidForceFieldCuda3t_computeDensity_kernel<float,0,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)pos4, (const float*)x);
+            break;
+        case 1: SPHFluidForceFieldCuda3t_computeDensity_kernel<float,0,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)pos4, (const float*)x);
+            break;
+        default: break;
+        }
+        break;
+    case 1:
+        switch(pressureType)
+        {
+        case 0: //SPHFluidForceFieldCuda3t_computeDensity_kernel<float,1,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)pos4, (const float*)x);
+            break;
+        case 1: SPHFluidForceFieldCuda3t_computeDensity_kernel<float,1,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)pos4, (const float*)x);
+            break;
+        default: break;
+        }
+        break;
+    default: break;
+    }
+    mycudaDebugError("SPHFluidForceFieldCuda3t_computeDensity_kernel<float>");
+}
+
+void SPHFluidForceFieldCuda3f_addForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* f, const void* x, const void* v)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60,1);
+    switch(kernelType)
+    {
+    case 0:
+        switch(pressureType)
+        {
+        case 0:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,0,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,0,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,0,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,1,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,1,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,1,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,2,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,2,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,0,2,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        case 1:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,0,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,0,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,0,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,1,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,1,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,1,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,2,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,2,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,0,1,2,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        default: break;
+        }
+        break;
+    case 1:
+        switch(pressureType)
+        {
+        case 0:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,0,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,0,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,0,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,1,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,1,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,1,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,2,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,2,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,0,2,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        case 1:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,0,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,0,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,0,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,1,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,1,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,1,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,2,0><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,2,1><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<float,1,1,2,2><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)f, (const float*)x, (const float*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        default: break;
+        }
+        break;
+    default: break;
+    }
+    mycudaDebugError("SPHFluidForceFieldCuda3t_addForce_kernel<float>");
+}
+/*
+void SPHFluidForceFieldCuda3f_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* df, const void* x, const void* v, const void* dx)
+{
+	dim3 threads(BSIZE,1);
+	dim3 grid(60/BSIZE,1);
+	{SPHFluidForceFieldCuda3t_addDForce_kernel<float><<< grid, threads, BSIZE*3*sizeof(float) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (float*)df, (const float*)x, (const float*)v, (const float*)dx); mycudaDebugError("SPHFluidForceFieldCuda3t_addDForce_kernel<float>");}
+}
+*/
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+void SPHFluidForceFieldCuda3d_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* pos4, const void* x)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60,1);
+    switch(kernelType)
+    {
+    case 0:
+        switch(pressureType)
+        {
+        case 0: //SPHFluidForceFieldCuda3t_computeDensity_kernel<double,0,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)pos4, (const double*)x);
+            break;
+        case 1: SPHFluidForceFieldCuda3t_computeDensity_kernel<double,0,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)pos4, (const double*)x);
+            break;
+        default: break;
+        }
+        break;
+    case 1:
+        switch(pressureType)
+        {
+        case 0: //SPHFluidForceFieldCuda3t_computeDensity_kernel<double,1,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)pos4, (const double*)x);
+            break;
+        case 1: SPHFluidForceFieldCuda3t_computeDensity_kernel<double,1,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)pos4, (const double*)x);
+            break;
+        default: break;
+        }
+        break;
+    default: break;
+    }
+    mycudaDebugError("SPHFluidForceFieldCuda3t_computeDensity_kernel<double>");
+
+}
+
+void SPHFluidForceFieldCuda3d_addForce (int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* f, const void* x, const void* v)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid(60,1);
+    switch(kernelType)
+    {
+    case 0:
+        switch(pressureType)
+        {
+        case 0:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,0,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,0,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,0,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,1,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,1,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,1,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,2,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,2,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,0,2,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        case 1:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,0,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,0,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,0,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,1,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,1,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,1,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,2,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,2,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,0,1,2,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        default: break;
+        }
+        break;
+    case 1:
+        switch(pressureType)
+        {
+        case 0:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,0,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,0,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,0,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,1,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,1,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,1,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,2,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,2,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,0,2,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        case 1:
+            switch(viscosityType)
+            {
+            case 0:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,0,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,0,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,0,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 1:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,1,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,1,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,1,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            case 2:
+                switch(surfaceTensionType)
+                {
+                case 0: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,2,0><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 1: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,2,1><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                case 2: SPHFluidForceFieldCuda3t_addForce_kernel<double,1,1,2,2><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v);
+                    break;
+                default: break;
+                }
+                break;
+            default: break;
+            }
+            break;
+        default: break;
+        }
+        break;
+    default: break;
+    }
+    mycudaDebugError("SPHFluidForceFieldCuda3t_addForce_kernel<double>");
+    /*
+    	dim3 threads(BSIZE,1);
+    	dim3 grid(60/BSIZE,1);
+        if (params->surfaceTension > 0)
+    	{SPHFluidForceFieldCuda3t_addForce_kernel<double,true><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v); mycudaDebugError("SPHFluidForceFieldCuda3t_addForce_kernel<double,true>");}
+        else
+    	{SPHFluidForceFieldCuda3t_addForce_kernel<double,false><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)f, (const double*)x, (const double*)v); mycudaDebugError("SPHFluidForceFieldCuda3t_addForce_kernel<double,false>");}
+    */
+}
+/*
+void SPHFluidForceFieldCuda3d_addDForce(unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* df, const void* x, const void* v, const void* dx)
+{
+	dim3 threads(BSIZE,1);
+	dim3 grid(60/BSIZE,1);
+	{SPHFluidForceFieldCuda3t_addDForce_kernel<double><<< grid, threads, BSIZE*3*sizeof(double) >>>(size, (const int*) cells, (const int*) cellGhost, *params, (double*)df, (const double*)x, (const double*)v, (const double*)dx); mycudaDebugError("SPHFluidForceFieldCuda3t_addDForce_kernel<double>");}
+}
+*/
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+} // namespace cuda
+} // namespace gpu
+} // namespace sofa
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.h b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.h
new file mode 100644
index 0000000..7b9ead1
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.h
@@ -0,0 +1,136 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDASPHFLUIDFORCEFIELD_H
+#define SOFA_GPU_CUDA_CUDASPHFLUIDFORCEFIELD_H
+
+#include <SofaSphFluid/CUDA/config.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include <SofaSphFluid/SPHFluidForceField.h>
+#include <SofaSphFluid/CUDA/CudaSpatialGridContainer.h>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+template<class real>
+struct GPUSPHFluid
+{
+    real h;         ///< particles radius
+    real h2;        ///< particles radius squared
+    real inv_h2;    ///< particles radius squared inverse
+    real stiffness; ///< pressure stiffness
+    real mass;      ///< particles mass
+    real mass2;     ///< particles mass squared
+    real density0;  ///< 1000 kg/m3 for water
+    real viscosity;
+    real surfaceTension;
+
+    // Precomputed constants for smoothing kernels
+    real CWd;          ///< =     constWd(h)
+    //real CgradWd;      ///< = constGradWd(h)
+    real CgradWp;      ///< = constGradWp(h)
+    real ClaplacianWv; ///< =  constLaplacianWv(h)
+};
+
+typedef GPUSPHFluid<float> GPUSPHFluid3f;
+typedef GPUSPHFluid<double> GPUSPHFluid3d;
+
+} // namespace gpu::cuda
+
+
+namespace component::forcefield
+{
+
+template <class TCoord, class TDeriv, class TReal>
+class SPHFluidForceFieldInternalData< gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> >
+{
+public:
+    typedef gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> DataTypes;
+    typedef SPHFluidForceFieldInternalData<DataTypes> Data;
+    typedef SPHFluidForceField<DataTypes> Main;
+    typedef typename DataTypes::Coord Coord;
+    typedef typename DataTypes::Real Real;
+    gpu::cuda::GPUSPHFluid<Real> params;
+    gpu::cuda::CudaVector<type::Vec4f> pos4;
+
+    void fillParams(Main* m, int kernelType, double kFactor=1.0, double bFactor=1.0)
+    {
+        Real h = m->d_particleRadius.getValue();
+        params.h = h;
+        params.h2 = h*h;
+        params.inv_h2 = 1/(h*h);
+        params.stiffness = (Real)(kFactor*m->d_pressureStiffness.getValue());
+        params.mass = m->d_particleMass.getValue();
+        params.mass2 = params.mass*params.mass;
+        params.density0 = m->d_density0.getValue();
+        params.viscosity = (Real)(bFactor*m->d_viscosity.getValue());
+        params.surfaceTension = (Real)(kFactor*m->d_surfaceTension.getValue());
+        if (kernelType == 1)
+        {
+            params.CWd          = SPHKernel<SPH_KERNEL_CUBIC,Coord>::constW(h);
+            //params.CgradWd      = SPHKernel<SPH_KERNEL_CUBIC,Coord>::constGradW(h);
+            params.CgradWp      = SPHKernel<SPH_KERNEL_CUBIC,Coord>::constGradW(h);
+            params.ClaplacianWv = SPHKernel<SPH_KERNEL_CUBIC,Coord>::constLaplacianW(h);
+        }
+        else
+        {
+            params.CWd          = SPHKernel<SPH_KERNEL_DEFAULT_DENSITY,Coord>::constW(h);
+            //params.CgradWd      = SPHKernel<SPH_KERNEL_DEFAULT_DENSITY,Coord>::constGradW(h);
+            params.CgradWp      = SPHKernel<SPH_KERNEL_DEFAULT_PRESSURE,Coord>::constGradW(h);
+            params.ClaplacianWv = SPHKernel<SPH_KERNEL_DEFAULT_VISCOSITY,Coord>::constLaplacianW(h);
+        }
+    }
+
+    void Kernels_computeDensity(int kernelType, int pressureType, int gsize, const void* cells, const void* cellGhost, void* pos4, const void* x);
+    void Kernels_addForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* vel);
+    //void Kernels_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* dx, const void* vel);
+};
+
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::addForce(const core::MechanicalParams* mparams, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v);
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx);
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::draw(const core::visual::VisualParams* vparams);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3dTypes>::addForce(const core::MechanicalParams* mparams, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v);
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3dTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& d_df, const DataVecDeriv& d_dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+} // namespace component::forcefield
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.inl b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.inl
new file mode 100644
index 0000000..752ea1f
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSPHFluidForceField.inl
@@ -0,0 +1,273 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#ifndef SOFA_GPU_CUDA_CUDASPHFLUIDFORCEFIELD_INL
+#define SOFA_GPU_CUDA_CUDASPHFLUIDFORCEFIELD_INL
+
+#include "CudaSPHFluidForceField.h"
+#include <SofaSphFluid/SPHFluidForceField.inl>
+#include <sofa/gl/template.h>
+#include <sofa/core/MechanicalParams.h>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+extern "C"
+{
+
+void SPHFluidForceFieldCuda3f_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* pos4, const void* x);
+void SPHFluidForceFieldCuda3f_addForce (int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* f, const void* pos4, const void* v);
+//void SPHFluidForceFieldCuda3f_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3f* params, void* f, const void* pos4, const void* v, const void* dx);
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+void SPHFluidForceFieldCuda3d_computeDensity(int kernelType, int pressureType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* pos4, const void* x);
+void SPHFluidForceFieldCuda3d_addForce (int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* f, const void* pos4, const void* v);
+//void SPHFluidForceFieldCuda3d_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, unsigned int size, const void* cells, const void* cellGhost, GPUSPHFluid3d* params, void* f, const void* pos4, const void* v, const void* dx);
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+}
+
+} // namespace gpu::cuda
+
+
+namespace component::forcefield
+{
+
+using namespace gpu::cuda;
+
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3fTypes>::Kernels_computeDensity(int kernelType, int pressureType, int gsize, const void* cells, const void* cellGhost, void* pos4, const void* x)
+{
+    SPHFluidForceFieldCuda3f_computeDensity(kernelType, pressureType, gsize, cells, cellGhost, &params, pos4, x);
+}
+
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3fTypes>::Kernels_addForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* v)
+{
+    SPHFluidForceFieldCuda3f_addForce (kernelType, pressureType, viscosityType, surfaceTensionType, gsize, cells, cellGhost, &params, f, pos4, v);
+}
+/*
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3fTypes>::Kernels_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* v, const void* dx)
+{
+    SPHFluidForceFieldCuda3f_addDForce(kernelType, pressureType, viscosityType, surfaceTensionType, gsize, cells, cellGhost, &params, f, pos4, v, dx);
+}
+*/
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::addForce(const core::MechanicalParams* /*mparams*/, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v)
+{
+    if (m_grid == NULL) return;
+
+    const int kernelT = d_kernelType.getValue();
+    const int pressureT = d_pressureType.getValue();
+    const Real viscosity = this->d_viscosity.getValue();
+    const int viscosityT = (viscosity == 0) ? 0 : d_viscosityType.getValue();
+    const Real surfaceTension = this->d_surfaceTension.getValue();
+    const int surfaceTensionT = (surfaceTension <= 0) ? 0 : d_surfaceTensionType.getValue();
+
+    VecDeriv& f = *d_f.beginEdit();
+    const VecCoord& x = d_x.getValue();
+    const VecDeriv& v = d_v.getValue();
+
+    m_grid->updateGrid(x);
+    data.fillParams(this, kernelT);
+    f.resize(x.size());
+    Grid::Grid* g = m_grid->getGrid();
+    data.pos4.recreate(x.size());
+    data.Kernels_computeDensity( kernelT, pressureT,
+                                 g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                                 data.pos4.deviceWrite(), x.deviceRead());
+
+    msg_info() << "density[" << 0 << "] = " << data.pos4[0][3]
+            << "density[" << data.pos4.size()/2 << "] = " << data.pos4[data.pos4.size()/2][3];
+
+    data.Kernels_addForce( kernelT, pressureT, viscosityT, surfaceTensionT,
+                           g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                           f.deviceWrite(), data.pos4.deviceRead(), v.deviceRead());
+
+    d_f.endEdit();
+}
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& /*d_df*/, const DataVecDeriv& /*d_dx*/)
+{
+    mparams->setKFactorUsed(true);
+#if 0
+    if (m_grid == NULL) return;
+
+    const int kernelT = d_kernelType.getValue();
+    const int pressureT = d_pressureType.getValue();
+    const Real viscosity = this->d_viscosity.getValue();
+    const int viscosityT = (viscosity == 0) ? 0 : d_viscosityType.getValue();
+    const Real surfaceTension = this->d_surfaceTension.getValue();
+    const int surfaceTensionT = (surfaceTension <= 0) ? 0 : d_surfaceTensionType.getValue();
+
+    VecDeriv& df = *d_df.beginEdit();
+    const VecDeriv& dx = d_dx.getValue();
+
+    const VecDeriv& v = this->mstate->read(core::vec_id::read_access::velocity)->getValue();
+    data.fillParams(this, kernelT, mparams->kFactor(), sofa::core::mechanicalparams::bFactor(mparams));
+    df.resize(dx.size());
+    Grid::Grid* g = m_grid->getGrid();
+    data.Kernels_addDForce( kernelT, pressureT, viscosityT, surfaceTensionT,
+                            g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                            df.deviceWrite(), data.pos4.deviceRead(), v.deviceRead(), dx.deviceRead());
+
+    d_df.endEdit();
+#endif
+}
+
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3dTypes>::Kernels_computeDensity(int kernelType, int pressureType, int gsize, const void* cells, const void* cellGhost, void* pos4, const void* x)
+{
+    SPHFluidForceFieldCuda3d_computeDensity(kernelType, pressureType, gsize, cells, cellGhost, &params, pos4, x);
+}
+
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3dTypes>::Kernels_addForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* v)
+{
+    SPHFluidForceFieldCuda3d_addForce (kernelType, pressureType, viscosityType, surfaceTensionType, gsize, cells, cellGhost, &params, f, pos4, v);
+}
+/*
+template<>
+void SPHFluidForceFieldInternalData<gpu::cuda::CudaVec3dTypes>::Kernels_addDForce(int kernelType, int pressureType, int viscosityType, int surfaceTensionType, int gsize, const void* cells, const void* cellGhost, void* f, const void* pos4, const void* v, const void* dx)
+{
+    SPHFluidForceFieldCuda3d_addDForce(kernelType, pressureType, viscosityType, surfaceTensionType, gsize, cells, cellGhost, &params, f, pos4, v, dx);
+}
+*/
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3dTypes>::addForce(const core::MechanicalParams* /*mparams*/, DataVecDeriv& d_f, const DataVecCoord& d_x, const DataVecDeriv& d_v)
+{
+    if (m_grid == NULL) return;
+
+    const int kernelT = d_kernelType.getValue();
+    const int pressureT = d_pressureType.getValue();
+    const Real viscosity = this->d_viscosity.getValue();
+    const int viscosityT = (viscosity == 0) ? 0 : d_viscosityType.getValue();
+    const Real surfaceTension = this->d_surfaceTension.getValue();
+    const int surfaceTensionT = (surfaceTension <= 0) ? 0 : d_surfaceTensionType.getValue();
+
+    VecDeriv& f = *d_f.beginEdit();
+    const VecCoord& x = d_x.getValue();
+    const VecDeriv& v = d_v.getValue();
+
+    m_grid->updateGrid(x);
+    data.fillParams(this, kernelT);
+    f.resize(x.size());
+    Grid::Grid* g = m_grid->getGrid();
+    data.pos4.recreate(x.size());
+    data.Kernels_computeDensity( kernelT, pressureT,
+                                 g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                                 data.pos4.deviceWrite(), x.deviceRead());
+    msg_info() << "density[" << 0 << "] = " << data.pos4[0][3]
+            << "density[" << data.pos4.size()/2 << "] = " << data.pos4[data.pos4.size()/2][3];
+    data.Kernels_addForce( kernelT, pressureT, viscosityT, surfaceTensionT,
+                           g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                           f.deviceWrite(), data.pos4.deviceRead(), v.deviceRead());
+
+    d_f.endEdit();
+}
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3dTypes>::addDForce(const core::MechanicalParams* mparams, DataVecDeriv& /*d_df*/, const DataVecDeriv& /*d_dx*/)
+{
+    mparams->setKFactorUsed(true);
+#if 0
+    if (m_grid == NULL) return;
+
+    const int kernelT = d_kernelType.getValue();
+    const int pressureT = d_pressureType.getValue();
+    const Real viscosity = this->d_viscosity.getValue();
+    const int viscosityT = (viscosity == 0) ? 0 : d_viscosityType.getValue();
+    const Real surfaceTension = this->d_surfaceTension.getValue();
+    const int surfaceTensionT = (surfaceTension <= 0) ? 0 : d_surfaceTensionType.getValue();
+
+    VecDeriv& df = *d_df.beginEdit();
+    const VecDeriv& dx = d_dx.getValue();
+    //const VecCoord& x = this->mstate->read(core::ConstVecCoordId::position())->getValue();
+    const VecDeriv& v = this->mstate->read(core::vec_id::read_access::velocity)->getValue();
+    data.fillParams(this, mparams->kFactor(), sofa::core::mechanicalparams::bFactor(mparams));
+    df.resize(dx.size());
+    Grid::Grid* g = m_grid->getGrid();
+    data.Kernels_addDForce( kernelT, pressureT, viscosityT, surfaceTensionT,
+                            g->getNbCells(), g->getCellsVector().deviceRead(), g->getCellGhostVector().deviceRead(),
+                            df.deviceWrite(), data.pos4.deviceRead(), v.deviceRead(), dx.deviceRead());
+    d_df.endEdit();
+#endif
+}
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+
+
+template <>
+void SPHFluidForceField<gpu::cuda::CudaVec3fTypes>::draw(const core::visual::VisualParams* vparams)
+{
+#if SOFACUDA_HAVE_SOFA_GL == 1
+    if (!vparams->displayFlags().getShowForceFields()) return;
+    //if (m_grid != NULL)
+    //	grid->draw(vparams);
+    helper::ReadAccessor<VecCoord> x = this->mstate->read(sofa::core::vec_id::read_access::position)->getValue();
+    helper::ReadAccessor<gpu::cuda::CudaVector<type::Vec4f> > pos4 = this->data.pos4;
+    if (pos4.empty()) return;
+    glDisable(GL_LIGHTING);
+    glColor3f(0,1,1);
+    glDisable(GL_BLEND);
+    glDepthMask(1);
+    glPointSize(5);
+    glBegin(GL_POINTS);
+    for (unsigned int i=0; i<pos4.size(); i++)
+    {
+        float density = pos4[i][3];
+        float f = (float)(density / d_density0.getValue());
+        f = 1+10*(f-1);
+        if (f < 1)
+        {
+            glColor3f(0,1-f,f);
+        }
+        else
+        {
+            glColor3f(f-1,0,2-f);
+        }
+        sofa::gl::glVertexT(x[i]);
+    }
+    glEnd();
+    glPointSize(1);
+#else
+    SOFA_UNUSED(vparams);
+#endif // SOFACUDA_HAVE_SOFA_GL == 1
+}
+
+} // namespace component::forcefield
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cpp b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cpp
new file mode 100644
index 0000000..3fe81be
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cpp
@@ -0,0 +1,54 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#define SOFASPHFLUID_CUDA_CUDASPATIALGRIDCONTAINER_CPP
+#include <SofaSphFluid/CUDA/CudaSpatialGridContainer.inl>
+#include <sofa/component/statecontainer/MechanicalObject.inl>
+#include <sofa/core/ObjectFactory.h>
+
+
+namespace sofa::component::container
+{
+
+using namespace sofa::defaulttype;
+using namespace sofa::gpu::cuda;
+using namespace core::behavior;
+
+
+int SpatialGridContainerCudaClass = core::RegisterObject("GPU support using CUDA.")
+        .add< SpatialGridContainer<CudaVec3fTypes> >()
+        ;
+
+template class SOFA_SOFASPHFLUID_CUDA_API SpatialGridContainer< CudaVec3fTypes >;
+template class SOFA_SOFASPHFLUID_CUDA_API SpatialGrid< SpatialGridTypes< CudaVec3fTypes > >;
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+template class SOFA_SOFASPHFLUID_CUDA_API SpatialGridContainer< CudaVec3dTypes >;
+template class SOFA_SOFASPHFLUID_CUDA_API SpatialGrid< SpatialGridTypes< CudaVec3dTypes > >;
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+} // namespace sofa::component::container
+
+
+
+
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cu b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cu
new file mode 100644
index 0000000..83af602
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.cu
@@ -0,0 +1,377 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+/* PART OF THIS FILE IS FROM NVIDIA CUDA SDK particles demo:
+ *
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ */
+
+
+#include <SofaCUDA/sofa/gpu/cuda/CudaCommon.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaMath.h>
+#include <SofaCUDA/sofa/gpu/cuda/mycuda.h>
+#include <cuda.h>
+
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+namespace sofa
+{
+namespace gpu
+{
+namespace cuda
+{
+#endif
+
+extern "C"
+{
+    void SpatialGridContainer3f_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x);
+    void SpatialGridContainer3f1_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x);
+    void SpatialGridContainer_findCellRange(int cellBits, int index0, float cellWidth, int nbPoints, const void* particleHash8, void* cellRange, void* cellGhost);
+//void SpatialGridContainer3f_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x);
+//void SpatialGridContainer3f1_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x);
+}
+
+#define USE_TEX 0
+
+struct GridParams
+{
+    float cellWidth;
+    float invCellWidth;
+    int cellMask;
+    float halfCellWidth;
+    float invHalfCellWidth;
+};
+
+// large prime numbers
+#define HASH_PX 73856093
+#define HASH_PY 19349663
+#define HASH_PZ 83492791
+
+//////////////////////
+// GPU-side methods //
+//////////////////////
+
+#if USE_TEX
+texture<int, 1, cudaReadModeElementType> cellRangeTex;
+#endif
+
+__constant__ GridParams gridParams;
+
+// calculate cell in grid from position
+template<class T>
+__device__ int3 calcGridPos(T p)
+{
+    int3 i;
+    i.x = __float2int_rd(p.x * gridParams.invCellWidth);
+    i.y = __float2int_rd(p.y * gridParams.invCellWidth);
+    i.z = __float2int_rd(p.z * gridParams.invCellWidth);
+    return i;
+}
+
+// calculate address in grid from position
+__device__ unsigned int calcGridHashI(int3 p)
+{
+    //return ((p.x<<10)^(p.y<<5)^(p.z)) & gridParams.cellMask;
+    //return ((p.x)^(p.y)^(p.z)) & gridParams.cellMask;
+    return (__mul24(HASH_PX,p.x)^__mul24(HASH_PY,p.y)^__mul24(HASH_PZ,p.z)) & gridParams.cellMask;
+    //return (p.x) & gridParams.cellMask;
+}
+
+// calculate address in grid from position
+template<class T>
+__device__ unsigned int calcGridHash(T p)
+{
+    return calcGridHashI(calcGridPos(p));
+}
+
+
+__device__ __inline__ float3 getPos3(const float4* pos, int index0, int index)
+{
+    float4 p = pos[index];
+    return make_float3(p.x,p.y,p.z);
+}
+
+__shared__ float ftemp[BSIZE*3];
+
+__device__ __inline__ float3 getPos3(const float3* pos, int index0, int index)
+{
+    //return pos[index];
+
+    int index03 = index0 * 3;
+    int index3 = threadIdx.x * 3;
+    ftemp[threadIdx.x] = ((const float*)pos)[index03+threadIdx.x];
+    ftemp[threadIdx.x+BSIZE] = ((const float*)pos)[index03+threadIdx.x+BSIZE];
+    ftemp[threadIdx.x+2*BSIZE] = ((const float*)pos)[index03+threadIdx.x+2*BSIZE];
+    __syncthreads();
+    return make_float3(ftemp[index3],ftemp[index3+1],ftemp[index3+2]);
+}
+
+__device__ __inline__ float4 getPos4(const float4* pos, int index0, int index)
+{
+    return pos[index];
+}
+
+__device__ __inline__ float4 getPos4(const float3* pos, int index0, int index)
+{
+    int index3 = threadIdx.x * 3;
+    pos += index0;
+    ftemp[threadIdx.x] = ((const float*)pos)[threadIdx.x];
+    ftemp[threadIdx.x+BSIZE] = ((const float*)pos)[threadIdx.x+BSIZE];
+    ftemp[threadIdx.x+2*BSIZE] = ((const float*)pos)[threadIdx.x+2*BSIZE];
+    __syncthreads();
+    return make_float4(ftemp[index3],ftemp[index3+1],ftemp[index3+2],0.0f);
+}
+
+__device__ __inline__ float4 getPos4(const float4* pos, int index)
+{
+    return pos[index];
+}
+
+__device__ __inline__ float4 getPos4(const float3* pos, int index)
+{
+    float3 p = pos[index];
+    return make_float4(p.x,p.y,p.z,1.0f);
+}
+
+// calculate grid hash value for each particle
+template<class TIn>
+__global__ void
+computeHashD(const TIn* pos,
+        unsigned int* particleIndex8, unsigned int*  particleHash8, int n)
+{
+    int index0 = (blockIdx.x*BSIZE);
+    int index = index0 + threadIdx.x;
+    int nt = n - index0; if (nt > BSIZE) nt = BSIZE;
+    float3 p = getPos3(pos,index0,index);
+
+    int3 hgpos;
+    hgpos.x = __float2int_rd(p.x * gridParams.invHalfCellWidth);
+    hgpos.y = __float2int_rd(p.y * gridParams.invHalfCellWidth);
+    hgpos.z = __float2int_rd(p.z * gridParams.invHalfCellWidth);
+    int halfcell = ((hgpos.x&1) + ((hgpos.y&1)<<1) + ((hgpos.z&1)<<2))^7;
+    // compute the first cell to be influenced by the particle
+    hgpos.x = (hgpos.x-1) >> 1;
+    hgpos.y = (hgpos.y-1) >> 1;
+    hgpos.z = (hgpos.z-1) >> 1;
+
+    __syncthreads();
+
+    __shared__ int hx[3*BSIZE];
+    int x = threadIdx.x;
+
+//    hx[x] = (__mul24(HASH_PX,hgpos.x) << 3)+halfcell;
+//    hy[x] = __mul24(HASH_PY,hgpos.y);
+//    hz[x] = __mul24(HASH_PZ,hgpos.z);
+    hx[x] = ((HASH_PX*hgpos.x) << 3)+halfcell;
+    hx[BSIZE+x] = (HASH_PY*hgpos.y);
+    hx[2*BSIZE+x] = (HASH_PZ*hgpos.z);
+    __syncthreads();
+    int3 dH;
+    dH.x = (x&1 ? HASH_PX : 0);
+    dH.y = (x&2 ? HASH_PY : 0);
+    dH.z = (x&4 ? HASH_PZ : 0);
+    int x_7 = x&7;
+    int index0_8_x_7 = (index0 << 3) + x_7;
+    for (unsigned int lx = x>>3; lx < nt; lx+=(BSIZE>>3))
+    {
+        particleIndex8[index0_8_x_7 + (lx<<3)] = index0 + lx;
+        int3 h;
+        h.x = hx[lx];
+        h.y = hx[BSIZE+lx];
+        h.z = hx[2*BSIZE+lx];
+        int hc = h.x & 7;
+        h.x = (h.x>>3) + dH.x;
+        h.y += dH.y;
+        h.z += dH.z;
+        unsigned int hash = ((h.x ^ h.y ^ h.z) & gridParams.cellMask)<<1;
+        if (hc != x_7) ++hash;
+        particleHash8[index0_8_x_7 + (lx<<3)] = hash;
+    }
+}
+
+// find start of each cell in sorted particle list by comparing with previous hash value
+// one thread per particle
+__global__ void
+findCellRangeD(int index0, const unsigned int* particleHash,
+        int * cellRange, int* cellGhost, int n)
+{
+    unsigned int i = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    __shared__ unsigned int hash[BSIZE];
+    if (i < n)
+        hash[threadIdx.x] = particleHash[i];
+
+    __syncthreads();
+
+    if (i < n)
+    {
+        bool firstInCell;
+        bool firstGhost;
+        unsigned int cur = hash[threadIdx.x];
+        if (i == 0)
+        {
+            firstInCell = true;
+            firstGhost = cur&1;
+        }
+        else
+        {
+            unsigned int prev;
+            if (threadIdx.x > 0)
+                prev = hash[threadIdx.x-1];
+            else
+                prev = particleHash[i-1];
+            firstInCell = ((prev>>1) != (cur>>1));
+            firstGhost = ((prev != cur) && (cur&1));
+            if (firstInCell)
+            {
+                if ((prev>>1) < (cur>>1)-1)
+                    cellRange[ (prev>>1)+1 ] =  (index0+i) | (1U<<31);
+                if (!(prev&1)) // no ghost particles in previous cell
+                    cellGhost[ prev>>1 ] = index0+i;
+            }
+        }
+        if (firstInCell)
+            cellRange[ cur>>1 ] = index0+i;
+        if (firstGhost)
+            cellGhost[ cur>>1 ] = index0+i;
+        if (i == n-1)
+        {
+            cellRange[ (cur>>1)+1 ] = (index0+n) | (1U<<31);
+            if (!(cur&1))
+                cellGhost[ cur>>1 ] = index0+n;
+        }
+    }
+}
+
+// rearrange particle data into sorted order
+template<class TIn>
+__global__ void
+reorderDataD(const uint2*  particleHash,  // particle id sorted by hash
+        const TIn* oldPos,
+        float4* sortedPos, int n
+            )
+{
+    int index0 = __mul24(blockIdx.x, blockDim.x);
+    int index = index0 + threadIdx.x;
+    if (index < n)
+    {
+        volatile uint2 sortedData = particleHash[index];
+        //float4 pos = getPos4(oldPos,index0,index);
+        float4 pos = getPos4(oldPos,sortedData.y);
+        sortedPos[index] = pos;
+    }
+}
+
+
+//////////////////////
+// CPU-side methods //
+//////////////////////
+
+void SpatialGridContainer3f_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x)
+{
+    GridParams p;
+    p.cellWidth = cellWidth;
+    p.invCellWidth = 1.0f/cellWidth;
+    p.cellMask = (1<<cellBits)-1;
+    p.halfCellWidth = cellWidth*0.5f;
+    p.invHalfCellWidth = 2.0f/cellWidth;
+    cudaMemcpyToSymbol(gridParams, &p, sizeof(GridParams));
+
+    // First compute hash of each particle
+    {
+        dim3 threads(BSIZE,1);
+        dim3 grid((nbPoints+BSIZE-1)/BSIZE,1);
+        {computeHashD<float3><<< grid, threads >>>((const float3*)x, (unsigned int*)particleIndex8, (unsigned int*)particleHash8, nbPoints); mycudaDebugError("computeHashD<float3>");}
+    }
+}
+
+void SpatialGridContainer3f1_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x)
+{
+    GridParams p;
+    p.cellWidth = cellWidth;
+    p.invCellWidth = 1.0f/cellWidth;
+    p.cellMask = (1<<cellBits)-1;
+    p.halfCellWidth = cellWidth*0.5f;
+    p.invHalfCellWidth = 2.0f/cellWidth;
+    cudaMemcpyToSymbol(gridParams, &p, sizeof(GridParams));
+
+    // First compute hash of each particle
+    {
+        dim3 threads(BSIZE,1);
+        dim3 grid((nbPoints+BSIZE-1)/BSIZE,1);
+        {computeHashD<float4><<< grid, threads >>>((const float4*)x, (unsigned int*)particleIndex8, (unsigned int*)particleHash8, nbPoints); mycudaDebugError("computeHashD<float4>");}
+    }
+}
+
+void SpatialGridContainer_findCellRange(int cellBits, int index0, float cellWidth, int nbPoints, const void* particleHash8, void* cellRange, void* cellGhost)
+{
+    cudaMemset(cellRange, 0, ((1<<cellBits)+1)*sizeof(int));
+
+    // Then find the start of each cell
+    {
+        dim3 threads(BSIZE,1);
+        dim3 grid((8*nbPoints+BSIZE-1)/BSIZE,1);
+        {findCellRangeD<<< grid, threads >>>(index0, (const unsigned int*)particleHash8, (int*)cellRange, (int*)cellGhost, 8*nbPoints); mycudaDebugError("findCellRangeD");}
+    }
+}
+/*
+void SpatialGridContainer3f_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((nbPoints+BSIZE-1)/BSIZE,1);
+    {reorderDataD<float3><<< grid, threads >>>((const uint2*)particleHash, (const float3*)x, (float4*)sorted, nbPoints); mycudaDebugError("reorderDataD<float3>");}
+}
+
+void SpatialGridContainer3f1_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x)
+{
+    dim3 threads(BSIZE,1);
+    dim3 grid((nbPoints+BSIZE-1)/BSIZE,1);
+    {reorderDataD<float4><<< grid, threads >>>((const uint2*)particleHash, (const float4*)x, (float4*)sorted, nbPoints); mycudaDebugError("reorderDataD<float4>");}
+}
+*/
+#if defined(__cplusplus) && CUDA_VERSION < 2000
+} // namespace cuda
+} // namespace gpu
+} // namespace sofa
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.h b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.h
new file mode 100644
index 0000000..da55f21
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.h
@@ -0,0 +1,141 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+//
+// C++ Interface: SpatialGridContainer
+//
+// Description:
+//
+//
+// Author: The SOFA team <http://www.sofa-framework.org>, (C) 2006
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef SOFA_GPU_CUDA_CUDASPATIALGRIDCONTAINER_H
+#define SOFA_GPU_CUDA_CUDASPATIALGRIDCONTAINER_H
+
+#include <SofaSphFluid/CUDA/config.h>
+#include <SofaSphFluid/SpatialGridContainer.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaTypes.h>
+#include <sofa/type/Vec.h>
+
+
+namespace sofa::component::container
+{
+
+using namespace sofa::defaulttype;
+
+template<class TCoord, class TDeriv, class TReal>
+class SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >
+{
+public:
+    typedef SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > DataTypes;
+    typedef typename DataTypes::Real Real;
+    typedef typename DataTypes::Coord Coord;
+    typedef typename DataTypes::VecCoord VecCoord;
+    typedef typename DataTypes::CellData CellData;
+    typedef typename DataTypes::GridData GridData;
+    //typedef typename DataTypes::NeighborListener NeighborListener;
+    typedef typename DataTypes::ParticleField ParticleField;
+
+    enum
+    {
+        HASH_PX = 73856093,
+        HASH_PY = 19349663,
+        HASH_PZ = 83492791,
+    };
+
+public:
+    SpatialGrid(Real cellWidth);
+
+    ~SpatialGrid();
+
+    void update(const VecCoord& x);
+
+    void draw(const core::visual::VisualParams*);
+
+    template<class NeighborListener>
+    void findNeighbors(NeighborListener* dest, Real dist);
+
+    void computeField(ParticleField* field, Real dist);
+
+    /// Change particles ordering inside a given cell have contiguous indices
+    ///
+    /// Fill the old2new and new2old arrays giving the permutation to apply
+    void reorderIndices(type::vector<Index>* old2new, type::vector<Index>* new2old);
+    GridData data;
+
+    Real getCellWidth() const { return cellWidth; }
+    Real getInvCellWidth() const { return invCellWidth; }
+
+    int getCellBits() const { return cellBits; }
+    int getNbCells() const { return nbCells; }
+
+    int getCell(const Coord& c) const
+    {
+        return ( (helper::rfloor(c[0]*invCellWidth*0.5f) * HASH_PX) ^
+                (helper::rfloor(c[1]*invCellWidth*0.5f) * HASH_PY) ^
+                (helper::rfloor(c[2]*invCellWidth*0.5f) * HASH_PZ) ) & ((1 << cellBits)-1);
+    }
+
+    //const sofa::gpu::cuda::CudaVector< unsigned int >& getParticleIndexVector() const { return particleIndex; }
+    const sofa::gpu::cuda::CudaVector< int >& getCellsVector() const { return cells; }
+    const sofa::gpu::cuda::CudaVector< int >& getCellGhostVector() const { return cellGhost; }
+
+protected:
+    const Real cellWidth;
+    const Real invCellWidth;
+    int cellBits, nbCells;
+    sofa::gpu::cuda::CudaVector< unsigned int > /*particleIndex,*/ particleHash;
+    //sofa::gpu::cuda::CudaVector< int > cellRange;
+    sofa::gpu::cuda::CudaVector< int > cells;
+    sofa::gpu::cuda::CudaVector< int > cellGhost;
+    sofa::gpu::cuda::CudaVector< sofa::gpu::cuda::Vec3f1 > sortedPos;
+    const VecCoord* lastX;
+
+    void kernel_computeHash(
+        int cellBits, Real cellWidth, int nbPoints, void* particleIndex, void* particleHash, const void* x);
+
+    void kernel_updateGrid(
+        int cellBits, int index0, Real cellWidth, int nbPoints, const void* particleHash,
+        void* cells, void* cellGhost);
+    //void kernel_reorderData(int nbPoints, const void* particleIndex, const void* particleHash, void* sorted, const void* x);
+
+};
+
+#if !defined(SOFASPHFLUID_CUDA_CUDASPATIALGRIDCONTAINER_CPP)
+extern template class SOFA_SOFASPHFLUID_CUDA_API SpatialGridContainer<sofa::gpu::cuda::CudaVec3fTypes >;
+extern template class SOFA_SOFASPHFLUID_CUDA_API SpatialGrid<sofa::component::container::SpatialGridTypes<sofa::gpu::cuda::CudaVec3fTypes > >;
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+extern template class SOFA_SOFASPHFLUID_CUDA_API SpatialGridContainer< sofa::gpu::cuda::CudaVec3dTypes >;
+extern template class SOFA_SOFASPHFLUID_CUDA_API SpatialGrid< sofa::component::container::SpatialGridTypes< sofa::gpu::cuda::CudaVec3dTypes > >;
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+#endif
+
+} // namespace sofa::component::container
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.inl b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.inl
new file mode 100644
index 0000000..e75986a
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/CudaSpatialGridContainer.inl
@@ -0,0 +1,336 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+//
+// C++ Interface: SpatialGridContainer
+//
+// Description:
+//
+//
+// Author: The SOFA team <http://www.sofa-framework.org>, (C) 2006
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef SOFA_GPU_CUDA_CUDASPATIALGRIDCONTAINER_INL
+#define SOFA_GPU_CUDA_CUDASPATIALGRIDCONTAINER_INL
+
+#include <SofaSphFluid/CUDA/CudaSpatialGridContainer.h>
+#include <SofaCUDA/sofa/gpu/cuda/CudaSort.h>
+#include <SofaSphFluid/SpatialGridContainer.inl>
+#include <sofa/gl/template.h>
+
+namespace sofa
+{
+
+
+namespace gpu::cuda
+{
+
+extern "C"
+{
+    void SpatialGridContainer3f_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x);
+    void SpatialGridContainer3f1_computeHash(int cellBits, float cellWidth, int nbPoints, void* particleIndex8, void* particleHash8, const void* x);
+    void SpatialGridContainer_findCellRange(int cellBits, int index0, float cellWidth, int nbPoints, const void* particleHash8, void* cellRange, void* cellGhost);
+//void SpatialGridContainer3f_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x);
+//void SpatialGridContainer3f1_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x);
+}
+
+} // namespace gpu::cuda
+
+
+namespace component::container
+{
+
+using namespace sofa::helper;
+
+//      template<class TCoord, class TDeriv, class TReal>
+//      typename SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::Grid SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::emptyGrid;
+
+template<class TCoord, class TDeriv, class TReal>
+SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::SpatialGrid(Real cellWidth)
+    : cellWidth(cellWidth), invCellWidth(1/cellWidth), lastX(NULL)
+{
+    cellBits = 15;
+    nbCells = 1<<cellBits;
+}
+
+template<class TCoord, class TDeriv, class TReal>
+SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::~SpatialGrid()
+{
+}
+
+template<class TCoord, class TDeriv, class TReal> template<class NeighborListener>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::findNeighbors(NeighborListener* /*dest*/, Real /*dist*/)
+{
+    msg_error("SpatialGrid") << "TODO: SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::findNeighbors(NeighborListener* dest, Real dist)";
+}
+
+template<class TCoord, class TDeriv, class TReal>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::computeField(ParticleField* /*field*/, Real /*dist*/)
+{
+    msg_error("SpatialGrid") << "TODO: SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::computeField(ParticleField* field, Real dist)";
+}
+
+template<class TCoord, class TDeriv, class TReal>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::reorderIndices(type::vector<Index>* /*old2new*/, type::vector<Index>* /*new2old*/)
+{
+    msg_error("SpatialGrid") << "TODO: SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::reorderIndices(type::vector<Index>* old2new, type::vector<Index>* new2old)";
+}
+
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3fTypes > >::kernel_computeHash(
+    int cellBits, Real cellWidth, int nbPoints, void* particleIndex, void* particleHash,
+    const void* x)
+{
+    /*
+        {
+            helper::ReadAccessor< sofa::gpu::cuda::CudaVector< unsigned int > > pparticleHash = this->particleHash;
+            for (int i=0;i<nbPoints; i+= nbPoints/9+1)
+            {
+                std::cout << "Chash["<<i<<"] =" ;
+                for (int x=0;x<8;++x) std::cout << "\t" << pparticleHash[i*8+x];
+                std::cout<<std::endl;
+            }
+        }
+    */
+    gpu::cuda::SpatialGridContainer3f_computeHash(cellBits, cellWidth, nbPoints, particleIndex, particleHash, x);
+    /*
+        this->particleHash.deviceWrite();
+        {
+            helper::ReadAccessor< sofa::gpu::cuda::CudaVector< unsigned int > > pparticleHash = this->particleHash;
+            for (int i=0;i<nbPoints; i+= nbPoints/9+1)
+            {
+                std::cout << "Ghash["<<i<<"] =" ;
+                for (int x=0;x<8;++x) std::cout << "\t" << pparticleHash[i*8+x];
+                std::cout<<std::endl;
+            }
+        }
+    */
+}
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3fTypes > >::kernel_updateGrid(
+    int cellBits, int index0, Real cellWidth, int nbPoints, const void* particleHash,
+    void* cells, void* cellGhost)
+{
+    //int nbbits = 8;
+    //while (nbbits < cellBits + 1) nbbits+=8;
+    //CudaSort(particleHash,particleIndex,nbbits,nbPoints*8);
+    gpu::cuda::SpatialGridContainer_findCellRange(cellBits, index0, cellWidth, nbPoints, particleHash, cells, cellGhost);
+}
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3f1Types > >::kernel_computeHash(
+    int cellBits, Real cellWidth, int nbPoints, void* particleIndex, void* particleHash,
+    const void* x)
+{
+    gpu::cuda::SpatialGridContainer3f1_computeHash(cellBits, cellWidth, nbPoints, particleIndex, particleHash, x);
+}
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3f1Types > >::kernel_updateGrid(
+    int cellBits, int index0, Real cellWidth, int nbPoints, const void* particleHash,
+    void* cells, void* cellGhost)
+{
+    gpu::cuda::SpatialGridContainer_findCellRange(cellBits, index0, cellWidth, nbPoints, particleHash, cells, cellGhost);
+}
+
+#ifdef SOFA_GPU_CUDA_DOUBLE
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3dTypes > >::kernel_computeHash(
+    int /*cellBits*/, Real /*cellWidth*/, int /*nbPoints*/, void* /*particleIndex*/, void* /*particleHash*/,
+    const void* /*x*/)
+{
+    msg_error("SpatialGrid") << "TODO: SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3dTypes > >::kernel_computeHash()";
+}
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3dTypes > >::kernel_updateGrid(
+    int /*cellBits*/, int /*index0*/, Real /*cellWidth*/, int /*nbPoints*/, const void* /*particleHash*/,
+    void* /*cells*/, void* /*cellGhost*/)
+{
+    msg_error("SpatialGrid") << "TODO: SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3dTypes > >::kernel_updateGrid()";
+}
+
+#endif // SOFA_GPU_CUDA_DOUBLE
+
+/*
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3fTypes > >::kernel_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x)
+{
+    gpu::cuda::SpatialGridContainer3f_reorderData(nbPoints, particleHash, sorted, x);
+}
+
+template<>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVec3f1Types > >::kernel_reorderData(int nbPoints, const void* particleHash, void* sorted, const void* x)
+{
+    gpu::cuda::SpatialGridContainer3f1_reorderData(nbPoints, particleHash, sorted, x);
+}
+*/
+
+template<class TCoord, class TDeriv, class TReal>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::update(const VecCoord& x)
+{
+    lastX = &x;
+    data.clear();
+    int nbPoints = x.size();
+    int index0 = nbCells+BSIZE;
+    /*particleIndex*/ cells.recreate(index0+nbPoints*8,8*BSIZE);
+    particleHash.recreate(nbPoints*8,8*BSIZE);
+
+    {
+        unsigned int numElements = (unsigned int)nbPoints*8;
+        sofa::gpu::cuda::CudaSortPrepare(numElements);
+    }
+
+    //cells.recreate(nbCells+1);
+    cellGhost.recreate(nbCells);
+    //sortedPos.recreate(nbPoints);
+#if 0
+    {
+        helper::WriteAccessor< sofa::gpu::cuda::CudaVector< int > > pcells = cells;
+        helper::WriteAccessor< sofa::gpu::cuda::CudaVector< unsigned int > > pparticleHash = particleHash;
+        helper::ReadAccessor< VecCoord > px = x;
+        const Real pcellWidth = cellWidth*2.0f;
+        //const Real pinvCellWidth = 1.0f/pcellWidth;
+        const int pcellMask = (1<<cellBits)-1;
+        //const Real phalfCellWidth = pcellWidth*0.5f;
+        const Real pinvHalfCellWidth = 2.0f/pcellWidth;
+        for (int i=0; i<nbPoints; ++i)
+        {
+            Coord p = px[i];
+            int hgpos_x,hgpos_y,hgpos_z;
+            hgpos_x = helper::rfloor(p[0] * pinvHalfCellWidth);
+            hgpos_y = helper::rfloor(p[1] * pinvHalfCellWidth);
+            hgpos_z = helper::rfloor(p[2] * pinvHalfCellWidth);
+            int halfcell = ((hgpos_x&1) + ((hgpos_y&1)<<1) + ((hgpos_z&1)<<2))^7;
+            // compute the first cell to be influenced by the particle
+            hgpos_x = (hgpos_x-1) >> 1;
+            hgpos_y = (hgpos_y-1) >> 1;
+            hgpos_z = (hgpos_z-1) >> 1;
+            unsigned int hx = (HASH_PX*hgpos_x);
+            unsigned int hy = (HASH_PY*hgpos_y);
+            unsigned int hz = (HASH_PZ*hgpos_z);
+            for (int x=0; x<8; ++x)
+            {
+                unsigned int h_x = hx; if (x&1) h_x += HASH_PX;
+                unsigned int h_y = hy; if (x&2) h_y += HASH_PY;
+                unsigned int h_z = hz; if (x&4) h_z += HASH_PZ;
+                unsigned int hash = ((h_x ^ h_y ^ h_z) & pcellMask)<<1;
+                if (halfcell != x) ++hash;
+                pcells[index0 + i*8 + x] = i;
+                pparticleHash[i*8 + x] = hash;
+            }
+        }
+    }
+#endif
+#if 0
+    type::vector< std::pair<unsigned int,int> > cpusort;
+    cpusort.resize(8*nbPoints);
+    for (int i=0; i<8*nbPoints; ++i)
+        cpusort[i] = std::make_pair(pparticleHash[i],pcells[index0+i]);
+    std::sort(cpusort.begin(),cpusort.end(),compare_pair_first);
+
+    for (int i=0; i<8*nbPoints; ++i)
+    {
+        pparticleHash[i] = cpusort[i].first;
+        pcells[index0+i] = cpusort[i].second;
+    }
+#endif
+
+    kernel_computeHash(
+        cellBits, cellWidth*2, nbPoints, cells.deviceWriteAt(index0), particleHash.deviceWrite(),
+        x.deviceRead());
+
+    {
+        int nbbits = 8;
+        while (nbbits < cellBits + 1) nbbits+=8;
+        sofa::gpu::cuda::CudaSort(&particleHash,0, &cells,index0, nbPoints*8, nbbits);
+    }
+
+    kernel_updateGrid(
+        cellBits, index0, cellWidth*2, nbPoints, particleHash.deviceRead(),
+        cells.deviceWrite(), cellGhost.deviceWrite());
+#if 0
+    std::cout << nbPoints*8 << " entries in " << nbCells << " cells." << std::endl;
+    int nfill = 0;
+    for (int c=0; c<nbCells; ++c)
+    {
+        int cellBegin = cells[c];
+        int cellEnd = cells[c+1]&~(1U<<31);
+        if (cells[c] <= 0) continue;
+        if (cellEnd > cellBegin + nbPoints/2) // || nfill >= 100 && nfill < 110)
+            std::cout << "Cell " << c << ": range = " << cellBegin-index0 << " - " << cellEnd-index0 << "     ghost = " << cellGhost[c]-index0 << std::endl;
+        ++nfill;
+    }
+    std::cout << ((1000*nfill)/nbCells) * 0.1 << " % cells with particles." << std::endl;
+#endif
+
+    //kernel_reorderData(nbPoints, particleHash.deviceRead(), sortedPos.deviceWrite(), x.deviceRead());
+}
+
+template<class TCoord, class TDeriv, class TReal>
+void SpatialGrid< SpatialGridTypes < gpu::cuda::CudaVectorTypes<TCoord,TDeriv,TReal> > >::draw(const core::visual::VisualParams* )
+{
+#if SOFACUDA_HAVE_SOFA_GL == 1
+    if (!lastX) return;
+    int nbPoints = particleHash.size();
+    int index0 = nbCells+BSIZE;
+    glDisable(GL_LIGHTING);
+    glColor4f(1,1,1,1);
+    glPointSize(3);
+    glBegin(GL_POINTS);
+    unsigned int last = 0;
+    for (int i=0; i<nbPoints; i++)
+    {
+        unsigned int cell = particleHash[i];
+        unsigned int p = cells[index0+i]; //particleIndex[i];
+        if (cell < last)
+        {
+            msg_error("SpatialGrid") << "SORT ERROR: index " << i << " key " << cell << " value " << p << " last key " << last;
+        }
+        last = cell;
+        if (!(cell&1)) continue; // this is a ghost particle from a neighbor cell
+        cell>>=1;
+        //if (cell != 0 && cell != 65535)
+        //    std::cout << i << ": "<<p<<" -> "<<cell<<", "<<(*lastX)[p]<<" -> "<<sortedPos[i]<<std::endl;
+        int r = cell&3;
+        int g = (cell>>2)&3;
+        int b = (cell>>4)&3;
+        glColor4ub(63+r*64,63+g*64,63+b*64,255);
+        //glVertex3fv(sortedPos[i].ptr());
+        sofa::gl::glVertexT((*lastX)[p]);
+    }
+    glEnd();
+    glPointSize(1);
+#endif // SOFACUDA_HAVE_SOFA_GL == 1
+}
+
+} // namespace component::container
+
+
+} // namespace sofa
+
+#endif
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/config.h.in b/extensions/CUDA/src/SofaSphFluid/CUDA/config.h.in
new file mode 100644
index 0000000..3c937ec
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/config.h.in
@@ -0,0 +1,37 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#pragma once
+
+#include <sofa/config.h>
+
+#ifdef SOFA_BUILD_SOFASPHFLUID_CUDA
+#  define SOFA_TARGET @PROJECT_NAME@
+#  define SOFA_SOFASPHFLUID_CUDA_API SOFA_EXPORT_DYNAMIC_LIBRARY
+#else
+#  define SOFA_SOFASPHFLUID_CUDA_API SOFA_IMPORT_DYNAMIC_LIBRARY
+#endif
+
+namespace sofasphfluid::cuda
+{
+    constexpr const char* MODULE_NAME = "@PROJECT_NAME@";
+    constexpr const char* MODULE_VERSION = "@PROJECT_VERSION@";
+} // namespace sofasphfluid::cuda
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/init.cpp b/extensions/CUDA/src/SofaSphFluid/CUDA/init.cpp
new file mode 100644
index 0000000..93bd1e6
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/init.cpp
@@ -0,0 +1,61 @@
+/******************************************************************************
+*                              SofaSphFluid plugin                             *
+*                  (c) 2006 Inria, University of Lille, CNRS                  *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: see Authors.md                                                     *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#include <SofaCUDA/init.h>
+#include <SofaSphFluid/CUDA/init.h>
+#include <SofaSphFluid/initSPHFluid.h>
+
+namespace sofasphfluid::cuda
+{
+
+extern "C" {
+    SOFA_EXPORT_DYNAMIC_LIBRARY void initExternalModule();
+    SOFA_EXPORT_DYNAMIC_LIBRARY const char* getModuleName();
+    SOFA_EXPORT_DYNAMIC_LIBRARY const char* getModuleVersion();
+}
+
+void initExternalModule()
+{
+    init();
+}
+
+const char* getModuleName()
+{
+    return MODULE_NAME;
+}
+
+const char* getModuleVersion()
+{
+    return MODULE_VERSION;
+}
+
+void init()
+{
+    static bool first = true;
+    if (first)
+    {
+        sofasphfluid::init();
+        sofa::gpu::cuda::init();
+        first = false;
+    }
+}
+
+} // namespace sofasphfluid::cuda
diff --git a/extensions/CUDA/src/SofaSphFluid/CUDA/init.h b/extensions/CUDA/src/SofaSphFluid/CUDA/init.h
new file mode 100644
index 0000000..3267619
--- /dev/null
+++ b/extensions/CUDA/src/SofaSphFluid/CUDA/init.h
@@ -0,0 +1,29 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#pragma once
+
+#include <SofaSphFluid/CUDA/config.h>
+
+namespace sofasphfluid::cuda
+{
+SOFA_SOFASPHFLUID_CUDA_API void init();
+} // namespace sofasphdluid::cuda
diff --git a/src/SofaSphFluid/initSPHFluid.cpp b/src/SofaSphFluid/initSPHFluid.cpp
index 0c146b3..de55649 100644
--- a/src/SofaSphFluid/initSPHFluid.cpp
+++ b/src/SofaSphFluid/initSPHFluid.cpp
@@ -20,6 +20,7 @@
 * Contact information: contact@sofa-framework.org                             *
 ******************************************************************************/
 #include <SofaSphFluid/config.h>
+#include <SofaSphFluid/initSPHFluid.h>
 
 #include <sofa/core/ObjectFactory.h>
 #include <sofa/helper/system/PluginManager.h>
@@ -65,7 +66,7 @@ SOFA_SPH_FLUID_API const char* getModuleDescription();
 SOFA_SPH_FLUID_API void registerObjects(sofa::core::ObjectFactory* factory);
 }
 
-void initExternalModule()
+void init()
 {
     static bool first = true;
     if (first)
@@ -77,6 +78,11 @@ void initExternalModule()
     }
 }
 
+void initExternalModule()
+{
+    init();
+}
+
 const char* getModuleName()
 {
     return MODULE_NAME;
diff --git a/src/SofaSphFluid/initSPHFluid.h b/src/SofaSphFluid/initSPHFluid.h
new file mode 100644
index 0000000..a0a7b4a
--- /dev/null
+++ b/src/SofaSphFluid/initSPHFluid.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+*                 SOFA, Simulation Open-Framework Architecture                *
+*                    (c) 2006 INRIA, USTL, UJF, CNRS, MGH                     *
+*                                                                             *
+* This program is free software; you can redistribute it and/or modify it     *
+* under the terms of the GNU Lesser General Public License as published by    *
+* the Free Software Foundation; either version 2.1 of the License, or (at     *
+* your option) any later version.                                             *
+*                                                                             *
+* This program is distributed in the hope that it will be useful, but WITHOUT *
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
+* for more details.                                                           *
+*                                                                             *
+* You should have received a copy of the GNU Lesser General Public License    *
+* along with this program. If not, see <http://www.gnu.org/licenses/>.        *
+*******************************************************************************
+* Authors: The SOFA Team and external contributors (see Authors.txt)          *
+*                                                                             *
+* Contact information: contact@sofa-framework.org                             *
+******************************************************************************/
+#pragma once
+#include <SofaSphFluid/config.h>
+
+namespace sofasphfluid
+{
+
+SOFA_SPH_FLUID_API void init();
+
+}