diff --git a/hoomd/mpcd/CMakeLists.txt b/hoomd/mpcd/CMakeLists.txt
index 7ae9b10b8d..7761c99879 100644
--- a/hoomd/mpcd/CMakeLists.txt
+++ b/hoomd/mpcd/CMakeLists.txt
@@ -6,7 +6,6 @@ endif()
 set(_mpcd_cc_sources
     module.cc
     ATCollisionMethod.cc
-    CellCommunicator.cc
     CellThermoCompute.cc
     CellList.cc
     CollisionMethod.cc
@@ -27,7 +26,6 @@ set(_mpcd_headers
     BounceBackNVE.h
     BulkGeometry.h
     BulkStreamingMethod.h
-    CellCommunicator.h
     CellThermoCompute.h
     CellList.h
     CollisionMethod.h
@@ -88,7 +86,6 @@ if(ENABLE_HIP)
         BounceBackStreamingMethodGPU.cuh
         BounceBackStreamingMethodGPU.h
         BulkStreamingMethodGPU.h
-        CellCommunicator.cuh
         CellThermoComputeGPU.cuh
         CellThermoComputeGPU.h
         CellListGPU.cuh
diff --git a/hoomd/mpcd/CellCommunicator.cc b/hoomd/mpcd/CellCommunicator.cc
deleted file mode 100644
index ee52380b1c..0000000000
--- a/hoomd/mpcd/CellCommunicator.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-// Copyright (c) 2009-2025 The Regents of the University of Michigan.
-// Part of HOOMD-blue, released under the BSD 3-Clause License.
-
-/*!
- * \file mpcd/CellCommunicator.cc
- * \brief Definition of mpcd::CellCommunicator
- */
-
-#ifdef ENABLE_MPI
-
-#include "CellCommunicator.h"
-
-namespace hoomd
-    {
-// initialize with zero instances of the communicator
-unsigned int mpcd::CellCommunicator::num_instances = 0;
-
-/*!
- * \param sysdef System definition
- * \param cl MPCD cell list
- */
-mpcd::CellCommunicator::CellCommunicator(std::shared_ptr<SystemDefinition> sysdef,
-                                         std::shared_ptr<mpcd::CellList> cl)
-    : m_id(num_instances++), m_sysdef(sysdef), m_pdata(sysdef->getParticleData()),
-      m_exec_conf(m_pdata->getExecConf()), m_mpi_comm(m_exec_conf->getMPICommunicator()),
-      m_decomposition(m_pdata->getDomainDecomposition()), m_cl(cl), m_communicating(false),
-      m_send_buf(m_exec_conf), m_recv_buf(m_exec_conf), m_needs_init(true)
-    {
-    m_exec_conf->msg->notice(5) << "Constructing MPCD CellCommunicator" << std::endl;
-#ifdef ENABLE_HIP
-    if (m_exec_conf->isCUDAEnabled())
-        {
-        m_tuner_pack.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
-                                            m_exec_conf,
-                                            "mpcd_cell_comm_pack_" + std::to_string(m_id)));
-        m_tuner_unpack.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
-                                              m_exec_conf,
-                                              "mpcd_cell_comm_unpack_" + std::to_string(m_id)));
-        m_autotuners.insert(m_autotuners.end(), {m_tuner_pack, m_tuner_unpack});
-        }
-#endif // ENABLE_HIP
-
-    m_cl->getSizeChangeSignal().connect<mpcd::CellCommunicator, &mpcd::CellCommunicator::slotInit>(
-        this);
-    }
-
-mpcd::CellCommunicator::~CellCommunicator()
-    {
-    m_exec_conf->msg->notice(5) << "Destroying MPCD CellCommunicator" << std::endl;
-    m_cl->getSizeChangeSignal()
-        .disconnect<mpcd::CellCommunicator, &mpcd::CellCommunicator::slotInit>(this);
-    }
-
-namespace mpcd
-    {
-namespace detail
-    {
-//! Unary operator to wrap global cell indexes into the local domain
-struct LocalCellWrapOp
-    {
-    LocalCellWrapOp(std::shared_ptr<mpcd::CellList> cl_)
-        : cl(cl_), ci(cl_->getCellIndexer()), gci(cl_->getGlobalCellIndexer())
-        {
-        }
-
-    //! Transform the global 1D cell index into a local 1D cell index
-    inline unsigned int operator()(unsigned int cell_idx)
-        {
-        // convert the 1D global cell index to a global cell tuple
-        const uint3 cell = gci.getTriple(cell_idx);
-
-        // convert the global cell tuple to a local cell tuple
-        int3 local_cell = cl->getLocalCell(make_int3(cell.x, cell.y, cell.z));
-
-        // wrap the local cell through the global boundaries, which should work for all reasonable
-        // cell comms.
-        if (local_cell.x >= (int)gci.getW())
-            local_cell.x -= gci.getW();
-        else if (local_cell.x < 0)
-            local_cell.x += gci.getW();
-
-        if (local_cell.y >= (int)gci.getH())
-            local_cell.y -= gci.getH();
-        else if (local_cell.y < 0)
-            local_cell.y += gci.getH();
-
-        if (local_cell.z >= (int)gci.getD())
-            local_cell.z -= gci.getD();
-        else if (local_cell.z < 0)
-            local_cell.z += gci.getD();
-
-        // convert the local cell tuple back to an index
-        return ci(local_cell.x, local_cell.y, local_cell.z);
-        }
-
-    std::shared_ptr<mpcd::CellList> cl; //!< Cell list
-    const Index3D ci;                   //!< Cell indexer
-    const Index3D gci;                  //!< Global cell indexer
-    };
-    } // end namespace detail
-    } // end namespace mpcd
-
-void mpcd::CellCommunicator::initialize()
-    {
-    // obtain domain decomposition
-    const Index3D& di = m_decomposition->getDomainIndexer();
-    ArrayHandle<unsigned int> h_cart_ranks(m_decomposition->getCartRanks(),
-                                           access_location::host,
-                                           access_mode::read);
-    const uint3 my_pos = m_decomposition->getGridPos();
-
-    // use the cell list to compute the bounds
-    const Index3D& ci = m_cl->getCellIndexer();
-    const Index3D& global_ci = m_cl->getGlobalCellIndexer();
-    auto num_comm_cells = m_cl->getNComm();
-    const uint3 max_lo
-        = make_uint3(num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::west)],
-                     num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::south)],
-                     num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::down)]);
-    const uint3 min_hi = make_uint3(
-        ci.getW() - num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::east)],
-        ci.getH() - num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::north)],
-        ci.getD() - num_comm_cells[static_cast<unsigned int>(mpcd::detail::face::up)]);
-
-        // check to make sure box is not overdecomposed
-        {
-        const unsigned int nextra = m_cl->getNExtraCells();
-        unsigned int err = ((max_lo.x + nextra) > min_hi.x || (max_lo.y + nextra) > min_hi.y
-                            || (max_lo.z + nextra) > min_hi.z);
-        MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_UNSIGNED, MPI_MAX, m_mpi_comm);
-        if (err)
-            {
-            m_exec_conf->msg->error()
-                << "mpcd: Simulation box is overdecomposed, decrease the number of ranks."
-                << std::endl;
-            throw std::runtime_error("Simulation box is overdecomposed for MPCD");
-            }
-        }
-
-    // loop over all cells in the grid and determine where to send them
-    std::multimap<unsigned int, unsigned int> send_map;
-    std::set<unsigned int> neighbors;
-    for (unsigned int k = 0; k < ci.getD(); ++k)
-        {
-        for (unsigned int j = 0; j < ci.getH(); ++j)
-            {
-            for (unsigned int i = 0; i < ci.getW(); ++i)
-                {
-                // skip any cells interior to the grid, which will not be communicated
-                // this is wasteful loop logic, but initialize will only be called rarely
-                if (i >= max_lo.x && i < min_hi.x && j >= max_lo.y && j < min_hi.y && k >= max_lo.z
-                    && k < min_hi.z)
-                    continue;
-
-                // obtain the 1D global index of this cell
-                const int3 global_cell = m_cl->getGlobalCell(make_int3(i, j, k));
-                const unsigned int global_cell_idx
-                    = global_ci(global_cell.x, global_cell.y, global_cell.z);
-
-                // check which direction the cell lies off rank in x,y,z
-                std::vector<int> dx = {0};
-                if (i < max_lo.x)
-                    dx.push_back(-1);
-                else if (i >= min_hi.x)
-                    dx.push_back(1);
-
-                std::vector<int> dy = {0};
-                if (j < max_lo.y)
-                    dy.push_back(-1);
-                else if (j >= min_hi.y)
-                    dy.push_back(1);
-
-                std::vector<int> dz = {0};
-                if (k < max_lo.z)
-                    dz.push_back(-1);
-                else if (k >= min_hi.z)
-                    dz.push_back(1);
-
-                // generate all permutations of these neighbors for the cell
-                for (auto ddx = dx.begin(); ddx != dx.end(); ++ddx)
-                    {
-                    for (auto ddy = dy.begin(); ddy != dy.end(); ++ddy)
-                        {
-                        for (auto ddz = dz.begin(); ddz != dz.end(); ++ddz)
-                            {
-                            // skip self
-                            if (*ddx == 0 && *ddy == 0 && *ddz == 0)
-                                continue;
-
-                            // get neighbor rank tuple
-                            int3 neigh = make_int3((int)my_pos.x + *ddx,
-                                                   (int)my_pos.y + *ddy,
-                                                   (int)my_pos.z + *ddz);
-
-                            // wrap neighbor through the boundaries
-                            if (neigh.x < 0)
-                                neigh.x += di.getW();
-                            else if (neigh.x >= (int)di.getW())
-                                neigh.x -= di.getW();
-
-                            if (neigh.y < 0)
-                                neigh.y += di.getH();
-                            else if (neigh.y >= (int)di.getH())
-                                neigh.y -= di.getH();
-
-                            if (neigh.z < 0)
-                                neigh.z += di.getD();
-                            else if (neigh.z >= (int)di.getD())
-                                neigh.z -= di.getD();
-
-                            // convert neighbor to a linear rank and push it into the unique
-                            // neighbor set
-                            const unsigned int neigh_rank
-                                = h_cart_ranks.data[di(neigh.x, neigh.y, neigh.z)];
-                            neighbors.insert(neigh_rank);
-                            send_map.insert(std::make_pair(neigh_rank, global_cell_idx));
-                            } // ddz
-                        } // ddy
-                    } // ddx
-                } // i
-            } // j
-        } // k
-
-        // allocate send / receive index arrays
-        {
-        GPUArray<unsigned int> send_idx(send_map.size(), m_exec_conf);
-        m_send_idx.swap(send_idx);
-        }
-
-        // fill the send indexes with the global values
-        // flood the array of unique neighbors and count the number to send
-        {
-        ArrayHandle<unsigned int> h_send_idx(m_send_idx,
-                                             access_location::host,
-                                             access_mode::overwrite);
-        unsigned int idx = 0;
-        for (auto it = send_map.begin(); it != send_map.end(); ++it)
-            {
-            h_send_idx.data[idx++] = it->second;
-            }
-
-        m_neighbors.resize(neighbors.size());
-        m_begin.resize(m_neighbors.size());
-        m_num_send.resize(m_neighbors.size());
-        idx = 0;
-        for (auto it = neighbors.begin(); it != neighbors.end(); ++it)
-            {
-            auto lower = send_map.lower_bound(*it);
-            auto upper = send_map.upper_bound(*it);
-
-            m_neighbors[idx] = *it;
-            m_begin[idx] = (unsigned int)std::distance(send_map.begin(), lower);
-            m_num_send[idx] = (unsigned int)std::distance(lower, upper);
-            ++idx;
-            }
-        }
-
-    // send / receive the global cell indexes to be communicated with neighbors
-    std::vector<unsigned int> recv_idx(m_send_idx.getNumElements());
-        {
-        ArrayHandle<unsigned int> h_send_idx(m_send_idx, access_location::host, access_mode::read);
-
-        m_reqs.resize(2 * m_neighbors.size());
-        for (unsigned int idx = 0; idx < m_neighbors.size(); ++idx)
-            {
-            const unsigned int offset = m_begin[idx];
-            MPI_Isend(h_send_idx.data + offset,
-                      m_num_send[idx],
-                      MPI_INT,
-                      m_neighbors[idx],
-                      0,
-                      m_mpi_comm,
-                      &m_reqs[2 * idx]);
-            MPI_Irecv(recv_idx.data() + offset,
-                      m_num_send[idx],
-                      MPI_INT,
-                      m_neighbors[idx],
-                      0,
-                      m_mpi_comm,
-                      &m_reqs[2 * idx + 1]);
-            }
-        MPI_Waitall((unsigned int)m_reqs.size(), m_reqs.data(), MPI_STATUSES_IGNORE);
-        }
-
-        // transform all of the global cell indexes back into local cell indexes
-        {
-        ArrayHandle<unsigned int> h_send_idx(m_send_idx,
-                                             access_location::host,
-                                             access_mode::readwrite);
-
-        mpcd::detail::LocalCellWrapOp wrapper(m_cl);
-        std::transform(h_send_idx.data,
-                       h_send_idx.data + m_send_idx.getNumElements(),
-                       h_send_idx.data,
-                       wrapper);
-        std::transform(recv_idx.begin(), recv_idx.end(), recv_idx.begin(), wrapper);
-        }
-
-        // map the received cells from a rank-basis to a cell-basis
-        {
-        std::multimap<unsigned int, unsigned int> cell_map;
-        std::set<unsigned int> unique_cells;
-        for (unsigned int idx = 0; idx < recv_idx.size(); ++idx)
-            {
-            const unsigned int cell = recv_idx[idx];
-            unique_cells.insert(cell);
-            cell_map.insert(std::make_pair(cell, idx));
-            }
-        m_num_cells = (unsigned int)unique_cells.size();
-
-            /*
-             * Allocate auxiliary memory for receiving cell reordering
-             */
-            {
-            GPUArray<unsigned int> recv(recv_idx.size(), m_exec_conf);
-            m_recv.swap(recv);
-
-            GPUArray<unsigned int> cells(m_num_cells, m_exec_conf);
-            m_cells.swap(cells);
-
-            GPUArray<unsigned int> recv_begin(m_num_cells, m_exec_conf);
-            m_recv_begin.swap(recv_begin);
-
-            GPUArray<unsigned int> recv_end(m_num_cells, m_exec_conf);
-            m_recv_end.swap(recv_end);
-            }
-
-        /*
-         * Generate the compacted list of unique cells
-         */
-        ArrayHandle<unsigned int> h_cells(m_cells, access_location::host, access_mode::overwrite);
-        unsigned int idx = 0;
-        for (auto it = unique_cells.begin(); it != unique_cells.end(); ++it)
-            {
-            h_cells.data[idx++] = *it;
-            }
-
-        /*
-         * Loop over the cell map to do run-length encoding on the keys. This
-         * determines the range of data belonging to each received cell.
-         */
-        ArrayHandle<unsigned int> h_recv(m_recv, access_location::host, access_mode::overwrite);
-        ArrayHandle<unsigned int> h_recv_begin(m_recv_begin,
-                                               access_location::host,
-                                               access_mode::overwrite);
-        ArrayHandle<unsigned int> h_recv_end(m_recv_end,
-                                             access_location::host,
-                                             access_mode::overwrite);
-        unsigned int last_cell = UINT_MAX;
-        unsigned int cell_idx = 0;
-        idx = 0;
-        h_recv_begin.data[cell_idx] = idx;
-        for (auto it = cell_map.begin(); it != cell_map.end(); ++it)
-            {
-            // record the sorted receive index
-            h_recv.data[idx] = it->second;
-
-            // if not very first pass and the current cell does not match the
-            // last cell, then we are on a new cell, and need to demark an end / begin
-            if (last_cell != UINT_MAX && it->first != last_cell)
-                {
-                h_recv_end.data[cell_idx] = idx;
-                h_recv_begin.data[++cell_idx] = idx;
-                }
-            last_cell = it->first;
-
-            ++idx;
-            }
-        h_recv_end.data[cell_idx] = idx;
-        }
-    }
-    } // end namespace hoomd
-
-#endif // ENABLE_MPI
diff --git a/hoomd/mpcd/CellCommunicator.cuh b/hoomd/mpcd/CellCommunicator.cuh
deleted file mode 100644
index 3bba09f5f6..0000000000
--- a/hoomd/mpcd/CellCommunicator.cuh
+++ /dev/null
@@ -1,227 +0,0 @@
-// Copyright (c) 2009-2025 The Regents of the University of Michigan.
-// Part of HOOMD-blue, released under the BSD 3-Clause License.
-
-/*!
- * \file mpcd/CellCommunicator.cuh
- * \brief Declaration of CUDA kernels for mpcd::CellCommunicator
- */
-
-#ifndef MPCD_CELL_COMMUNICATOR_CUH_
-#define MPCD_CELL_COMMUNICATOR_CUH_
-
-#include "hoomd/HOOMDMath.h"
-#include "hoomd/Index1D.h"
-
-#include <cuda_runtime.h>
-
-namespace hoomd
-    {
-namespace mpcd
-    {
-namespace gpu
-    {
-//! Kernel driver to pack cell communication buffer
-template<typename T, class PackOpT>
-cudaError_t __attribute__((visibility("default")))
-pack_cell_buffer(typename PackOpT::element* d_send_buf,
-                 const T* d_props,
-                 const unsigned int* d_send_idx,
-                 const PackOpT op,
-                 const unsigned int num_send,
-                 unsigned int block_size);
-
-//! Kernel driver to unpack cell communication buffer
-template<typename T, class PackOpT>
-cudaError_t __attribute__((visibility("default")))
-unpack_cell_buffer(T* d_props,
-                   const unsigned int* d_cells,
-                   const unsigned int* d_recv,
-                   const unsigned int* d_recv_begin,
-                   const unsigned int* d_recv_end,
-                   const typename PackOpT::element* d_recv_buf,
-                   const PackOpT op,
-                   const unsigned int num_cells,
-                   const unsigned int block_size);
-
-#ifdef __HIPCC__
-
-namespace kernel
-    {
-//! Kernel to pack cell communication buffer
-/*!
- * \param d_send_buf Send buffer to pack (output)
- * \param d_props Cell property to pack
- * \param d_send_idx Cell indexes to pack into the buffer
- * \param op Pack operator
- * \param num_send Number of cells to pack
- * \param block_size Number of threads per block
- *
- * \tparam T Type of data to pack (inferred)
- * \tparam PackOpT Pack operation type (inferred)
- *
- * \b Implementation details:
- * Using one thread per cell to pack, each cell to send has its properties read
- * and packed into the send buffer.
- */
-template<typename T, class PackOpT>
-__global__ void pack_cell_buffer(typename PackOpT::element* d_send_buf,
-                                 const T* d_props,
-                                 const unsigned int* d_send_idx,
-                                 const PackOpT op,
-                                 const unsigned int num_send)
-    {
-    // one thread per buffer cell
-    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= num_send)
-        return;
-
-    const unsigned int send_idx = d_send_idx[idx];
-    d_send_buf[idx] = op.pack(d_props[send_idx]);
-    }
-
-//! Kernel to unpack cell communication buffer
-/*!
- * \param d_props Cell property array to unpack into
- * \param d_cells List of unique cells to unpack
- * \param d_recv Map of unique cells onto the receive buffer
- * \param d_recv_begin Beginning index into \a d_recv for cells to unpack
- * \param d_recv_end End index (exclusive) into \a d_recv for cells to unpack
- * \param d_recv_buf Received buffer from neighbor ranks
- * \param op Packing operator
- * \param num_cells Number of cells to unpack
- *
- * \tparam T Data type to unpack (inferred)
- * \tparam PackOpT Pack operation type (inferred)
- *
- * \b Implementation details:
- * Using one thread per cell to unpack, each cell iterates over the cells it
- * has received, and applies the unpacking operator to each in turn.
- */
-template<typename T, class PackOpT>
-__global__ void unpack_cell_buffer(T* d_props,
-                                   const unsigned int* d_cells,
-                                   const unsigned int* d_recv,
-                                   const unsigned int* d_recv_begin,
-                                   const unsigned int* d_recv_end,
-                                   const typename PackOpT::element* d_recv_buf,
-                                   const PackOpT op,
-                                   const unsigned int num_cells)
-    {
-    // one thread per particle
-    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= num_cells)
-        return;
-
-    const unsigned int cell_idx = d_cells[idx];
-    const unsigned int begin = d_recv_begin[idx];
-    const unsigned int end = d_recv_end[idx];
-
-    // loop through all received data for this cell, and unpack it iteratively
-    T val = d_props[cell_idx];
-    for (unsigned int i = begin; i < end; ++i)
-        {
-        val = op.unpack(d_recv_buf[d_recv[i]], val);
-        }
-
-    // save the accumulated unpacked value
-    d_props[cell_idx] = val;
-    }
-
-    } // end namespace kernel
-
-/*!
- * \param d_send_buf Send buffer to pack (output)
- * \param d_props Cell property to pack
- * \param d_send_idx Cell indexes to pack into the buffer
- * \param op Pack operator
- * \param num_send Number of cells to pack
- * \param block_size Number of threads per block
- *
- * \tparam T Type of data to pack (inferred)
- * \tparam PackOpT Pack operator type
- *
- * \returns cudaSuccess on completion
- *
- * \sa mpcd::gpu::kernel::pack_cell_buffer
- */
-template<typename T, class PackOpT>
-cudaError_t pack_cell_buffer(typename PackOpT::element* d_send_buf,
-                             const T* d_props,
-                             const unsigned int* d_send_idx,
-                             const PackOpT op,
-                             const unsigned int num_send,
-                             unsigned int block_size)
-    {
-    // determine runtime block size
-    unsigned int max_block_size;
-    cudaFuncAttributes attr;
-    cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::pack_cell_buffer<T, PackOpT>);
-    max_block_size = attr.maxThreadsPerBlock;
-
-    const unsigned int run_block_size = min(block_size, max_block_size);
-
-    dim3 grid(num_send / run_block_size + 1);
-    mpcd::gpu::kernel::pack_cell_buffer<<<grid, run_block_size>>>(d_send_buf,
-                                                                  d_props,
-                                                                  d_send_idx,
-                                                                  op,
-                                                                  num_send);
-
-    return cudaSuccess;
-    }
-
-/*!
- * \param d_props Cell property array to unpack into
- * \param d_cells List of unique cells to unpack
- * \param d_recv Map of unique cells onto the receive buffer
- * \param d_recv_begin Beginning index into \a d_recv for cells to unpack
- * \param d_recv_end End index (exclusive) into \a d_recv for cells to unpack
- * \param d_recv_buf Received buffer from neighbor ranks
- * \param op Packing operator
- * \param num_cells Number of cells to unpack
- *
- * \tparam T Data type to unpack (inferred)
- * \tparam PackOpT Pack operation type (inferred)
- *
- * \returns cudaSuccess on completion
- *
- * \sa mpcd::gpu::kernel::unpack_cell_buffer
- */
-template<typename T, class PackOpT>
-cudaError_t unpack_cell_buffer(T* d_props,
-                               const unsigned int* d_cells,
-                               const unsigned int* d_recv,
-                               const unsigned int* d_recv_begin,
-                               const unsigned int* d_recv_end,
-                               const typename PackOpT::element* d_recv_buf,
-                               const PackOpT op,
-                               const unsigned int num_cells,
-                               const unsigned int block_size)
-    {
-    // determine runtime block size
-    unsigned int max_block_size;
-    cudaFuncAttributes attr;
-    cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::unpack_cell_buffer<T, PackOpT>);
-    max_block_size = attr.maxThreadsPerBlock;
-
-    const unsigned int run_block_size = min(block_size, max_block_size);
-
-    dim3 grid(num_cells / run_block_size + 1);
-    mpcd::gpu::kernel::unpack_cell_buffer<<<grid, run_block_size>>>(d_props,
-                                                                    d_cells,
-                                                                    d_recv,
-                                                                    d_recv_begin,
-                                                                    d_recv_end,
-                                                                    d_recv_buf,
-                                                                    op,
-                                                                    num_cells);
-
-    return cudaSuccess;
-    }
-#endif // __HIPCC__
-
-    } // end namespace gpu
-    } // end namespace mpcd
-    } // end namespace hoomd
-
-#endif // MPCD_CELL_COMMUNICATOR_CUH_
diff --git a/hoomd/mpcd/CellCommunicator.h b/hoomd/mpcd/CellCommunicator.h
deleted file mode 100644
index ea78f0f8d8..0000000000
--- a/hoomd/mpcd/CellCommunicator.h
+++ /dev/null
@@ -1,437 +0,0 @@
-// Copyright (c) 2009-2025 The Regents of the University of Michigan.
-// Part of HOOMD-blue, released under the BSD 3-Clause License.
-
-/*!
- * \file mpcd/CellCommunicator.h
- * \brief Declares and defines the mpcd::CellCommunicator class
- */
-
-#ifdef ENABLE_MPI
-
-#ifndef MPCD_CELL_COMMUNICATOR_H_
-#define MPCD_CELL_COMMUNICATOR_H_
-
-#ifdef __HIPCC__
-#error This header cannot be compiled by nvcc
-#endif
-
-#ifdef ENABLE_HIP
-#include "CellCommunicator.cuh"
-#endif // ENABLE_HIP
-
-#include "CellList.h"
-#include "CommunicatorUtilities.h"
-
-#include "hoomd/Autotuned.h"
-#include "hoomd/DomainDecomposition.h"
-#include "hoomd/GPUArray.h"
-#include "hoomd/GPUVector.h"
-#include "hoomd/HOOMDMath.h"
-#include "hoomd/Index1D.h"
-#include "hoomd/SystemDefinition.h"
-
-#include <map>
-#include <set>
-
-namespace hoomd
-    {
-namespace mpcd
-    {
-//! Communicates properties across the MPCD cell list
-class PYBIND11_EXPORT CellCommunicator : public Autotuned
-    {
-    public:
-    //! Constructor
-    CellCommunicator(std::shared_ptr<SystemDefinition> sysdef, std::shared_ptr<mpcd::CellList> cl);
-
-    //! Destructor
-    virtual ~CellCommunicator();
-
-    //! Reduce cell list properties
-    /*!
-     * \param props Properties to reduce
-     * \param op Binary reduction operator
-     *
-     * Properties are reduced across domain boundaries, as marked in the cell list.
-     */
-    template<typename T, class PackOpT> void communicate(const GPUArray<T>& props, const PackOpT op)
-        {
-        begin(props, op);
-        finalize(props, op);
-        }
-
-    //! Begin communication of the grid
-    template<typename T, class PackOpT> void begin(const GPUArray<T>& props, const PackOpT op);
-
-    //! Finalize communication of the grid
-    template<typename T, class PackOpT> void finalize(const GPUArray<T>& props, const PackOpT op);
-
-    //! Get the number of unique cells with communication
-    unsigned int getNCells()
-        {
-        if (m_needs_init)
-            {
-            initialize();
-            m_needs_init = false;
-            }
-        return m_num_cells;
-        }
-
-    //! Get the list of unique cells with communication
-    const GPUArray<unsigned int>& getCells()
-        {
-        if (m_needs_init)
-            {
-            initialize();
-            m_needs_init = false;
-            }
-        return m_cells;
-        }
-
-    private:
-    static unsigned int num_instances; //!< Number of communicator instances
-    const unsigned int m_id;           //!< Id for this communicator to use in tags
-
-    std::shared_ptr<SystemDefinition> m_sysdef;                //!< System definition
-    std::shared_ptr<hoomd::ParticleData> m_pdata;              //!< HOOMD particle data
-    std::shared_ptr<const ExecutionConfiguration> m_exec_conf; //!< Execution configuration
-    const MPI_Comm m_mpi_comm;                                 //!< MPI Communicator
-    std::shared_ptr<DomainDecomposition> m_decomposition;      //!< Domain decomposition
-
-    std::shared_ptr<mpcd::CellList> m_cl; //!< MPCD cell list
-
-    bool m_communicating;                //!< Flag if communication is occurring
-    GPUVector<unsigned char> m_send_buf; //!< Send buffer
-    GPUVector<unsigned char> m_recv_buf; //!< Receive buffer
-    GPUArray<unsigned int> m_send_idx;   //!< Indexes of cells in send buffer
-    std::vector<MPI_Request> m_reqs;     //!< MPI request objects
-
-    std::vector<unsigned int> m_neighbors; //!< Unique neighbor ranks
-    std::vector<unsigned int> m_begin;     //!< Begin offset of every neighbor
-    std::vector<unsigned int> m_num_send;  //!< Number of cells to send to every neighbor
-
-    unsigned int m_num_cells;       //!< Number of unique cells to receive
-    GPUArray<unsigned int> m_cells; //!< Unique cells to receive
-    GPUArray<unsigned int>
-        m_recv; //!< Reordered mapping of buffer from ranks to group received cells together
-    GPUArray<unsigned int> m_recv_begin; //!< Begin offset of every unique cell
-    GPUArray<unsigned int> m_recv_end;   //!< End offset of every unique cell
-
-    bool m_needs_init; //!< Flag if grid needs to be initialized
-    //! Slot that communicator needs to be reinitialized
-    void slotInit()
-        {
-        m_needs_init = true;
-        }
-
-    //! Initialize the grid
-    void initialize();
-
-    //! Packs the property buffer
-    template<typename T, class PackOpT> void packBuffer(const GPUArray<T>& props, const PackOpT op);
-
-    //! Unpacks the property buffer
-    template<typename T, class PackOpT>
-    void unpackBuffer(const GPUArray<T>& props, const PackOpT op);
-
-#ifdef ENABLE_HIP
-    std::shared_ptr<Autotuner<1>> m_tuner_pack;   //!< Tuner for pack kernel
-    std::shared_ptr<Autotuner<1>> m_tuner_unpack; //!< Tuner for unpack kernel
-
-    //! Packs the property buffer on the GPU
-    template<typename T, class PackOpT>
-    void packBufferGPU(const GPUArray<T>& props, const PackOpT op);
-
-    //! Unpacks the property buffer on the GPU
-    template<typename T, class PackOpT>
-    void unpackBufferGPU(const GPUArray<T>& props, const PackOpT op);
-#endif // ENABLE_HIP
-    };
-
-    } // end namespace mpcd
-
-/*!
- * \param props Property buffer to pack
- * \param op Packing operator to apply to the send buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * The data in \a props is packed into the send buffers, and nonblocking MPI
- * send / receive operations are initiated. If communication is already occurring,
- * the method returns immediately and no action is taken.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::begin(const GPUArray<T>& props, const PackOpT op)
-    {
-    // check if communication is occurring and place a lock on
-    if (m_communicating)
-        return;
-    m_communicating = true;
-
-    // ensure that the property grid is sufficiently sized compared to the cell list
-    if (props.getNumElements() < m_cl->getNCells())
-        {
-        m_exec_conf->msg->error()
-            << "mpcd: cell property to be reduced is smaller than cell list dimensions"
-            << std::endl;
-        throw std::runtime_error("MPCD cell property has insufficient dimensions");
-        }
-
-    // initialize grid if required
-    if (m_needs_init)
-        {
-        initialize();
-        m_needs_init = false;
-        }
-
-    // resize send / receive buffers for this comm element
-    m_send_buf.resize(m_send_idx.getNumElements() * sizeof(typename PackOpT::element));
-    m_recv_buf.resize(m_send_idx.getNumElements() * sizeof(typename PackOpT::element));
-
-#ifdef ENABLE_HIP
-    if (m_exec_conf->isCUDAEnabled())
-        {
-        packBufferGPU(props, op);
-        }
-    else
-#endif // ENABLE_HIP
-        {
-        packBuffer(props, op);
-        }
-
-        // make the MPI calls
-        {
-        // determine whether to use CPU or GPU CUDA buffers
-        access_location::Enum mpi_loc;
-            {
-            mpi_loc = access_location::host;
-            }
-
-        ArrayHandle<unsigned char> h_send_buf(m_send_buf, mpi_loc, access_mode::read);
-        ArrayHandle<unsigned char> h_recv_buf(m_recv_buf, mpi_loc, access_mode::overwrite);
-        typename PackOpT::element* send_buf
-            = reinterpret_cast<typename PackOpT::element*>(h_send_buf.data);
-        typename PackOpT::element* recv_buf
-            = reinterpret_cast<typename PackOpT::element*>(h_recv_buf.data);
-
-        m_reqs.resize(2 * m_neighbors.size());
-        for (unsigned int idx = 0; idx < m_neighbors.size(); ++idx)
-            {
-            const unsigned int neigh = m_neighbors[idx];
-            const unsigned int offset = m_begin[idx];
-            const size_t num_bytes = sizeof(typename PackOpT::element) * m_num_send[idx];
-            MPI_Isend(send_buf + offset,
-                      (unsigned int)num_bytes,
-                      MPI_BYTE,
-                      neigh,
-                      0,
-                      m_mpi_comm,
-                      &m_reqs[2 * idx]);
-            MPI_Irecv(recv_buf + offset,
-                      (unsigned int)num_bytes,
-                      MPI_BYTE,
-                      neigh,
-                      0,
-                      m_mpi_comm,
-                      &m_reqs[2 * idx + 1]);
-            }
-        }
-    }
-
-//! Finalize communication of the grid
-/*!
- * \param props Property buffer to unpack
- * \param op Packing operator to apply to the receive buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * Existing MPI requests are finalized, and then the received buffers are
- * unpacked into \a props. If no communication is ongoing, the method immediately
- * returns and takes no action.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::finalize(const GPUArray<T>& props, const PackOpT op)
-    {
-    if (!m_communicating)
-        return;
-
-    // finish all MPI requests
-    MPI_Waitall((unsigned int)m_reqs.size(), m_reqs.data(), MPI_STATUSES_IGNORE);
-
-// unpack the buffer
-#ifdef ENABLE_HIP
-    if (m_exec_conf->isCUDAEnabled())
-        {
-        unpackBufferGPU(props, op);
-        }
-    else
-#endif // ENABLE_HIP
-        {
-        unpackBuffer(props, op);
-        }
-
-    m_communicating = false;
-    }
-
-/*!
- * \param props Property buffer to pack
- * \param op Packing operator to apply to the send buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * The packing operator applies a unary pack functor to \a props, which permits
- * a reduction or transformation of the data to be sent. See mpcd::detail::CellEnergyPackOp
- * for an example.
- *
- * \post Communicated cells in \a props are packed into \a m_send_buf.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::packBuffer(const GPUArray<T>& props, const PackOpT op)
-    {
-    ArrayHandle<T> h_props(props, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_send_idx(m_send_idx, access_location::host, access_mode::read);
-    ArrayHandle<unsigned char> h_send_buf(m_send_buf,
-                                          access_location::host,
-                                          access_mode::overwrite);
-    typename PackOpT::element* send_buf
-        = reinterpret_cast<typename PackOpT::element*>(h_send_buf.data);
-
-    for (unsigned int idx = 0; idx < m_send_idx.getNumElements(); ++idx)
-        {
-        send_buf[idx] = op.pack(h_props.data[h_send_idx.data[idx]]);
-        }
-    }
-
-/*!
- * \param props Property buffer to pack
- * \param op Packing operator to apply to the received buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * The packing operator must be associative so that the order it is applied
- * to the received cells does not matter, e.g., addition.
- *
- * \post The bytes from \a m_recv_buf are unpacked into \a props.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::unpackBuffer(const GPUArray<T>& props, const PackOpT op)
-    {
-    ArrayHandle<unsigned int> h_recv(m_recv, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_cells(m_cells, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_recv_begin(m_recv_begin, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_recv_end(m_recv_end, access_location::host, access_mode::read);
-
-    ArrayHandle<unsigned char> h_recv_buf(m_recv_buf, access_location::host, access_mode::read);
-    typename PackOpT::element* recv_buf
-        = reinterpret_cast<typename PackOpT::element*>(h_recv_buf.data);
-
-    ArrayHandle<T> h_props(props, access_location::host, access_mode::readwrite);
-
-    for (unsigned int idx = 0; idx < m_num_cells; ++idx)
-        {
-        const unsigned int cell_idx = h_cells.data[idx];
-        const unsigned int begin = h_recv_begin.data[idx];
-        const unsigned int end = h_recv_end.data[idx];
-
-        // loop through all received data for this cell, and unpack it iteratively
-        T val = h_props.data[cell_idx];
-        for (unsigned int i = begin; i < end; ++i)
-            {
-            val = op.unpack(recv_buf[h_recv.data[i]], val);
-            }
-
-        // save the accumulated unpacked value
-        h_props.data[cell_idx] = val;
-        }
-    }
-
-#ifdef ENABLE_HIP
-/*!
- * \param props Property buffer to pack
- * \param op Packing operator to apply to the send buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * The packing operator applies a unary pack functor to \a props, which permits
- * a reduction or transformation of the data to be sent. See mpcd::detail::CellEnergyPackOp
- * for an example.
- *
- * \post Communicated cells in \a props are packed into \a m_send_buf.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::packBufferGPU(const GPUArray<T>& props, const PackOpT op)
-    {
-    ArrayHandle<T> d_props(props, access_location::device, access_mode::read);
-    ArrayHandle<unsigned int> d_send_idx(m_send_idx, access_location::device, access_mode::read);
-    ArrayHandle<unsigned char> d_send_buf(m_send_buf,
-                                          access_location::device,
-                                          access_mode::overwrite);
-    typename PackOpT::element* send_buf
-        = reinterpret_cast<typename PackOpT::element*>(d_send_buf.data);
-
-    m_tuner_pack->begin();
-    mpcd::gpu::pack_cell_buffer(send_buf,
-                                d_props.data,
-                                d_send_idx.data,
-                                op,
-                                (unsigned int)m_send_idx.getNumElements(),
-                                m_tuner_pack->getParam()[0]);
-    if (m_exec_conf->isCUDAErrorCheckingEnabled())
-        CHECK_CUDA_ERROR();
-    m_tuner_pack->end();
-    }
-
-/*!
- * \param props Property buffer to pack
- * \param op Packing operator to apply to the received buffer
- *
- * \tparam T Type of buffer to pack (inferred)
- * \tparam PackOpT Packing operator type (inferred)
- *
- * The packing operator must be associative so that the order it is applied
- * to the received cells does not matter, e.g., addition.
- *
- * \post The bytes from \a m_recv_buf are unpacked into \a props.
- */
-template<typename T, class PackOpT>
-void mpcd::CellCommunicator::unpackBufferGPU(const GPUArray<T>& props, const PackOpT op)
-    {
-    ArrayHandle<unsigned int> d_recv(m_recv, access_location::device, access_mode::read);
-    ArrayHandle<unsigned int> d_cells(m_cells, access_location::device, access_mode::read);
-    ArrayHandle<unsigned int> d_recv_begin(m_recv_begin,
-                                           access_location::device,
-                                           access_mode::read);
-    ArrayHandle<unsigned int> d_recv_end(m_recv_end, access_location::device, access_mode::read);
-
-    ArrayHandle<unsigned char> d_recv_buf(m_recv_buf, access_location::device, access_mode::read);
-    typename PackOpT::element* recv_buf
-        = reinterpret_cast<typename PackOpT::element*>(d_recv_buf.data);
-
-    ArrayHandle<T> d_props(props, access_location::device, access_mode::readwrite);
-
-    m_tuner_unpack->begin();
-    mpcd::gpu::unpack_cell_buffer(d_props.data,
-                                  d_cells.data,
-                                  d_recv.data,
-                                  d_recv_begin.data,
-                                  d_recv_end.data,
-                                  recv_buf,
-                                  op,
-                                  m_num_cells,
-                                  m_tuner_unpack->getParam()[0]);
-    if (m_exec_conf->isCUDAErrorCheckingEnabled())
-        CHECK_CUDA_ERROR();
-    m_tuner_unpack->end();
-    }
-#endif // ENABLE_HIP
-
-    } // end namespace hoomd
-
-#endif // MPCD_CELL_COMMUNICATOR_H_
-
-#endif // ENABLE_MPI
diff --git a/hoomd/mpcd/CellList.cc b/hoomd/mpcd/CellList.cc
index 58bacdd3a1..bc47e1c6cd 100644
--- a/hoomd/mpcd/CellList.cc
+++ b/hoomd/mpcd/CellList.cc
@@ -20,7 +20,8 @@ namespace hoomd
 mpcd::CellList::CellList(std::shared_ptr<SystemDefinition> sysdef, Scalar cell_size, bool shift)
     : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np_max(4),
       m_cell_np(m_exec_conf), m_cell_list(m_exec_conf), m_embed_cell_ids(m_exec_conf),
-      m_conditions(m_exec_conf), m_needs_compute_dim(true), m_particles_sorted(false),
+      m_conditions(m_exec_conf), m_cell_vel(m_exec_conf), m_cell_energy(m_exec_conf),
+      m_needs_compute_dim(true), m_property_sum(false), m_particles_sorted(false),
       m_virtual_change(false)
     {
     assert(m_mpcd_pdata);
@@ -40,8 +41,6 @@ mpcd::CellList::CellList(std::shared_ptr<SystemDefinition> sysdef, Scalar cell_s
     m_num_extra = 0;
     m_cover_box = m_pdata->getBox();
 #endif // ENABLE_MPI
-
-    m_mpcd_pdata->getSortSignal().connect<mpcd::CellList, &mpcd::CellList::sort>(this);
     m_mpcd_pdata->getNumVirtualSignal().connect<mpcd::CellList, &mpcd::CellList::slotNumVirtual>(
         this);
     m_pdata->getParticleSortSignal().connect<mpcd::CellList, &mpcd::CellList::slotSorted>(this);
@@ -53,7 +52,8 @@ mpcd::CellList::CellList(std::shared_ptr<SystemDefinition> sysdef,
                          bool shift)
     : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np_max(4),
       m_cell_np(m_exec_conf), m_cell_list(m_exec_conf), m_embed_cell_ids(m_exec_conf),
-      m_conditions(m_exec_conf), m_needs_compute_dim(true), m_particles_sorted(false),
+      m_conditions(m_exec_conf), m_cell_vel(m_exec_conf), m_cell_energy(m_exec_conf),
+      m_needs_compute_dim(true), m_property_sum(false), m_particles_sorted(false),
       m_virtual_change(false)
     {
     assert(m_mpcd_pdata);
@@ -73,8 +73,6 @@ mpcd::CellList::CellList(std::shared_ptr<SystemDefinition> sysdef,
     m_num_extra = 0;
     m_cover_box = m_pdata->getBox();
 #endif // ENABLE_MPI
-
-    m_mpcd_pdata->getSortSignal().connect<mpcd::CellList, &mpcd::CellList::sort>(this);
     m_mpcd_pdata->getNumVirtualSignal().connect<mpcd::CellList, &mpcd::CellList::slotNumVirtual>(
         this);
     m_pdata->getParticleSortSignal().connect<mpcd::CellList, &mpcd::CellList::slotSorted>(this);
@@ -84,7 +82,6 @@ mpcd::CellList::CellList(std::shared_ptr<SystemDefinition> sysdef,
 mpcd::CellList::~CellList()
     {
     m_exec_conf->msg->notice(5) << "Destroying MPCD CellList" << std::endl;
-    m_mpcd_pdata->getSortSignal().disconnect<mpcd::CellList, &mpcd::CellList::sort>(this);
     m_mpcd_pdata->getNumVirtualSignal().disconnect<mpcd::CellList, &mpcd::CellList::slotNumVirtual>(
         this);
     m_pdata->getParticleSortSignal().disconnect<mpcd::CellList, &mpcd::CellList::slotSorted>(this);
@@ -113,6 +110,9 @@ void mpcd::CellList::compute(uint64_t timestep)
         m_force_compute = true;
         }
 
+    // ensure optional flags are up to date
+    updateFlags();
+
     if (peekCompute(timestep))
         {
         // ensure grid is shifted
@@ -151,7 +151,8 @@ void mpcd::CellList::compute(uint64_t timestep)
                 resetConditions();
                 }
             } while (overflowed);
-
+        m_property_sum = true;
+        finishComputeProperties();
         // we are finished building, explicitly mark everything (rather than using shouldCompute)
         m_first_compute = false;
         m_force_compute = false;
@@ -308,6 +309,8 @@ void mpcd::CellList::computeDimensions()
     m_global_cell_indexer = Index3D(m_global_cell_dim.x, m_global_cell_dim.y, m_global_cell_dim.z);
     m_cell_indexer = Index3D(m_cell_dim.x, m_cell_dim.y, m_cell_dim.z);
     m_cell_np.resize(m_cell_indexer.getNumElements());
+    m_cell_vel.resize(m_cell_indexer.getNumElements());
+    m_cell_energy.resize(m_cell_indexer.getNumElements());
 
     // reallocate per-cell memory
     reallocate();
@@ -317,6 +320,21 @@ void mpcd::CellList::computeDimensions()
     notifySizeChange();
     }
 
+void mpcd::CellList::startAutotuning()
+    {
+    Compute::startAutotuning();
+#ifdef ENABLE_MPI
+#endif // ENABLE_MPI
+    }
+
+bool mpcd::CellList::isAutotuningComplete()
+    {
+    bool result = Compute::isAutotuningComplete();
+#ifdef ENABLE_MPI
+#endif // ENABLE_MPI
+    return result;
+    }
+
 #ifdef ENABLE_MPI
 /*!
  * \param dir Direction of communication
@@ -356,8 +374,16 @@ void mpcd::CellList::buildCellList()
                                           access_location::host,
                                           access_mode::overwrite);
     ArrayHandle<unsigned int> h_cell_np(m_cell_np, access_location::host, access_mode::overwrite);
+    ArrayHandle<double4> h_cell_vel(m_cell_vel, access_location::host, access_mode::overwrite);
+    ArrayHandle<double3> h_cell_energy(m_cell_energy,
+                                       access_location::host,
+                                       access_mode::overwrite);
+
     // zero the cell counter
     m_cell_np.zeroFill();
+    m_cell_vel.zeroFill();
+    m_cell_energy.zeroFill();
+    m_property_sum = false;
 
     uint3 conditions = make_uint3(0, 0, 0);
 
@@ -373,6 +399,7 @@ void mpcd::CellList::buildCellList()
     // we can't modify the velocity of embedded particles, so we only read their position
     std::unique_ptr<ArrayHandle<unsigned int>> h_embed_cell_ids;
     std::unique_ptr<ArrayHandle<Scalar4>> h_pos_embed;
+    std::unique_ptr<ArrayHandle<Scalar4>> h_vel_embed;
     std::unique_ptr<ArrayHandle<unsigned int>> h_embed_member_idx;
     if (m_embed_group)
         {
@@ -382,6 +409,9 @@ void mpcd::CellList::buildCellList()
         h_pos_embed.reset(new ArrayHandle<Scalar4>(m_pdata->getPositions(),
                                                    access_location::host,
                                                    access_mode::read));
+        h_vel_embed.reset(new ArrayHandle<Scalar4>(m_pdata->getVelocities(),
+                                                   access_location::host,
+                                                   access_mode::read));
         h_embed_member_idx.reset(new ArrayHandle<unsigned int>(m_embed_group->getIndexArray(),
                                                                access_location::host,
                                                                access_mode::read));
@@ -405,15 +435,22 @@ void mpcd::CellList::buildCellList()
     for (unsigned int cur_p = 0; cur_p < N_tot; ++cur_p)
         {
         Scalar4 postype_i;
+        Scalar4 vel_mass_i;
+        double mass_i;
         if (cur_p < N_mpcd)
             {
             postype_i = h_pos.data[cur_p];
+            vel_mass_i = h_vel.data[cur_p];
+            mass_i = 1;
             }
         else
             {
             postype_i = h_pos_embed->data[h_embed_member_idx->data[cur_p - N_mpcd]];
+            vel_mass_i = h_vel_embed->data[h_embed_member_idx->data[cur_p - N_mpcd]];
+            mass_i = vel_mass_i.w;
             }
-        Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z);
+        const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z);
+        const double3 vel_i = make_double3(vel_mass_i.x, vel_mass_i.y, vel_mass_i.z);
 
         if (std::isnan(pos_i.x) || std::isnan(pos_i.y) || std::isnan(pos_i.z))
             {
@@ -509,6 +546,22 @@ void mpcd::CellList::buildCellList()
             h_embed_cell_ids->data[cur_p - N_mpcd] = bin_idx;
             }
 
+        // compute the contribution of the particle to cell velocity
+        double4 cell_vel = h_cell_vel.data[bin_idx];
+        cell_vel.x += mass_i * vel_i.x;
+        cell_vel.y += mass_i * vel_i.y;
+        cell_vel.z += mass_i * vel_i.z;
+        cell_vel.w += mass_i;
+        h_cell_vel.data[bin_idx] = make_double4(cell_vel.x, cell_vel.y, cell_vel.z, cell_vel.w);
+        // compute optional cell properties
+        if (m_flags[mpcd::detail::thermo_options::energy])
+            {
+            double3 cell_energy = h_cell_energy.data[bin_idx];
+            cell_energy.x
+                += 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z);
+            cell_energy.z += 1;
+            h_cell_energy.data[bin_idx] = make_double3(cell_energy.x, 0.0, cell_energy.z);
+            }
         // increment the counter always
         ++h_cell_np.data[bin_idx];
         }
@@ -517,51 +570,55 @@ void mpcd::CellList::buildCellList()
     m_conditions.resetFlags(conditions);
     }
 
-/*!
- * \param timestep Timestep that the sorting occurred
- * \param order Mapping of sorted particle indexes onto old particle indexes
- * \param rorder Mapping of old particle indexes onto sorted particle indexes
- */
-void mpcd::CellList::sort(uint64_t timestep,
-                          const GPUArray<unsigned int>& order,
-                          const GPUArray<unsigned int>& rorder)
+void mpcd::CellList::finishComputeProperties()
     {
-    // no need to do any sorting if we can still be called at the current timestep
-    if (peekCompute(timestep))
-        return;
-
-    // if mapping is not valid, signal that we need to force a recompute next time
-    // that the cell list is needed. We don't call forceCompute() directly because this
-    // always runs compute(), and we just want to defer to the next compute() call.
-    if (rorder.isNull())
+    if (!m_property_sum)
         {
-        m_force_compute = true;
         return;
         }
+    ArrayHandle<unsigned int> h_cell_np(m_cell_np, access_location::host, access_mode::overwrite);
+    ArrayHandle<double4> h_cell_vel(m_cell_vel, access_location::host, access_mode::readwrite);
+    ArrayHandle<double3> h_cell_energy(m_cell_energy,
+                                       access_location::host,
+                                       access_mode::readwrite);
 
-    // iterate through particles in cell list, and update their indexes using reverse
-    // mapping
-    ArrayHandle<unsigned int> h_rorder(rorder, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_cell_np(m_cell_np, access_location::host, access_mode::read);
-    ArrayHandle<unsigned int> h_cell_list(m_cell_list,
-                                          access_location::host,
-                                          access_mode::readwrite);
-    const unsigned int N_mpcd = m_mpcd_pdata->getN();
-
+    // iterate over all cells and compute average velocity, energy, temperature
+    const bool need_energy = m_flags[mpcd::detail::thermo_options::energy];
     for (unsigned int idx = 0; idx < getNCells(); ++idx)
         {
-        const unsigned int np = h_cell_np.data[idx];
-        for (unsigned int offset = 0; offset < np; ++offset)
+        // average cell properties if the cell has mass
+        const double4 cell_vel = h_cell_vel.data[idx];
+        double3 vel_cm = make_double3(cell_vel.x, cell_vel.y, cell_vel.z);
+        const double mass = cell_vel.w;
+
+        if (mass > 0.)
             {
-            const unsigned int cl_idx = m_cell_list_indexer(offset, idx);
-            const unsigned int pid = h_cell_list.data[cl_idx];
-            // only update indexes of MPCD particles, not virtual or embedded particles
-            if (pid < N_mpcd)
+            // average velocity is only defined when there is some mass in the cell
+            vel_cm.x /= mass;
+            vel_cm.y /= mass;
+            vel_cm.z /= mass;
+            }
+        h_cell_vel.data[idx] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass);
+
+        if (need_energy)
+            {
+            const double3 cell_energy = h_cell_energy.data[idx];
+            const double ke = cell_energy.x;
+            double temp(0.0);
+            const unsigned int np = h_cell_np.data[idx];
+            // temperature is only defined for 2 or more particles
+            if (np > 1)
                 {
-                h_cell_list.data[cl_idx] = h_rorder.data[pid];
+                const double ke_cm
+                    = 0.5 * mass
+                      * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z);
+                temp = 2. * (ke - ke_cm) / (m_sysdef->getNDimensions() * (np - 1));
                 }
+            h_cell_energy.data[idx] = make_double3(ke, temp, __int_as_double(np));
             }
         }
+    // ensure that properties will not be normalized twice
+    m_property_sum = false;
     }
 
 #ifdef ENABLE_MPI
@@ -609,6 +666,18 @@ bool mpcd::CellList::needsEmbedMigrate(uint64_t timestep)
     }
 #endif // ENABLE_MPI
 
+void mpcd::CellList::updateFlags()
+    {
+    mpcd::detail::ThermoFlags flags;
+
+    if (!m_flag_signal.empty())
+        {
+        m_flag_signal.emit_accumulate([&](mpcd::detail::ThermoFlags f) { flags |= f; });
+        }
+
+    m_flags = flags;
+    }
+
 bool mpcd::CellList::checkConditions()
     {
     bool result = false;
diff --git a/hoomd/mpcd/CellList.h b/hoomd/mpcd/CellList.h
index f63c37d40f..8b6e6e66c1 100644
--- a/hoomd/mpcd/CellList.h
+++ b/hoomd/mpcd/CellList.h
@@ -13,12 +13,14 @@
 #error This header cannot be compiled by nvcc
 #endif
 
+#include "CellThermoTypes.h"
 #include "CommunicatorUtilities.h"
 #include "ParticleData.h"
 
 #include "hoomd/Compute.h"
 #include "hoomd/GPUFlags.h"
 #include "hoomd/ParticleGroup.h"
+#include "hoomd/SystemDefinition.h"
 
 #include "hoomd/extern/nano-signal-slot/nano_signal_slot.hpp"
 #include <pybind11/pybind11.h>
@@ -48,6 +50,12 @@ class PYBIND11_EXPORT CellList : public Compute
     //! Sizes the cell list based on the box
     void computeDimensions();
 
+    //! Start autotuning kernel launch parameters
+    void startAutotuning() override;
+
+    //! Check if kernel autotuning is complete
+    bool isAutotuningComplete() override;
+
     //! Get the cell list data
     const GPUArray<unsigned int>& getCellList() const
         {
@@ -60,6 +68,18 @@ class PYBIND11_EXPORT CellList : public Compute
         return m_cell_np;
         }
 
+    //! Get the cell velocities from the last call to compute
+    const GPUArray<double4>& getCellVelocities() const
+        {
+        return m_cell_vel;
+        }
+
+    //! Get the cell energies from the last call to compute
+    const GPUArray<double3>& getCellEnergies() const
+        {
+        return m_cell_energy;
+        }
+
     //! Get the total number of cells in the list
     const unsigned int getNCells() const
         {
@@ -243,6 +263,23 @@ class PYBIND11_EXPORT CellList : public Compute
         return m_dim_signal;
         }
 
+    //! Get the signal for requested thermo flags
+    /*!
+     * \returns A signal that subscribers can attach a callback to in order
+     *          to request certain data.
+     *
+     * For performance reasons, the CellThermoCompute should be able to
+     * supply many related cell-level quantities from a single kernel launch.
+     * However, sometimes these quantities are not needed, and it is better
+     * to skip calculating them. Subscribing classes can optionally request
+     * some of these quantities via a callback return mpcd::detail::ThermoFlags
+     * with the appropriate bits set.
+     */
+    Nano::Signal<mpcd::detail::ThermoFlags()>& getFlagsSignal()
+        {
+        return m_flag_signal;
+        }
+
     protected:
     std::shared_ptr<mpcd::ParticleData> m_mpcd_pdata; //!< MPCD particle data
     std::shared_ptr<ParticleGroup> m_embed_group;     //!< Embedded particles
@@ -268,6 +305,14 @@ class PYBIND11_EXPORT CellList : public Compute
 
     int3 m_origin_idx; //!< Origin as a global index
 
+    GPUVector<double4> m_cell_vel;    //!< Average velocity of a cell + cell mass
+    GPUVector<double3> m_cell_energy; //!< Kinetic energy, unscaled temperature, dof in each cell
+
+    Nano::Signal<mpcd::detail::ThermoFlags()> m_flag_signal; //!< Signal for requested flags
+    mpcd::detail::ThermoFlags m_flags;                       //!< Requested thermo flags
+    //! Updates the requested optional flags
+    void updateFlags();
+
 #ifdef ENABLE_MPI
     unsigned int m_num_extra;               //!< Number of extra cells to communicate over
     std::array<unsigned int, 6> m_num_comm; //!< Number of cells to communicate on each face
@@ -286,13 +331,12 @@ class PYBIND11_EXPORT CellList : public Compute
     //! Builds the cell list and handles cell list memory
     virtual void buildCellList();
 
-    //! Callback to sort cell list when particle data is sorted
-    virtual void sort(uint64_t timestep,
-                      const GPUArray<unsigned int>& order,
-                      const GPUArray<unsigned int>& rorder);
+    //! Do final cell property calculation
+    void finishComputeProperties();
 
     private:
     bool m_needs_compute_dim; //!< True if the dimensions need to be (re-)computed
+    bool m_property_sum; //!< True if all contributions to cell properties have been accumulated
     //! Slot for box resizing
     void slotBoxChanged()
         {
diff --git a/hoomd/mpcd/CellListGPU.cc b/hoomd/mpcd/CellListGPU.cc
index ce51583a09..42ae5579ba 100644
--- a/hoomd/mpcd/CellListGPU.cc
+++ b/hoomd/mpcd/CellListGPU.cc
@@ -19,10 +19,7 @@ mpcd::CellListGPU::CellListGPU(std::shared_ptr<SystemDefinition> sysdef,
     m_tuner_cell.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
                                         m_exec_conf,
                                         "mpcd_cell"));
-    m_tuner_sort.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
-                                        m_exec_conf,
-                                        "mpcd_cell_sort"));
-    m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_sort});
+    m_autotuners.insert(m_autotuners.end(), {m_tuner_cell});
 
 #ifdef ENABLE_MPI
     m_tuner_embed_migrate.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
@@ -43,10 +40,7 @@ mpcd::CellListGPU::CellListGPU(std::shared_ptr<SystemDefinition> sysdef,
     m_tuner_cell.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
                                         m_exec_conf,
                                         "mpcd_cell"));
-    m_tuner_sort.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
-                                        m_exec_conf,
-                                        "mpcd_cell_sort"));
-    m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_sort});
+    m_autotuners.insert(m_autotuners.end(), {m_tuner_cell});
 
 #ifdef ENABLE_MPI
     m_tuner_embed_migrate.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)},
@@ -67,6 +61,10 @@ void mpcd::CellListGPU::buildCellList()
                                           access_location::device,
                                           access_mode::overwrite);
     ArrayHandle<unsigned int> d_cell_np(m_cell_np, access_location::device, access_mode::overwrite);
+    ArrayHandle<double4> d_cell_vel(m_cell_vel, access_location::device, access_mode::overwrite);
+    ArrayHandle<double3> d_cell_energy(m_cell_energy,
+                                       access_location::device,
+                                       access_mode::overwrite);
     ArrayHandle<Scalar4> d_pos(m_mpcd_pdata->getPositions(),
                                access_location::device,
                                access_mode::read);
@@ -97,6 +95,9 @@ void mpcd::CellListGPU::buildCellList()
         ArrayHandle<Scalar4> d_pos_embed(m_pdata->getPositions(),
                                          access_location::device,
                                          access_mode::read);
+        ArrayHandle<Scalar4> d_vel_embed(m_pdata->getVelocities(),
+                                         access_location::device,
+                                         access_mode::read);
         ArrayHandle<unsigned int> d_embed_member_idx(m_embed_group->getIndexArray(),
                                                      access_location::device,
                                                      access_mode::read);
@@ -104,12 +105,15 @@ void mpcd::CellListGPU::buildCellList()
 
         m_tuner_cell->begin();
         mpcd::gpu::compute_cell_list(d_cell_np.data,
+                                     d_cell_vel.data,
+                                     d_cell_energy.data,
                                      d_cell_list.data,
                                      m_conditions.getDeviceFlags(),
                                      d_vel.data,
                                      d_embed_cell_ids.data,
                                      d_pos.data,
                                      d_pos_embed.data,
+                                     d_vel_embed.data,
                                      d_embed_member_idx.data,
                                      m_pdata->getBox().getPeriodic(),
                                      m_origin_idx,
@@ -122,6 +126,7 @@ void mpcd::CellListGPU::buildCellList()
                                      m_cell_list_indexer,
                                      N_mpcd,
                                      N_tot,
+                                     m_flags[mpcd::detail::thermo_options::energy],
                                      m_tuner_cell->getParam()[0]);
         if (m_exec_conf->isCUDAErrorCheckingEnabled())
             CHECK_CUDA_ERROR();
@@ -131,6 +136,8 @@ void mpcd::CellListGPU::buildCellList()
         {
         m_tuner_cell->begin();
         mpcd::gpu::compute_cell_list(d_cell_np.data,
+                                     d_cell_vel.data,
+                                     d_cell_energy.data,
                                      d_cell_list.data,
                                      m_conditions.getDeviceFlags(),
                                      d_vel.data,
@@ -138,6 +145,7 @@ void mpcd::CellListGPU::buildCellList()
                                      d_pos.data,
                                      NULL,
                                      NULL,
+                                     NULL,
                                      m_pdata->getBox().getPeriodic(),
                                      m_origin_idx,
                                      m_grid_shift,
@@ -149,6 +157,7 @@ void mpcd::CellListGPU::buildCellList()
                                      m_cell_list_indexer,
                                      N_mpcd,
                                      N_tot,
+                                     m_flags[mpcd::detail::thermo_options::energy],
                                      m_tuner_cell->getParam()[0]);
         if (m_exec_conf->isCUDAErrorCheckingEnabled())
             CHECK_CUDA_ERROR();
@@ -156,45 +165,6 @@ void mpcd::CellListGPU::buildCellList()
         }
     }
 
-/*!
- * \param timestep Timestep that the sorting occurred
- * \param order Mapping of sorted particle indexes onto old particle indexes
- * \param rorder Mapping of old particle indexes onto sorted particle indexes
- */
-void mpcd::CellListGPU::sort(uint64_t timestep,
-                             const GPUArray<unsigned int>& order,
-                             const GPUArray<unsigned int>& rorder)
-    {
-    // no need to do any sorting if we can still be called at the current timestep
-    if (peekCompute(timestep))
-        return;
-
-    // force a recompute if mapping is invalid
-    if (rorder.isNull())
-        {
-        m_force_compute = true;
-        return;
-        }
-
-    // iterate through particles in cell list, and update their indexes using reverse mapping
-    ArrayHandle<unsigned int> d_cell_list(m_cell_list,
-                                          access_location::device,
-                                          access_mode::readwrite);
-    ArrayHandle<unsigned int> d_rorder(rorder, access_location::device, access_mode::read);
-    ArrayHandle<unsigned int> d_cell_np(m_cell_np, access_location::device, access_mode::read);
-
-    m_tuner_sort->begin();
-    mpcd::gpu::cell_apply_sort(d_cell_list.data,
-                               d_rorder.data,
-                               d_cell_np.data,
-                               m_cell_list_indexer,
-                               m_mpcd_pdata->getN(),
-                               m_tuner_sort->getParam()[0]);
-    if (m_exec_conf->isCUDAErrorCheckingEnabled())
-        CHECK_CUDA_ERROR();
-    m_tuner_sort->end();
-    }
-
 #ifdef ENABLE_MPI
 bool mpcd::CellListGPU::needsEmbedMigrate(uint64_t timestep)
     {
diff --git a/hoomd/mpcd/CellListGPU.cu b/hoomd/mpcd/CellListGPU.cu
index 40435b7946..7e3feb8376 100644
--- a/hoomd/mpcd/CellListGPU.cu
+++ b/hoomd/mpcd/CellListGPU.cu
@@ -19,12 +19,15 @@ namespace kernel
 //! Kernel to compute the MPCD cell list on the GPU
 /*!
  * \param d_cell_np Array of number of particles per cell
+ * \param d_cell_vel Array of center of mass velocity + cell mass per cell
+ * \param d_cell_energy Array of per cell kinetic energy, temperature, and dof
  * \param d_cell_list 2D array of MPCD particles in each cell
  * \param d_conditions Conditions flags for error reporting
  * \param d_vel MPCD particle velocities
  * \param d_embed_cell_ids Cell indexes of embedded particles
  * \param d_pos MPCD particle positions
  * \param d_pos_embed Particle positions
+ * \param d_vel_embed Particle velocities
  * \param d_embed_member_idx Indexes of embedded particles in \a d_pos_embed
  * \param periodic Flags if local simulation is periodic
  * \param origin_idx Global origin index for the local box
@@ -37,21 +40,26 @@ namespace kernel
  * \param cell_list_indexer 2D indexer for particle position in cell
  * \param N_mpcd Number of MPCD particles
  * \param N_tot Total number of particle (MPCD + embedded)
+ * \param need_energy True if computing the energy
  *
  * \b Implementation
  * One thread is launched per particle. The particle is floored into a bin subject to a random grid
- * shift. The number of particles in that bin is atomically incremented. If the addition of the
+ * shift. The number of particles in that bin is atomically incremented and the contribution of the
+ * particle's properties to the cell's properties is added to a running sum. If the addition of the
  * particle will not overflow the allocated memory, the particle is written into that bin.
  * Otherwise, a flag is set to resize the cell list and recompute. The MPCD particle's cell id is
  * stashed into the velocity array.
  */
 __global__ void compute_cell_list(unsigned int* d_cell_np,
+                                  double4* d_cell_vel,
+                                  double3* d_cell_energy,
                                   unsigned int* d_cell_list,
                                   uint3* d_conditions,
                                   Scalar4* d_vel,
                                   unsigned int* d_embed_cell_ids,
                                   const Scalar4* d_pos,
                                   const Scalar4* d_pos_embed,
+                                  const Scalar4* d_vel_embed,
                                   const unsigned int* d_embed_member_idx,
                                   const uchar3 periodic,
                                   const int3 origin_idx,
@@ -63,7 +71,8 @@ __global__ void compute_cell_list(unsigned int* d_cell_np,
                                   const Index3D cell_indexer,
                                   const Index2D cell_list_indexer,
                                   const unsigned int N_mpcd,
-                                  const unsigned int N_tot)
+                                  const unsigned int N_tot,
+                                  const bool need_energy)
     {
     // one thread per particle
     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -71,15 +80,22 @@ __global__ void compute_cell_list(unsigned int* d_cell_np,
         return;
 
     Scalar4 postype_i;
+    Scalar4 vel_mass_i;
+    double mass_i;
     if (idx < N_mpcd)
         {
         postype_i = d_pos[idx];
+        vel_mass_i = d_vel[idx];
+        mass_i = 1.0;
         }
     else
         {
         postype_i = d_pos_embed[d_embed_member_idx[idx - N_mpcd]];
+        vel_mass_i = d_vel_embed[d_embed_member_idx[idx - N_mpcd]];
+        mass_i = vel_mass_i.w;
         }
     const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z);
+    const double3 vel_i = make_double3(vel_mass_i.x, vel_mass_i.y, vel_mass_i.z);
 
     if (isnan(pos_i.x) || isnan(pos_i.y) || isnan(pos_i.z))
         {
@@ -175,6 +191,19 @@ __global__ void compute_cell_list(unsigned int* d_cell_np,
         {
         d_embed_cell_ids[idx - N_mpcd] = bin_idx;
         }
+
+    // compute the contribution of the particle to cell properties
+    atomicAdd(&d_cell_vel[bin_idx].x, vel_i.x * mass_i);
+    atomicAdd(&d_cell_vel[bin_idx].y, vel_i.y * mass_i);
+    atomicAdd(&d_cell_vel[bin_idx].z, vel_i.z * mass_i);
+    atomicAdd(&d_cell_vel[bin_idx].w, mass_i);
+
+    if (need_energy)
+        {
+        double ke = 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z);
+        atomicAdd(&d_cell_energy[bin_idx].x, ke);
+        atomicAdd(&d_cell_energy[bin_idx].z, __int_as_double(1));
+        }
     }
 
 /*!
@@ -218,47 +247,21 @@ __global__ void cell_check_migrate_embed(unsigned int* d_migrate_flag,
         }
     }
 
-__global__ void cell_apply_sort(unsigned int* d_cell_list,
-                                const unsigned int* d_rorder,
-                                const unsigned int* d_cell_np,
-                                const Index2D cli,
-                                const unsigned int N_mpcd,
-                                const unsigned int N_cli)
-    {
-    // one thread per cell-list entry
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= N_cli)
-        return;
-
-    // convert the entry 1D index into a 2D index
-    const unsigned int cell = idx / cli.getW();
-    const unsigned int offset = idx - (cell * cli.getW());
-
-    /* here comes some terrible execution divergence */
-    // check if the cell is filled
-    const unsigned int np = d_cell_np[cell];
-    if (offset < np)
-        {
-        // check if this is an MPCD particle
-        const unsigned int pid = d_cell_list[idx];
-        if (pid < N_mpcd)
-            {
-            d_cell_list[idx] = d_rorder[pid];
-            }
-        }
-    }
     } // end namespace kernel
     } // end namespace gpu
     } // end namespace mpcd
 
 /*!
  * \param d_cell_np Array of number of particles per cell
+ * \param d_cell_vel Array of center of mass velocity + cell mass per cell
+ * \param d_cell_energy Array of per cell kinetic energy, temperature, and dof
  * \param d_cell_list 2D array of MPCD particles in each cell
  * \param d_conditions Conditions flags for error reporting
  * \param d_vel MPCD particle velocities
  * \param d_embed_cell_ids Cell indexes of embedded particles
  * \param d_pos MPCD particle positions
  * \param d_pos_embed Particle positions
+ * \param d_vel_embed Particle velocities
  * \param d_embed_member_idx Indexes of embedded particles in \a d_pos_embed
  * \param periodic Flags if local simulation is periodic
  * \param origin_idx Global origin index for the local box
@@ -271,17 +274,21 @@ __global__ void cell_apply_sort(unsigned int* d_cell_list,
  * \param cell_list_indexer 2D indexer for particle position in cell
  * \param N_mpcd Number of MPCD particles
  * \param N_tot Total number of particle (MPCD + embedded)
+ * \param need_energy True if computing the energy
  * \param block_size Number of threads per block
  *
  * \returns cudaSuccess on completion, or an error on failure
  */
 cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np,
+                                         double4* d_cell_vel,
+                                         double3* d_cell_energy,
                                          unsigned int* d_cell_list,
                                          uint3* d_conditions,
                                          Scalar4* d_vel,
                                          unsigned int* d_embed_cell_ids,
                                          const Scalar4* d_pos,
                                          const Scalar4* d_pos_embed,
+                                         const Scalar4* d_vel_embed,
                                          const unsigned int* d_embed_member_idx,
                                          const uchar3& periodic,
                                          const int3& origin_idx,
@@ -294,6 +301,7 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np,
                                          const Index2D& cell_list_indexer,
                                          const unsigned int N_mpcd,
                                          const unsigned int N_tot,
+                                         const bool need_energy,
                                          const unsigned int block_size)
     {
     // set the number of particles in each cell to zero
@@ -301,7 +309,14 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np,
         = cudaMemset(d_cell_np, 0, sizeof(unsigned int) * cell_indexer.getNumElements());
     if (error != cudaSuccess)
         return error;
-
+    cudaError_t error_vel
+        = cudaMemset(d_cell_vel, 0, sizeof(double3) * cell_indexer.getNumElements());
+    if (error_vel != cudaSuccess)
+        return error_vel;
+    cudaError_t error_energy
+        = cudaMemset(d_cell_energy, 0, sizeof(double3) * cell_indexer.getNumElements());
+    if (error_energy != cudaSuccess)
+        return error_energy;
     unsigned int max_block_size;
     cudaFuncAttributes attr;
     cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::compute_cell_list);
@@ -310,12 +325,15 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np,
     unsigned int run_block_size = min(block_size, max_block_size);
     dim3 grid(N_tot / run_block_size + 1);
     mpcd::gpu::kernel::compute_cell_list<<<grid, run_block_size>>>(d_cell_np,
+                                                                   d_cell_vel,
+                                                                   d_cell_energy,
                                                                    d_cell_list,
                                                                    d_conditions,
                                                                    d_vel,
                                                                    d_embed_cell_ids,
                                                                    d_pos,
                                                                    d_pos_embed,
+                                                                   d_vel_embed,
                                                                    d_embed_member_idx,
                                                                    periodic,
                                                                    origin_idx,
@@ -327,7 +345,8 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np,
                                                                    cell_indexer,
                                                                    cell_list_indexer,
                                                                    N_mpcd,
-                                                                   N_tot);
+                                                                   N_tot,
+                                                                   need_energy);
 
     return cudaSuccess;
     }
@@ -370,30 +389,4 @@ cudaError_t mpcd::gpu::cell_check_migrate_embed(unsigned int* d_migrate_flag,
     return cudaSuccess;
     }
 
-cudaError_t mpcd::gpu::cell_apply_sort(unsigned int* d_cell_list,
-                                       const unsigned int* d_rorder,
-                                       const unsigned int* d_cell_np,
-                                       const Index2D& cli,
-                                       const unsigned int N_mpcd,
-                                       const unsigned int block_size)
-    {
-    unsigned int max_block_size;
-    cudaFuncAttributes attr;
-    cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::cell_apply_sort);
-    max_block_size = attr.maxThreadsPerBlock;
-
-    const unsigned int N_cli = cli.getNumElements();
-
-    unsigned int run_block_size = min(block_size, max_block_size);
-    dim3 grid(N_cli / run_block_size + 1);
-    mpcd::gpu::kernel::cell_apply_sort<<<grid, run_block_size>>>(d_cell_list,
-                                                                 d_rorder,
-                                                                 d_cell_np,
-                                                                 cli,
-                                                                 N_mpcd,
-                                                                 N_cli);
-
-    return cudaSuccess;
-    }
-
     } // end namespace hoomd
diff --git a/hoomd/mpcd/CellListGPU.cuh b/hoomd/mpcd/CellListGPU.cuh
index e74440ba81..39287c3a64 100644
--- a/hoomd/mpcd/CellListGPU.cuh
+++ b/hoomd/mpcd/CellListGPU.cuh
@@ -23,12 +23,15 @@ namespace gpu
     {
 //! Kernel driver to compute mpcd cell list
 cudaError_t compute_cell_list(unsigned int* d_cell_np,
+                              double4* d_cell_vel,
+                              double3* d_cell_energy,
                               unsigned int* d_cell_list,
                               uint3* d_conditions,
                               Scalar4* d_vel,
                               unsigned int* d_embed_cell_ids,
                               const Scalar4* d_pos,
                               const Scalar4* d_pos_embed,
+                              const Scalar4* d_vel_embed,
                               const unsigned int* d_embed_member_idx,
                               const uchar3& periodic,
                               const int3& origin_idx,
@@ -41,6 +44,7 @@ cudaError_t compute_cell_list(unsigned int* d_cell_np,
                               const Index2D& cell_list_indexer,
                               const unsigned int N_mpcd,
                               const unsigned int N_tot,
+                              const bool need_energy,
                               const unsigned int block_size);
 
 //! Kernel driver to check if any embedded particles require migration
@@ -52,14 +56,6 @@ cudaError_t cell_check_migrate_embed(unsigned int* d_migrate_flag,
                                      const unsigned int N,
                                      const unsigned int block_size);
 
-//! Kernel drive to apply sorted order to MPCD particles in cell list
-cudaError_t cell_apply_sort(unsigned int* d_cell_list,
-                            const unsigned int* d_rorder,
-                            const unsigned int* d_cell_np,
-                            const Index2D& cli,
-                            const unsigned int N_mpcd,
-                            const unsigned int block_size);
-
     } // end namespace gpu
     } // end namespace mpcd
     } // end namespace hoomd
diff --git a/hoomd/mpcd/CellListGPU.h b/hoomd/mpcd/CellListGPU.h
index f9f78f8779..c58ecce123 100644
--- a/hoomd/mpcd/CellListGPU.h
+++ b/hoomd/mpcd/CellListGPU.h
@@ -36,11 +36,6 @@ class PYBIND11_EXPORT CellListGPU : public mpcd::CellList
     //! Compute the cell list of particles on the GPU
     void buildCellList() override;
 
-    //! Callback to sort cell list on the GPU when particle data is sorted
-    virtual void sort(uint64_t timestep,
-                      const GPUArray<unsigned int>& order,
-                      const GPUArray<unsigned int>& rorder);
-
 #ifdef ENABLE_MPI
     //! Determine if embedded particles require migration on the gpu
     virtual bool needsEmbedMigrate(uint64_t timestep);
@@ -51,8 +46,6 @@ class PYBIND11_EXPORT CellListGPU : public mpcd::CellList
     /// Autotuner for the cell list calculation.
     std::shared_ptr<Autotuner<1>> m_tuner_cell;
 
-    /// Autotuner for sorting the cell list.
-    std::shared_ptr<Autotuner<1>> m_tuner_sort;
 #ifdef ENABLE_MPI
     /// Autotuner for checking embedded migration.
     std::shared_ptr<Autotuner<1>> m_tuner_embed_migrate;
diff --git a/hoomd/mpcd/CollisionMethod.cc b/hoomd/mpcd/CollisionMethod.cc
index b7d59b66f7..e1fe10c9ef 100644
--- a/hoomd/mpcd/CollisionMethod.cc
+++ b/hoomd/mpcd/CollisionMethod.cc
@@ -99,12 +99,6 @@ void mpcd::CollisionMethod::collide(uint64_t timestep)
         }
 #endif // ENABLE_HIP
 
-    // set random grid shift
-    m_cl->drawGridShift(timestep);
-
-    // update cell list
-    m_cl->compute(timestep);
-
     // setup rigid bodies before collision happens
     const bool rigid_body_collision = m_embed_group && m_rigid_bodies;
     if (rigid_body_collision)
@@ -143,6 +137,12 @@ void mpcd::CollisionMethod::collide(uint64_t timestep)
             }
         }
 
+    // set random grid shift
+    m_cl->drawGridShift(timestep);
+
+    // update cell list
+    m_cl->compute(timestep);
+
     // apply collisions
     rule(timestep);
 
diff --git a/hoomd/mpcd/SRDCollisionMethod.cc b/hoomd/mpcd/SRDCollisionMethod.cc
index c69b8ed16e..fc9d32679c 100644
--- a/hoomd/mpcd/SRDCollisionMethod.cc
+++ b/hoomd/mpcd/SRDCollisionMethod.cc
@@ -32,22 +32,20 @@ mpcd::SRDCollisionMethod::~SRDCollisionMethod()
 void mpcd::SRDCollisionMethod::startAutotuning()
     {
     mpcd::CollisionMethod::startAutotuning();
-    if (m_thermo)
-        m_thermo->startAutotuning();
+    if (m_cl)
+        m_cl->startAutotuning();
     }
 
 bool mpcd::SRDCollisionMethod::isAutotuningComplete()
     {
     bool result = mpcd::CollisionMethod::isAutotuningComplete();
-    if (m_thermo)
-        result = result && m_thermo->isAutotuningComplete();
+    if (m_cl)
+        result = result && m_cl->isAutotuningComplete();
     return result;
     }
 
 void mpcd::SRDCollisionMethod::rule(uint64_t timestep)
     {
-    m_thermo->compute(timestep);
-
     // resize the rotation vectors and rescale factors
     m_rotvec.resize(m_cl->getNCells());
     if (m_T)
@@ -78,7 +76,7 @@ void mpcd::SRDCollisionMethod::drawRotationVectors(uint64_t timestep)
         {
         h_factors.reset(
             new ArrayHandle<double>(m_factors, access_location::host, access_mode::overwrite));
-        h_cell_energy.reset(new ArrayHandle<double3>(m_thermo->getCellEnergies(),
+        h_cell_energy.reset(new ArrayHandle<double3>(m_cl->getCellEnergies(),
                                                      access_location::host,
                                                      access_mode::read));
         T_set = (*m_T)(timestep);
@@ -162,7 +160,7 @@ void mpcd::SRDCollisionMethod::rotate(uint64_t timestep)
         }
 
     // acquire cell velocities
-    ArrayHandle<double4> h_cell_vel(m_thermo->getCellVelocities(),
+    ArrayHandle<double4> h_cell_vel(m_cl->getCellVelocities(),
                                     access_location::host,
                                     access_mode::read);
 
@@ -260,33 +258,28 @@ void mpcd::SRDCollisionMethod::setCellList(std::shared_ptr<mpcd::CellList> cl)
     {
     if (cl != m_cl)
         {
-        CollisionMethod::setCellList(cl);
         detachCallbacks();
+        CollisionMethod::setCellList(cl);
         if (m_cl)
             {
-            m_thermo = std::make_shared<mpcd::CellThermoCompute>(m_sysdef, m_cl);
             attachCallbacks();
             }
-        else
-            {
-            m_thermo = std::shared_ptr<mpcd::CellThermoCompute>();
-            }
         }
     }
 
 void mpcd::SRDCollisionMethod::attachCallbacks()
     {
-    assert(m_thermo);
-    m_thermo->getFlagsSignal()
+    assert(m_cl);
+    m_cl->getFlagsSignal()
         .connect<mpcd::SRDCollisionMethod, &mpcd::SRDCollisionMethod::getRequestedThermoFlags>(
             this);
     }
 
 void mpcd::SRDCollisionMethod::detachCallbacks()
     {
-    if (m_thermo)
+    if (m_cl)
         {
-        m_thermo->getFlagsSignal()
+        m_cl->getFlagsSignal()
             .disconnect<mpcd::SRDCollisionMethod,
                         &mpcd::SRDCollisionMethod::getRequestedThermoFlags>(this);
         }
diff --git a/hoomd/mpcd/SRDCollisionMethod.h b/hoomd/mpcd/SRDCollisionMethod.h
index d4732269dd..2d7c09b84e 100644
--- a/hoomd/mpcd/SRDCollisionMethod.h
+++ b/hoomd/mpcd/SRDCollisionMethod.h
@@ -13,7 +13,6 @@
 #error This header cannot be compiled by nvcc
 #endif
 
-#include "CellThermoCompute.h"
 #include "CollisionMethod.h"
 
 #include "hoomd/Variant.h"
@@ -81,9 +80,8 @@ class PYBIND11_EXPORT SRDCollisionMethod : public mpcd::CollisionMethod
         }
 
     protected:
-    std::shared_ptr<mpcd::CellThermoCompute> m_thermo; //!< Cell thermo
-    GPUVector<double3> m_rotvec;                       //!< MPCD rotation vectors
-    Scalar m_angle;                                    //!< MPCD rotation angle (degrees)
+    GPUVector<double3> m_rotvec; //!< MPCD rotation vectors
+    Scalar m_angle;              //!< MPCD rotation angle (degrees)
 
     GPUVector<double> m_factors; //!< Cell-level rescale factors
 
diff --git a/hoomd/mpcd/SRDCollisionMethodGPU.cc b/hoomd/mpcd/SRDCollisionMethodGPU.cc
index bb13dd00bb..169f4a10ae 100644
--- a/hoomd/mpcd/SRDCollisionMethodGPU.cc
+++ b/hoomd/mpcd/SRDCollisionMethodGPU.cc
@@ -37,7 +37,7 @@ void mpcd::SRDCollisionMethodGPU::drawRotationVectors(uint64_t timestep)
     if (m_T)
         {
         ArrayHandle<double> d_factors(m_factors, access_location::device, access_mode::overwrite);
-        ArrayHandle<double3> d_cell_energy(m_thermo->getCellEnergies(),
+        ArrayHandle<double3> d_cell_energy(m_cl->getCellEnergies(),
                                            access_location::device,
                                            access_mode::read);
 
@@ -89,7 +89,7 @@ void mpcd::SRDCollisionMethodGPU::rotate(uint64_t timestep)
     unsigned int N_tot = N_mpcd;
 
     // acquire cell velocities and rotation vectors
-    ArrayHandle<double4> d_cell_vel(m_thermo->getCellVelocities(),
+    ArrayHandle<double4> d_cell_vel(m_cl->getCellVelocities(),
                                     access_location::device,
                                     access_mode::read);
     ArrayHandle<double3> d_rotvec(m_rotvec, access_location::device, access_mode::read);
@@ -162,13 +162,8 @@ void mpcd::SRDCollisionMethodGPU::setCellList(std::shared_ptr<mpcd::CellList> cl
         detachCallbacks();
         if (m_cl)
             {
-            m_thermo = std::make_shared<mpcd::CellThermoComputeGPU>(m_sysdef, m_cl);
             attachCallbacks();
             }
-        else
-            {
-            m_thermo = std::shared_ptr<mpcd::CellThermoComputeGPU>();
-            }
         }
     }
 
diff --git a/hoomd/mpcd/Sorter.cc b/hoomd/mpcd/Sorter.cc
index cc178c372a..69089413ce 100644
--- a/hoomd/mpcd/Sorter.cc
+++ b/hoomd/mpcd/Sorter.cc
@@ -28,7 +28,7 @@ mpcd::Sorter::~Sorter()
 /*!
  * \param timestep Current simulation timestep
  *
- * This method is just a driver for the computeOrder() and applyOrder() methods.
+ * This method is just a driver for the computeOrder() and applySortOrder() methods.
  */
 void mpcd::Sorter::update(uint64_t timestep)
     {
@@ -43,7 +43,7 @@ void mpcd::Sorter::update(uint64_t timestep)
 
     // generate and apply the sorted order
     computeOrder(timestep);
-    applyOrder();
+    applySortOrder();
 
     // trigger the sort signal for ParticleData callbacks using the current sortings
     m_mpcd_pdata->notifySort(timestep, m_order, m_rorder);
@@ -100,7 +100,7 @@ void mpcd::Sorter::computeOrder(uint64_t timestep)
  * arrays. The communication flags are \b not sorted in MPI because by design,
  * the caller is responsible for clearing out any old flags before using them.
  */
-void mpcd::Sorter::applyOrder() const
+void mpcd::Sorter::applySortOrder() const
     {
         // apply the sorted order
         {
diff --git a/hoomd/mpcd/Sorter.h b/hoomd/mpcd/Sorter.h
index 3a308cca0f..12f81739b0 100644
--- a/hoomd/mpcd/Sorter.h
+++ b/hoomd/mpcd/Sorter.h
@@ -28,7 +28,7 @@ namespace mpcd
 /*!
  * The Sorter puts MPCD particles into an order that is more cache-friendly.
  * The natural way to do this for the algorithm is cell list order. The base
- * Sorter implements applyOrder() for applying the reordering map to the
+ * Sorter implements applySortOrder() for applying the reordering map to the
  * mpcd::ParticleData. Specific sorting algorithms can be implemented by
  * deriving from Sorter and implementing computeOrder(). Any computeOrder()
  * must set the map from old particle index to new particle index, and the
@@ -71,7 +71,7 @@ class PYBIND11_EXPORT Sorter : public Tuner
     virtual void computeOrder(uint64_t timestep);
 
     //! Apply the sorting order
-    virtual void applyOrder() const;
+    virtual void applySortOrder() const;
     };
     } // end namespace mpcd
     } // end namespace hoomd
diff --git a/hoomd/mpcd/SorterGPU.cc b/hoomd/mpcd/SorterGPU.cc
index 897b81919b..2620b507c0 100644
--- a/hoomd/mpcd/SorterGPU.cc
+++ b/hoomd/mpcd/SorterGPU.cc
@@ -105,7 +105,7 @@ void mpcd::SorterGPU::computeOrder(uint64_t timestep)
  * arrays. The communication flags are \b not sorted in MPI because by design,
  * the caller is responsible for clearing out any old flags before using them.
  */
-void mpcd::SorterGPU::applyOrder() const
+void mpcd::SorterGPU::applySortOrder() const
     {
         // apply the sorted order
         {
diff --git a/hoomd/mpcd/SorterGPU.h b/hoomd/mpcd/SorterGPU.h
index 566db8aeb7..c7dd719b27 100644
--- a/hoomd/mpcd/SorterGPU.h
+++ b/hoomd/mpcd/SorterGPU.h
@@ -45,7 +45,7 @@ class PYBIND11_EXPORT SorterGPU : public mpcd::Sorter
     void computeOrder(uint64_t timestep) override;
 
     //! Apply the sorting order on the GPU
-    void applyOrder() const override;
+    void applySortOrder() const override;
     };
     } // end namespace mpcd
     } // end namespace hoomd
diff --git a/hoomd/mpcd/pytest/test_collide.py b/hoomd/mpcd/pytest/test_collide.py
index 3bc4852f9b..42ee97f2c5 100644
--- a/hoomd/mpcd/pytest/test_collide.py
+++ b/hoomd/mpcd/pytest/test_collide.py
@@ -139,7 +139,7 @@ def test_run(self, small_snap, simulation_factory, cls, init_args):
         "angmom_rigid", [[0, 0, 0, 0], [0, 2, 3, 4]], ids=["Nonrotating", "Rotating"]
     )
     @pytest.mark.parametrize(
-        "pos_rigid", [[0, 0, 0], [10, 10, 10]], ids=["center", "edge"]
+        "pos_rigid", [[0, 0, 0], [5, 5, 5]], ids=["center", "edge"]
     )
     @pytest.mark.parametrize(
         "def_rigid,properties_rigid",
@@ -216,7 +216,7 @@ def test_rigid_collide(
 
         # create simulation
         initial_snap = one_particle_snapshot_factory(
-            particle_types=["A", "B"], position=pos_rigid, L=21
+            particle_types=["A", "B"], position=pos_rigid, L=11
         )
         total_mass = properties_rigid["mass"][0]
         if initial_snap.communicator.rank == 0:
@@ -226,9 +226,12 @@ def test_rigid_collide(
             initial_snap.particles.angmom[:] = [angmom_rigid]
 
             # place the mpcd particles on top of constituents
+            positions = np.add(def_rigid["positions"], pos_rigid)
+            positions[positions < -11 * 0.5] = positions[positions < -11 * 0.5] + 11
+            positions[positions > 11 * 0.5] = positions[positions > 11 * 0.5] - 11
             initial_snap.mpcd.N = N_mpcd
             initial_snap.mpcd.types = ["C"]
-            initial_snap.mpcd.position[:] = def_rigid["positions"]
+            initial_snap.mpcd.position[:] = positions
             initial_snap.mpcd.velocity[:] = velo_mpcd
 
         sim = simulation_factory(initial_snap)
@@ -338,7 +341,7 @@ def test_rigid_collide_free(
         # create simulation
         total_mass = properties_rigid["mass"][0]
         initial_snap = two_particle_snapshot_factory(
-            particle_types=["A", "B", "C", "D"], L=21
+            particle_types=["A", "B", "C", "D"], L=11
         )
         if initial_snap.communicator.rank == 0:
             # put a free particle that doesn't participate in collision on top of