diff --git a/hoomd/mpcd/CMakeLists.txt b/hoomd/mpcd/CMakeLists.txt index 7ae9b10b8d..a0b5102004 100644 --- a/hoomd/mpcd/CMakeLists.txt +++ b/hoomd/mpcd/CMakeLists.txt @@ -5,9 +5,6 @@ endif() set(_mpcd_cc_sources module.cc - ATCollisionMethod.cc - CellCommunicator.cc - CellThermoCompute.cc CellList.cc CollisionMethod.cc Communicator.cc @@ -23,12 +20,9 @@ set(_mpcd_cc_sources ) set(_mpcd_headers - ATCollisionMethod.h BounceBackNVE.h BulkGeometry.h BulkStreamingMethod.h - CellCommunicator.h - CellThermoCompute.h CellList.h CollisionMethod.h BounceBackStreamingMethod.h @@ -70,8 +64,6 @@ set(_geometries if(ENABLE_HIP) list(APPEND _mpcd_cc_sources - ATCollisionMethodGPU.cc - CellThermoComputeGPU.cc CellListGPU.cc CommunicatorGPU.cc ParallelPlateGeometryFillerGPU.cc @@ -81,16 +73,11 @@ if(ENABLE_HIP) SRDCollisionMethodGPU.cc ) list(APPEND _mpcd_headers - ATCollisionMethodGPU.cuh - ATCollisionMethodGPU.h BounceBackNVEGPU.cuh BounceBackNVEGPU.h BounceBackStreamingMethodGPU.cuh BounceBackStreamingMethodGPU.h BulkStreamingMethodGPU.h - CellCommunicator.cuh - CellThermoComputeGPU.cuh - CellThermoComputeGPU.h CellListGPU.cuh CellListGPU.h CollisionMethod.cuh @@ -111,9 +98,7 @@ if(ENABLE_HIP) SRDCollisionMethodGPU.h ) set(_mpcd_cu_sources - ATCollisionMethodGPU.cu BounceBackNVEGPU.cu - CellThermoComputeGPU.cu CellListGPU.cu CollisionMethod.cu CommunicatorGPU.cu diff --git a/hoomd/mpcd/CellCommunicator.cc b/hoomd/mpcd/CellCommunicator.cc deleted file mode 100644 index ee52380b1c..0000000000 --- a/hoomd/mpcd/CellCommunicator.cc +++ /dev/null @@ -1,374 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellCommunicator.cc - * \brief Definition of mpcd::CellCommunicator - */ - -#ifdef ENABLE_MPI - -#include "CellCommunicator.h" - -namespace hoomd - { -// initialize with zero instances of the communicator -unsigned int mpcd::CellCommunicator::num_instances = 0; - -/*! - * \param sysdef System definition - * \param cl MPCD cell list - */ -mpcd::CellCommunicator::CellCommunicator(std::shared_ptr sysdef, - std::shared_ptr cl) - : m_id(num_instances++), m_sysdef(sysdef), m_pdata(sysdef->getParticleData()), - m_exec_conf(m_pdata->getExecConf()), m_mpi_comm(m_exec_conf->getMPICommunicator()), - m_decomposition(m_pdata->getDomainDecomposition()), m_cl(cl), m_communicating(false), - m_send_buf(m_exec_conf), m_recv_buf(m_exec_conf), m_needs_init(true) - { - m_exec_conf->msg->notice(5) << "Constructing MPCD CellCommunicator" << std::endl; -#ifdef ENABLE_HIP - if (m_exec_conf->isCUDAEnabled()) - { - m_tuner_pack.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_comm_pack_" + std::to_string(m_id))); - m_tuner_unpack.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_comm_unpack_" + std::to_string(m_id))); - m_autotuners.insert(m_autotuners.end(), {m_tuner_pack, m_tuner_unpack}); - } -#endif // ENABLE_HIP - - m_cl->getSizeChangeSignal().connect( - this); - } - -mpcd::CellCommunicator::~CellCommunicator() - { - m_exec_conf->msg->notice(5) << "Destroying MPCD CellCommunicator" << std::endl; - m_cl->getSizeChangeSignal() - .disconnect(this); - } - -namespace mpcd - { -namespace detail - { -//! Unary operator to wrap global cell indexes into the local domain -struct LocalCellWrapOp - { - LocalCellWrapOp(std::shared_ptr cl_) - : cl(cl_), ci(cl_->getCellIndexer()), gci(cl_->getGlobalCellIndexer()) - { - } - - //! 
Transform the global 1D cell index into a local 1D cell index - inline unsigned int operator()(unsigned int cell_idx) - { - // convert the 1D global cell index to a global cell tuple - const uint3 cell = gci.getTriple(cell_idx); - - // convert the global cell tuple to a local cell tuple - int3 local_cell = cl->getLocalCell(make_int3(cell.x, cell.y, cell.z)); - - // wrap the local cell through the global boundaries, which should work for all reasonable - // cell comms. - if (local_cell.x >= (int)gci.getW()) - local_cell.x -= gci.getW(); - else if (local_cell.x < 0) - local_cell.x += gci.getW(); - - if (local_cell.y >= (int)gci.getH()) - local_cell.y -= gci.getH(); - else if (local_cell.y < 0) - local_cell.y += gci.getH(); - - if (local_cell.z >= (int)gci.getD()) - local_cell.z -= gci.getD(); - else if (local_cell.z < 0) - local_cell.z += gci.getD(); - - // convert the local cell tuple back to an index - return ci(local_cell.x, local_cell.y, local_cell.z); - } - - std::shared_ptr cl; //!< Cell list - const Index3D ci; //!< Cell indexer - const Index3D gci; //!< Global cell indexer - }; - } // end namespace detail - } // end namespace mpcd - -void mpcd::CellCommunicator::initialize() - { - // obtain domain decomposition - const Index3D& di = m_decomposition->getDomainIndexer(); - ArrayHandle h_cart_ranks(m_decomposition->getCartRanks(), - access_location::host, - access_mode::read); - const uint3 my_pos = m_decomposition->getGridPos(); - - // use the cell list to compute the bounds - const Index3D& ci = m_cl->getCellIndexer(); - const Index3D& global_ci = m_cl->getGlobalCellIndexer(); - auto num_comm_cells = m_cl->getNComm(); - const uint3 max_lo - = make_uint3(num_comm_cells[static_cast(mpcd::detail::face::west)], - num_comm_cells[static_cast(mpcd::detail::face::south)], - num_comm_cells[static_cast(mpcd::detail::face::down)]); - const uint3 min_hi = make_uint3( - ci.getW() - num_comm_cells[static_cast(mpcd::detail::face::east)], - ci.getH() - num_comm_cells[static_cast(mpcd::detail::face::north)], - ci.getD() - num_comm_cells[static_cast(mpcd::detail::face::up)]); - - // check to make sure box is not overdecomposed - { - const unsigned int nextra = m_cl->getNExtraCells(); - unsigned int err = ((max_lo.x + nextra) > min_hi.x || (max_lo.y + nextra) > min_hi.y - || (max_lo.z + nextra) > min_hi.z); - MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_UNSIGNED, MPI_MAX, m_mpi_comm); - if (err) - { - m_exec_conf->msg->error() - << "mpcd: Simulation box is overdecomposed, decrease the number of ranks." 
- << std::endl; - throw std::runtime_error("Simulation box is overdecomposed for MPCD"); - } - } - - // loop over all cells in the grid and determine where to send them - std::multimap send_map; - std::set neighbors; - for (unsigned int k = 0; k < ci.getD(); ++k) - { - for (unsigned int j = 0; j < ci.getH(); ++j) - { - for (unsigned int i = 0; i < ci.getW(); ++i) - { - // skip any cells interior to the grid, which will not be communicated - // this is wasteful loop logic, but initialize will only be called rarely - if (i >= max_lo.x && i < min_hi.x && j >= max_lo.y && j < min_hi.y && k >= max_lo.z - && k < min_hi.z) - continue; - - // obtain the 1D global index of this cell - const int3 global_cell = m_cl->getGlobalCell(make_int3(i, j, k)); - const unsigned int global_cell_idx - = global_ci(global_cell.x, global_cell.y, global_cell.z); - - // check which direction the cell lies off rank in x,y,z - std::vector dx = {0}; - if (i < max_lo.x) - dx.push_back(-1); - else if (i >= min_hi.x) - dx.push_back(1); - - std::vector dy = {0}; - if (j < max_lo.y) - dy.push_back(-1); - else if (j >= min_hi.y) - dy.push_back(1); - - std::vector dz = {0}; - if (k < max_lo.z) - dz.push_back(-1); - else if (k >= min_hi.z) - dz.push_back(1); - - // generate all permutations of these neighbors for the cell - for (auto ddx = dx.begin(); ddx != dx.end(); ++ddx) - { - for (auto ddy = dy.begin(); ddy != dy.end(); ++ddy) - { - for (auto ddz = dz.begin(); ddz != dz.end(); ++ddz) - { - // skip self - if (*ddx == 0 && *ddy == 0 && *ddz == 0) - continue; - - // get neighbor rank tuple - int3 neigh = make_int3((int)my_pos.x + *ddx, - (int)my_pos.y + *ddy, - (int)my_pos.z + *ddz); - - // wrap neighbor through the boundaries - if (neigh.x < 0) - neigh.x += di.getW(); - else if (neigh.x >= (int)di.getW()) - neigh.x -= di.getW(); - - if (neigh.y < 0) - neigh.y += di.getH(); - else if (neigh.y >= (int)di.getH()) - neigh.y -= di.getH(); - - if (neigh.z < 0) - neigh.z += di.getD(); - else if (neigh.z >= (int)di.getD()) - neigh.z -= di.getD(); - - // convert neighbor to a linear rank and push it into the unique - // neighbor set - const unsigned int neigh_rank - = h_cart_ranks.data[di(neigh.x, neigh.y, neigh.z)]; - neighbors.insert(neigh_rank); - send_map.insert(std::make_pair(neigh_rank, global_cell_idx)); - } // ddz - } // ddy - } // ddx - } // i - } // j - } // k - - // allocate send / receive index arrays - { - GPUArray send_idx(send_map.size(), m_exec_conf); - m_send_idx.swap(send_idx); - } - - // fill the send indexes with the global values - // flood the array of unique neighbors and count the number to send - { - ArrayHandle h_send_idx(m_send_idx, - access_location::host, - access_mode::overwrite); - unsigned int idx = 0; - for (auto it = send_map.begin(); it != send_map.end(); ++it) - { - h_send_idx.data[idx++] = it->second; - } - - m_neighbors.resize(neighbors.size()); - m_begin.resize(m_neighbors.size()); - m_num_send.resize(m_neighbors.size()); - idx = 0; - for (auto it = neighbors.begin(); it != neighbors.end(); ++it) - { - auto lower = send_map.lower_bound(*it); - auto upper = send_map.upper_bound(*it); - - m_neighbors[idx] = *it; - m_begin[idx] = (unsigned int)std::distance(send_map.begin(), lower); - m_num_send[idx] = (unsigned int)std::distance(lower, upper); - ++idx; - } - } - - // send / receive the global cell indexes to be communicated with neighbors - std::vector recv_idx(m_send_idx.getNumElements()); - { - ArrayHandle h_send_idx(m_send_idx, access_location::host, access_mode::read); - - m_reqs.resize(2 * 
m_neighbors.size()); - for (unsigned int idx = 0; idx < m_neighbors.size(); ++idx) - { - const unsigned int offset = m_begin[idx]; - MPI_Isend(h_send_idx.data + offset, - m_num_send[idx], - MPI_INT, - m_neighbors[idx], - 0, - m_mpi_comm, - &m_reqs[2 * idx]); - MPI_Irecv(recv_idx.data() + offset, - m_num_send[idx], - MPI_INT, - m_neighbors[idx], - 0, - m_mpi_comm, - &m_reqs[2 * idx + 1]); - } - MPI_Waitall((unsigned int)m_reqs.size(), m_reqs.data(), MPI_STATUSES_IGNORE); - } - - // transform all of the global cell indexes back into local cell indexes - { - ArrayHandle h_send_idx(m_send_idx, - access_location::host, - access_mode::readwrite); - - mpcd::detail::LocalCellWrapOp wrapper(m_cl); - std::transform(h_send_idx.data, - h_send_idx.data + m_send_idx.getNumElements(), - h_send_idx.data, - wrapper); - std::transform(recv_idx.begin(), recv_idx.end(), recv_idx.begin(), wrapper); - } - - // map the received cells from a rank-basis to a cell-basis - { - std::multimap cell_map; - std::set unique_cells; - for (unsigned int idx = 0; idx < recv_idx.size(); ++idx) - { - const unsigned int cell = recv_idx[idx]; - unique_cells.insert(cell); - cell_map.insert(std::make_pair(cell, idx)); - } - m_num_cells = (unsigned int)unique_cells.size(); - - /* - * Allocate auxiliary memory for receiving cell reordering - */ - { - GPUArray recv(recv_idx.size(), m_exec_conf); - m_recv.swap(recv); - - GPUArray cells(m_num_cells, m_exec_conf); - m_cells.swap(cells); - - GPUArray recv_begin(m_num_cells, m_exec_conf); - m_recv_begin.swap(recv_begin); - - GPUArray recv_end(m_num_cells, m_exec_conf); - m_recv_end.swap(recv_end); - } - - /* - * Generate the compacted list of unique cells - */ - ArrayHandle h_cells(m_cells, access_location::host, access_mode::overwrite); - unsigned int idx = 0; - for (auto it = unique_cells.begin(); it != unique_cells.end(); ++it) - { - h_cells.data[idx++] = *it; - } - - /* - * Loop over the cell map to do run-length encoding on the keys. This - * determines the range of data belonging to each received cell. - */ - ArrayHandle h_recv(m_recv, access_location::host, access_mode::overwrite); - ArrayHandle h_recv_begin(m_recv_begin, - access_location::host, - access_mode::overwrite); - ArrayHandle h_recv_end(m_recv_end, - access_location::host, - access_mode::overwrite); - unsigned int last_cell = UINT_MAX; - unsigned int cell_idx = 0; - idx = 0; - h_recv_begin.data[cell_idx] = idx; - for (auto it = cell_map.begin(); it != cell_map.end(); ++it) - { - // record the sorted receive index - h_recv.data[idx] = it->second; - - // if not very first pass and the current cell does not match the - // last cell, then we are on a new cell, and need to demark an end / begin - if (last_cell != UINT_MAX && it->first != last_cell) - { - h_recv_end.data[cell_idx] = idx; - h_recv_begin.data[++cell_idx] = idx; - } - last_cell = it->first; - - ++idx; - } - h_recv_end.data[cell_idx] = idx; - } - } - } // end namespace hoomd - -#endif // ENABLE_MPI diff --git a/hoomd/mpcd/CellCommunicator.cuh b/hoomd/mpcd/CellCommunicator.cuh deleted file mode 100644 index 3bba09f5f6..0000000000 --- a/hoomd/mpcd/CellCommunicator.cuh +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! 
- * \file mpcd/CellCommunicator.cuh - * \brief Declaration of CUDA kernels for mpcd::CellCommunicator - */ - -#ifndef MPCD_CELL_COMMUNICATOR_CUH_ -#define MPCD_CELL_COMMUNICATOR_CUH_ - -#include "hoomd/HOOMDMath.h" -#include "hoomd/Index1D.h" - -#include - -namespace hoomd - { -namespace mpcd - { -namespace gpu - { -//! Kernel driver to pack cell communication buffer -template -cudaError_t __attribute__((visibility("default"))) -pack_cell_buffer(typename PackOpT::element* d_send_buf, - const T* d_props, - const unsigned int* d_send_idx, - const PackOpT op, - const unsigned int num_send, - unsigned int block_size); - -//! Kernel driver to unpack cell communication buffer -template -cudaError_t __attribute__((visibility("default"))) -unpack_cell_buffer(T* d_props, - const unsigned int* d_cells, - const unsigned int* d_recv, - const unsigned int* d_recv_begin, - const unsigned int* d_recv_end, - const typename PackOpT::element* d_recv_buf, - const PackOpT op, - const unsigned int num_cells, - const unsigned int block_size); - -#ifdef __HIPCC__ - -namespace kernel - { -//! Kernel to pack cell communication buffer -/*! - * \param d_send_buf Send buffer to pack (output) - * \param d_props Cell property to pack - * \param d_send_idx Cell indexes to pack into the buffer - * \param op Pack operator - * \param num_send Number of cells to pack - * \param block_size Number of threads per block - * - * \tparam T Type of data to pack (inferred) - * \tparam PackOpT Pack operation type (inferred) - * - * \b Implementation details: - * Using one thread per cell to pack, each cell to send has its properties read - * and packed into the send buffer. - */ -template -__global__ void pack_cell_buffer(typename PackOpT::element* d_send_buf, - const T* d_props, - const unsigned int* d_send_idx, - const PackOpT op, - const unsigned int num_send) - { - // one thread per buffer cell - unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_send) - return; - - const unsigned int send_idx = d_send_idx[idx]; - d_send_buf[idx] = op.pack(d_props[send_idx]); - } - -//! Kernel to unpack cell communication buffer -/*! - * \param d_props Cell property array to unpack into - * \param d_cells List of unique cells to unpack - * \param d_recv Map of unique cells onto the receive buffer - * \param d_recv_begin Beginning index into \a d_recv for cells to unpack - * \param d_recv_end End index (exclusive) into \a d_recv for cells to unpack - * \param d_recv_buf Received buffer from neighbor ranks - * \param op Packing operator - * \param num_cells Number of cells to unpack - * - * \tparam T Data type to unpack (inferred) - * \tparam PackOpT Pack operation type (inferred) - * - * \b Implementation details: - * Using one thread per cell to unpack, each cell iterates over the cells it - * has received, and applies the unpacking operator to each in turn. 
- */ -template -__global__ void unpack_cell_buffer(T* d_props, - const unsigned int* d_cells, - const unsigned int* d_recv, - const unsigned int* d_recv_begin, - const unsigned int* d_recv_end, - const typename PackOpT::element* d_recv_buf, - const PackOpT op, - const unsigned int num_cells) - { - // one thread per particle - unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_cells) - return; - - const unsigned int cell_idx = d_cells[idx]; - const unsigned int begin = d_recv_begin[idx]; - const unsigned int end = d_recv_end[idx]; - - // loop through all received data for this cell, and unpack it iteratively - T val = d_props[cell_idx]; - for (unsigned int i = begin; i < end; ++i) - { - val = op.unpack(d_recv_buf[d_recv[i]], val); - } - - // save the accumulated unpacked value - d_props[cell_idx] = val; - } - - } // end namespace kernel - -/*! - * \param d_send_buf Send buffer to pack (output) - * \param d_props Cell property to pack - * \param d_send_idx Cell indexes to pack into the buffer - * \param op Pack operator - * \param num_send Number of cells to pack - * \param block_size Number of threads per block - * - * \tparam T Type of data to pack (inferred) - * \tparam PackOpT Pack operator type - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::pack_cell_buffer - */ -template -cudaError_t pack_cell_buffer(typename PackOpT::element* d_send_buf, - const T* d_props, - const unsigned int* d_send_idx, - const PackOpT op, - const unsigned int num_send, - unsigned int block_size) - { - // determine runtime block size - unsigned int max_block_size; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::pack_cell_buffer); - max_block_size = attr.maxThreadsPerBlock; - - const unsigned int run_block_size = min(block_size, max_block_size); - - dim3 grid(num_send / run_block_size + 1); - mpcd::gpu::kernel::pack_cell_buffer<<>>(d_send_buf, - d_props, - d_send_idx, - op, - num_send); - - return cudaSuccess; - } - -/*! 
- * \param d_props Cell property array to unpack into - * \param d_cells List of unique cells to unpack - * \param d_recv Map of unique cells onto the receive buffer - * \param d_recv_begin Beginning index into \a d_recv for cells to unpack - * \param d_recv_end End index (exclusive) into \a d_recv for cells to unpack - * \param d_recv_buf Received buffer from neighbor ranks - * \param op Packing operator - * \param num_cells Number of cells to unpack - * - * \tparam T Data type to unpack (inferred) - * \tparam PackOpT Pack operation type (inferred) - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::unpack_cell_buffer - */ -template -cudaError_t unpack_cell_buffer(T* d_props, - const unsigned int* d_cells, - const unsigned int* d_recv, - const unsigned int* d_recv_begin, - const unsigned int* d_recv_end, - const typename PackOpT::element* d_recv_buf, - const PackOpT op, - const unsigned int num_cells, - const unsigned int block_size) - { - // determine runtime block size - unsigned int max_block_size; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::unpack_cell_buffer); - max_block_size = attr.maxThreadsPerBlock; - - const unsigned int run_block_size = min(block_size, max_block_size); - - dim3 grid(num_cells / run_block_size + 1); - mpcd::gpu::kernel::unpack_cell_buffer<<>>(d_props, - d_cells, - d_recv, - d_recv_begin, - d_recv_end, - d_recv_buf, - op, - num_cells); - - return cudaSuccess; - } -#endif // __HIPCC__ - - } // end namespace gpu - } // end namespace mpcd - } // end namespace hoomd - -#endif // MPCD_CELL_COMMUNICATOR_CUH_ diff --git a/hoomd/mpcd/CellCommunicator.h b/hoomd/mpcd/CellCommunicator.h deleted file mode 100644 index ea78f0f8d8..0000000000 --- a/hoomd/mpcd/CellCommunicator.h +++ /dev/null @@ -1,437 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellCommunicator.h - * \brief Declares and defines the mpcd::CellCommunicator class - */ - -#ifdef ENABLE_MPI - -#ifndef MPCD_CELL_COMMUNICATOR_H_ -#define MPCD_CELL_COMMUNICATOR_H_ - -#ifdef __HIPCC__ -#error This header cannot be compiled by nvcc -#endif - -#ifdef ENABLE_HIP -#include "CellCommunicator.cuh" -#endif // ENABLE_HIP - -#include "CellList.h" -#include "CommunicatorUtilities.h" - -#include "hoomd/Autotuned.h" -#include "hoomd/DomainDecomposition.h" -#include "hoomd/GPUArray.h" -#include "hoomd/GPUVector.h" -#include "hoomd/HOOMDMath.h" -#include "hoomd/Index1D.h" -#include "hoomd/SystemDefinition.h" - -#include -#include - -namespace hoomd - { -namespace mpcd - { -//! Communicates properties across the MPCD cell list -class PYBIND11_EXPORT CellCommunicator : public Autotuned - { - public: - //! Constructor - CellCommunicator(std::shared_ptr sysdef, std::shared_ptr cl); - - //! Destructor - virtual ~CellCommunicator(); - - //! Reduce cell list properties - /*! - * \param props Properties to reduce - * \param op Binary reduction operator - * - * Properties are reduced across domain boundaries, as marked in the cell list. - */ - template void communicate(const GPUArray& props, const PackOpT op) - { - begin(props, op); - finalize(props, op); - } - - //! Begin communication of the grid - template void begin(const GPUArray& props, const PackOpT op); - - //! Finalize communication of the grid - template void finalize(const GPUArray& props, const PackOpT op); - - //! 
Get the number of unique cells with communication - unsigned int getNCells() - { - if (m_needs_init) - { - initialize(); - m_needs_init = false; - } - return m_num_cells; - } - - //! Get the list of unique cells with communication - const GPUArray& getCells() - { - if (m_needs_init) - { - initialize(); - m_needs_init = false; - } - return m_cells; - } - - private: - static unsigned int num_instances; //!< Number of communicator instances - const unsigned int m_id; //!< Id for this communicator to use in tags - - std::shared_ptr m_sysdef; //!< System definition - std::shared_ptr m_pdata; //!< HOOMD particle data - std::shared_ptr m_exec_conf; //!< Execution configuration - const MPI_Comm m_mpi_comm; //!< MPI Communicator - std::shared_ptr m_decomposition; //!< Domain decomposition - - std::shared_ptr m_cl; //!< MPCD cell list - - bool m_communicating; //!< Flag if communication is occurring - GPUVector m_send_buf; //!< Send buffer - GPUVector m_recv_buf; //!< Receive buffer - GPUArray m_send_idx; //!< Indexes of cells in send buffer - std::vector m_reqs; //!< MPI request objects - - std::vector m_neighbors; //!< Unique neighbor ranks - std::vector m_begin; //!< Begin offset of every neighbor - std::vector m_num_send; //!< Number of cells to send to every neighbor - - unsigned int m_num_cells; //!< Number of unique cells to receive - GPUArray m_cells; //!< Unique cells to receive - GPUArray - m_recv; //!< Reordered mapping of buffer from ranks to group received cells together - GPUArray m_recv_begin; //!< Begin offset of every unique cell - GPUArray m_recv_end; //!< End offset of every unique cell - - bool m_needs_init; //!< Flag if grid needs to be initialized - //! Slot that communicator needs to be reinitialized - void slotInit() - { - m_needs_init = true; - } - - //! Initialize the grid - void initialize(); - - //! Packs the property buffer - template void packBuffer(const GPUArray& props, const PackOpT op); - - //! Unpacks the property buffer - template - void unpackBuffer(const GPUArray& props, const PackOpT op); - -#ifdef ENABLE_HIP - std::shared_ptr> m_tuner_pack; //!< Tuner for pack kernel - std::shared_ptr> m_tuner_unpack; //!< Tuner for unpack kernel - - //! Packs the property buffer on the GPU - template - void packBufferGPU(const GPUArray& props, const PackOpT op); - - //! Unpacks the property buffer on the GPU - template - void unpackBufferGPU(const GPUArray& props, const PackOpT op); -#endif // ENABLE_HIP - }; - - } // end namespace mpcd - -/*! - * \param props Property buffer to pack - * \param op Packing operator to apply to the send buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * The data in \a props is packed into the send buffers, and nonblocking MPI - * send / receive operations are initiated. If communication is already occurring, - * the method returns immediately and no action is taken. 
- */ -template -void mpcd::CellCommunicator::begin(const GPUArray& props, const PackOpT op) - { - // check if communication is occurring and place a lock on - if (m_communicating) - return; - m_communicating = true; - - // ensure that the property grid is sufficiently sized compared to the cell list - if (props.getNumElements() < m_cl->getNCells()) - { - m_exec_conf->msg->error() - << "mpcd: cell property to be reduced is smaller than cell list dimensions" - << std::endl; - throw std::runtime_error("MPCD cell property has insufficient dimensions"); - } - - // initialize grid if required - if (m_needs_init) - { - initialize(); - m_needs_init = false; - } - - // resize send / receive buffers for this comm element - m_send_buf.resize(m_send_idx.getNumElements() * sizeof(typename PackOpT::element)); - m_recv_buf.resize(m_send_idx.getNumElements() * sizeof(typename PackOpT::element)); - -#ifdef ENABLE_HIP - if (m_exec_conf->isCUDAEnabled()) - { - packBufferGPU(props, op); - } - else -#endif // ENABLE_HIP - { - packBuffer(props, op); - } - - // make the MPI calls - { - // determine whether to use CPU or GPU CUDA buffers - access_location::Enum mpi_loc; - { - mpi_loc = access_location::host; - } - - ArrayHandle h_send_buf(m_send_buf, mpi_loc, access_mode::read); - ArrayHandle h_recv_buf(m_recv_buf, mpi_loc, access_mode::overwrite); - typename PackOpT::element* send_buf - = reinterpret_cast(h_send_buf.data); - typename PackOpT::element* recv_buf - = reinterpret_cast(h_recv_buf.data); - - m_reqs.resize(2 * m_neighbors.size()); - for (unsigned int idx = 0; idx < m_neighbors.size(); ++idx) - { - const unsigned int neigh = m_neighbors[idx]; - const unsigned int offset = m_begin[idx]; - const size_t num_bytes = sizeof(typename PackOpT::element) * m_num_send[idx]; - MPI_Isend(send_buf + offset, - (unsigned int)num_bytes, - MPI_BYTE, - neigh, - 0, - m_mpi_comm, - &m_reqs[2 * idx]); - MPI_Irecv(recv_buf + offset, - (unsigned int)num_bytes, - MPI_BYTE, - neigh, - 0, - m_mpi_comm, - &m_reqs[2 * idx + 1]); - } - } - } - -//! Finalize communication of the grid -/*! - * \param props Property buffer to unpack - * \param op Packing operator to apply to the receive buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * Existing MPI requests are finalized, and then the received buffers are - * unpacked into \a props. If no communication is ongoing, the method immediately - * returns and takes no action. - */ -template -void mpcd::CellCommunicator::finalize(const GPUArray& props, const PackOpT op) - { - if (!m_communicating) - return; - - // finish all MPI requests - MPI_Waitall((unsigned int)m_reqs.size(), m_reqs.data(), MPI_STATUSES_IGNORE); - -// unpack the buffer -#ifdef ENABLE_HIP - if (m_exec_conf->isCUDAEnabled()) - { - unpackBufferGPU(props, op); - } - else -#endif // ENABLE_HIP - { - unpackBuffer(props, op); - } - - m_communicating = false; - } - -/*! - * \param props Property buffer to pack - * \param op Packing operator to apply to the send buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * The packing operator applies a unary pack functor to \a props, which permits - * a reduction or transformation of the data to be sent. See mpcd::detail::CellEnergyPackOp - * for an example. - * - * \post Communicated cells in \a props are packed into \a m_send_buf. 
- */ -template -void mpcd::CellCommunicator::packBuffer(const GPUArray& props, const PackOpT op) - { - ArrayHandle h_props(props, access_location::host, access_mode::read); - ArrayHandle h_send_idx(m_send_idx, access_location::host, access_mode::read); - ArrayHandle h_send_buf(m_send_buf, - access_location::host, - access_mode::overwrite); - typename PackOpT::element* send_buf - = reinterpret_cast(h_send_buf.data); - - for (unsigned int idx = 0; idx < m_send_idx.getNumElements(); ++idx) - { - send_buf[idx] = op.pack(h_props.data[h_send_idx.data[idx]]); - } - } - -/*! - * \param props Property buffer to pack - * \param op Packing operator to apply to the received buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * The packing operator must be associative so that the order it is applied - * to the received cells does not matter, e.g., addition. - * - * \post The bytes from \a m_recv_buf are unpacked into \a props. - */ -template -void mpcd::CellCommunicator::unpackBuffer(const GPUArray& props, const PackOpT op) - { - ArrayHandle h_recv(m_recv, access_location::host, access_mode::read); - ArrayHandle h_cells(m_cells, access_location::host, access_mode::read); - ArrayHandle h_recv_begin(m_recv_begin, access_location::host, access_mode::read); - ArrayHandle h_recv_end(m_recv_end, access_location::host, access_mode::read); - - ArrayHandle h_recv_buf(m_recv_buf, access_location::host, access_mode::read); - typename PackOpT::element* recv_buf - = reinterpret_cast(h_recv_buf.data); - - ArrayHandle h_props(props, access_location::host, access_mode::readwrite); - - for (unsigned int idx = 0; idx < m_num_cells; ++idx) - { - const unsigned int cell_idx = h_cells.data[idx]; - const unsigned int begin = h_recv_begin.data[idx]; - const unsigned int end = h_recv_end.data[idx]; - - // loop through all received data for this cell, and unpack it iteratively - T val = h_props.data[cell_idx]; - for (unsigned int i = begin; i < end; ++i) - { - val = op.unpack(recv_buf[h_recv.data[i]], val); - } - - // save the accumulated unpacked value - h_props.data[cell_idx] = val; - } - } - -#ifdef ENABLE_HIP -/*! - * \param props Property buffer to pack - * \param op Packing operator to apply to the send buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * The packing operator applies a unary pack functor to \a props, which permits - * a reduction or transformation of the data to be sent. See mpcd::detail::CellEnergyPackOp - * for an example. - * - * \post Communicated cells in \a props are packed into \a m_send_buf. - */ -template -void mpcd::CellCommunicator::packBufferGPU(const GPUArray& props, const PackOpT op) - { - ArrayHandle d_props(props, access_location::device, access_mode::read); - ArrayHandle d_send_idx(m_send_idx, access_location::device, access_mode::read); - ArrayHandle d_send_buf(m_send_buf, - access_location::device, - access_mode::overwrite); - typename PackOpT::element* send_buf - = reinterpret_cast(d_send_buf.data); - - m_tuner_pack->begin(); - mpcd::gpu::pack_cell_buffer(send_buf, - d_props.data, - d_send_idx.data, - op, - (unsigned int)m_send_idx.getNumElements(), - m_tuner_pack->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_tuner_pack->end(); - } - -/*! 
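Note (not part of the patch): the pack/unpack templates in the deleted CellCommunicator expect an operator type that exposes a nested element type, a unary pack(), and an associative binary unpack(). The diff references mpcd::detail::CellEnergyPackOp but does not show it, so the sketch below uses a hypothetical SumPackOp purely to illustrate the interface those templates call; it is not HOOMD code.

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical operator matching the interface used by packBuffer()/unpackBuffer():
// a nested element type, pack() to transform an outgoing value, and an associative
// unpack() to fold a received element into the local value.
struct SumPackOp
    {
    typedef double element;

    element pack(double value) const
        {
        return value; // send the raw value
        }

    double unpack(element received, double current) const
        {
        return current + received; // associative reduction across neighbor ranks
        }
    };

int main()
    {
    SumPackOp op;
    std::vector<double> local = {1.0, 2.0, 3.0};       // local cell property
    std::vector<double> received = {0.5, 0.25, 0.125}; // values received for the same cells

    for (std::size_t i = 0; i < local.size(); ++i)
        {
        local[i] = op.unpack(op.pack(received[i]), local[i]);
        }

    std::printf("%g %g %g\n", local[0], local[1], local[2]); // 1.5 2.25 3.125
    return 0;
    }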
- * \param props Property buffer to pack - * \param op Packing operator to apply to the received buffer - * - * \tparam T Type of buffer to pack (inferred) - * \tparam PackOpT Packing operator type (inferred) - * - * The packing operator must be associative so that the order it is applied - * to the received cells does not matter, e.g., addition. - * - * \post The bytes from \a m_recv_buf are unpacked into \a props. - */ -template -void mpcd::CellCommunicator::unpackBufferGPU(const GPUArray& props, const PackOpT op) - { - ArrayHandle d_recv(m_recv, access_location::device, access_mode::read); - ArrayHandle d_cells(m_cells, access_location::device, access_mode::read); - ArrayHandle d_recv_begin(m_recv_begin, - access_location::device, - access_mode::read); - ArrayHandle d_recv_end(m_recv_end, access_location::device, access_mode::read); - - ArrayHandle d_recv_buf(m_recv_buf, access_location::device, access_mode::read); - typename PackOpT::element* recv_buf - = reinterpret_cast(d_recv_buf.data); - - ArrayHandle d_props(props, access_location::device, access_mode::readwrite); - - m_tuner_unpack->begin(); - mpcd::gpu::unpack_cell_buffer(d_props.data, - d_cells.data, - d_recv.data, - d_recv_begin.data, - d_recv_end.data, - recv_buf, - op, - m_num_cells, - m_tuner_unpack->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_tuner_unpack->end(); - } -#endif // ENABLE_HIP - - } // end namespace hoomd - -#endif // MPCD_CELL_COMMUNICATOR_H_ - -#endif // ENABLE_MPI diff --git a/hoomd/mpcd/CellList.cc b/hoomd/mpcd/CellList.cc index 58bacdd3a1..067c133c3c 100644 --- a/hoomd/mpcd/CellList.cc +++ b/hoomd/mpcd/CellList.cc @@ -18,14 +18,17 @@ namespace hoomd { mpcd::CellList::CellList(std::shared_ptr sysdef, Scalar cell_size, bool shift) - : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np_max(4), - m_cell_np(m_exec_conf), m_cell_list(m_exec_conf), m_embed_cell_ids(m_exec_conf), - m_conditions(m_exec_conf), m_needs_compute_dim(true), m_particles_sorted(false), - m_virtual_change(false) + : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np(m_exec_conf), + m_embed_cell_ids(m_exec_conf), m_conditions(m_exec_conf), m_cell_vel(m_exec_conf), + m_cell_energy(m_exec_conf), m_cell_temp(m_exec_conf), m_needs_net_reduce(true), + m_needs_compute_dim(true), m_particles_sorted(false), m_virtual_change(false) { assert(m_mpcd_pdata); m_exec_conf->msg->notice(5) << "Constructing MPCD CellList" << std::endl; + GPUArray net_properties(mpcd::detail::thermo_index::num_quantities, m_exec_conf); + m_net_properties.swap(net_properties); + setCellSize(cell_size); m_origin_idx = make_int3(0, 0, 0); m_cell_dim = make_uint3(0, 0, 0); @@ -41,7 +44,6 @@ mpcd::CellList::CellList(std::shared_ptr sysdef, Scalar cell_s m_cover_box = m_pdata->getBox(); #endif // ENABLE_MPI - m_mpcd_pdata->getSortSignal().connect(this); m_mpcd_pdata->getNumVirtualSignal().connect( this); m_pdata->getParticleSortSignal().connect(this); @@ -51,14 +53,17 @@ mpcd::CellList::CellList(std::shared_ptr sysdef, Scalar cell_s mpcd::CellList::CellList(std::shared_ptr sysdef, const uint3& global_cell_dim, bool shift) - : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np_max(4), - m_cell_np(m_exec_conf), m_cell_list(m_exec_conf), m_embed_cell_ids(m_exec_conf), - m_conditions(m_exec_conf), m_needs_compute_dim(true), m_particles_sorted(false), - m_virtual_change(false) + : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cell_np(m_exec_conf), + 
m_embed_cell_ids(m_exec_conf), m_conditions(m_exec_conf), m_cell_vel(m_exec_conf), + m_cell_energy(m_exec_conf), m_cell_temp(m_exec_conf), m_needs_compute_dim(true), + m_particles_sorted(false), m_virtual_change(false) { assert(m_mpcd_pdata); m_exec_conf->msg->notice(5) << "Constructing MPCD CellList" << std::endl; + GPUArray net_properties(mpcd::detail::thermo_index::num_quantities, m_exec_conf); + m_net_properties.swap(net_properties); + setGlobalDim(global_cell_dim); m_origin_idx = make_int3(0, 0, 0); m_cell_dim = make_uint3(0, 0, 0); @@ -74,7 +79,6 @@ mpcd::CellList::CellList(std::shared_ptr sysdef, m_cover_box = m_pdata->getBox(); #endif // ENABLE_MPI - m_mpcd_pdata->getSortSignal().connect(this); m_mpcd_pdata->getNumVirtualSignal().connect( this); m_pdata->getParticleSortSignal().connect(this); @@ -84,7 +88,6 @@ mpcd::CellList::CellList(std::shared_ptr sysdef, mpcd::CellList::~CellList() { m_exec_conf->msg->notice(5) << "Destroying MPCD CellList" << std::endl; - m_mpcd_pdata->getSortSignal().disconnect(this); m_mpcd_pdata->getNumVirtualSignal().disconnect( this); m_pdata->getParticleSortSignal().disconnect(this); @@ -113,6 +116,9 @@ void mpcd::CellList::compute(uint64_t timestep) m_force_compute = true; } + // ensure optional computation flags are up to date + updateFlags(); + if (peekCompute(timestep)) { // ensure grid is shifted @@ -138,23 +144,15 @@ void mpcd::CellList::compute(uint64_t timestep) m_embed_cell_ids.resize(m_embed_group->getNumMembers()); } - bool overflowed = false; - do - { - buildCellList(); - - overflowed = checkConditions(); - - if (overflowed) - { - reallocate(); - resetConditions(); - } - } while (overflowed); + // bin particles and compute cell properties + buildCellList(); + checkConditions(); + finishComputeProperties(); // we are finished building, explicitly mark everything (rather than using shouldCompute) m_first_compute = false; m_force_compute = false; + m_needs_net_reduce = true; m_last_computed = timestep; // signal to the ParticleData that the cell list cache is now valid @@ -164,11 +162,10 @@ void mpcd::CellList::compute(uint64_t timestep) void mpcd::CellList::reallocate() { - m_exec_conf->msg->notice(6) << "Allocating MPCD cell list, " << m_cell_np_max - << " particles in " << m_cell_indexer.getNumElements() << " cells." - << std::endl; - m_cell_list_indexer = Index2D(m_cell_np_max, m_cell_indexer.getNumElements()); - m_cell_list.resize(m_cell_list_indexer.getNumElements()); + m_cell_np.resize(m_cell_indexer.getNumElements()); + m_cell_vel.resize(m_cell_indexer.getNumElements()); + m_cell_energy.resize(m_cell_indexer.getNumElements()); + m_cell_temp.resize(m_cell_indexer.getNumElements()); } void mpcd::CellList::computeDimensions() @@ -307,7 +304,6 @@ void mpcd::CellList::computeDimensions() // resize the cell indexers and per-cell counter m_global_cell_indexer = Index3D(m_global_cell_dim.x, m_global_cell_dim.y, m_global_cell_dim.z); m_cell_indexer = Index3D(m_cell_dim.x, m_cell_dim.y, m_cell_dim.z); - m_cell_np.resize(m_cell_indexer.getNumElements()); // reallocate per-cell memory reallocate(); @@ -317,6 +313,21 @@ void mpcd::CellList::computeDimensions() notifySizeChange(); } +void mpcd::CellList::startAutotuning() + { + Compute::startAutotuning(); +#ifdef ENABLE_MPI +#endif // ENABLE_MPI + } + +bool mpcd::CellList::isAutotuningComplete() + { + bool result = Compute::isAutotuningComplete(); +#ifdef ENABLE_MPI +#endif // ENABLE_MPI + return result; + } + #ifdef ENABLE_MPI /*! 
* \param dir Direction of communication @@ -352,12 +363,17 @@ void mpcd::CellList::buildCellList() const BoxDim& box = m_pdata->getBox(); const uchar3 periodic = box.getPeriodic(); - ArrayHandle h_cell_list(m_cell_list, - access_location::host, - access_mode::overwrite); - ArrayHandle h_cell_np(m_cell_np, access_location::host, access_mode::overwrite); // zero the cell counter m_cell_np.zeroFill(); + m_cell_vel.zeroFill(); + if (m_flags[mpcd::detail::thermo_options::energy]) + { + m_cell_energy.zeroFill(); + } + + ArrayHandle h_cell_np(m_cell_np, access_location::host, access_mode::readwrite); + ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::readwrite); + ArrayHandle h_cell_energy(m_cell_energy, access_location::host, access_mode::readwrite); uint3 conditions = make_uint3(0, 0, 0); @@ -373,6 +389,7 @@ void mpcd::CellList::buildCellList() // we can't modify the velocity of embedded particles, so we only read their position std::unique_ptr> h_embed_cell_ids; std::unique_ptr> h_pos_embed; + std::unique_ptr> h_vel_embed; std::unique_ptr> h_embed_member_idx; if (m_embed_group) { @@ -382,6 +399,9 @@ void mpcd::CellList::buildCellList() h_pos_embed.reset(new ArrayHandle(m_pdata->getPositions(), access_location::host, access_mode::read)); + h_vel_embed.reset(new ArrayHandle(m_pdata->getVelocities(), + access_location::host, + access_mode::read)); h_embed_member_idx.reset(new ArrayHandle(m_embed_group->getIndexArray(), access_location::host, access_mode::read)); @@ -405,15 +425,22 @@ void mpcd::CellList::buildCellList() for (unsigned int cur_p = 0; cur_p < N_tot; ++cur_p) { Scalar4 postype_i; + Scalar4 vel_mass_i; + double mass_i; if (cur_p < N_mpcd) { postype_i = h_pos.data[cur_p]; + vel_mass_i = h_vel.data[cur_p]; + mass_i = m_mpcd_pdata->getMass(); } else { postype_i = h_pos_embed->data[h_embed_member_idx->data[cur_p - N_mpcd]]; + vel_mass_i = h_vel_embed->data[h_embed_member_idx->data[cur_p - N_mpcd]]; + mass_i = vel_mass_i.w; } - Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z); + const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z); + const double3 vel_i = make_double3(vel_mass_i.x, vel_mass_i.y, vel_mass_i.z); if (std::isnan(pos_i.x) || std::isnan(pos_i.y) || std::isnan(pos_i.z)) { @@ -488,16 +515,6 @@ void mpcd::CellList::buildCellList() } unsigned int bin_idx = m_cell_indexer(bin.x, bin.y, bin.z); - unsigned int offset = h_cell_np.data[bin_idx]; - if (offset < m_cell_np_max) - { - h_cell_list.data[m_cell_list_indexer(offset, bin_idx)] = cur_p; - } - else - { - // overflow - conditions.x = std::max(conditions.x, offset + 1); - } // stash the current particle bin into the velocity array if (cur_p < N_mpcd) @@ -509,6 +526,18 @@ void mpcd::CellList::buildCellList() h_embed_cell_ids->data[cur_p - N_mpcd] = bin_idx; } + // compute the contribution of the particle to cell velocity + double4& cell_vel = h_cell_vel.data[bin_idx]; + cell_vel.x += mass_i * vel_i.x; + cell_vel.y += mass_i * vel_i.y; + cell_vel.z += mass_i * vel_i.z; + cell_vel.w += mass_i; + // compute optional cell properties + if (m_flags[mpcd::detail::thermo_options::energy]) + { + h_cell_energy.data[bin_idx] + += 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z); + } // increment the counter always ++h_cell_np.data[bin_idx]; } @@ -517,53 +546,136 @@ void mpcd::CellList::buildCellList() m_conditions.resetFlags(conditions); } -/*! 
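Note (not part of the patch): the rewritten buildCellList() above folds what CellThermoCompute used to do into the binning loop; each particle adds mass-weighted momentum (and, when the energy flag is set, kinetic energy) into its cell while the per-cell counter is incremented. A minimal standalone sketch of that accumulation pattern, with simplified types and a hypothetical precomputed bin index in place of the real cell indexer:

#include <cstdio>
#include <vector>

struct Particle
    {
    double vx, vy, vz;
    double mass;
    unsigned int bin; // cell index, assumed precomputed for this sketch
    };

int main()
    {
    const unsigned int ncells = 2;
    std::vector<Particle> particles
        = {{1.0, 0.0, 0.0, 1.0, 0}, {-1.0, 0.0, 0.0, 1.0, 0}, {0.0, 2.0, 0.0, 2.0, 1}};

    // per-cell accumulators: momentum + mass (like the double4 m_cell_vel) and kinetic energy
    std::vector<double> mom_x(ncells, 0.0), mom_y(ncells, 0.0), mom_z(ncells, 0.0);
    std::vector<double> cell_mass(ncells, 0.0), cell_energy(ncells, 0.0);
    std::vector<unsigned int> cell_np(ncells, 0);

    for (const Particle& p : particles)
        {
        mom_x[p.bin] += p.mass * p.vx;
        mom_y[p.bin] += p.mass * p.vy;
        mom_z[p.bin] += p.mass * p.vz;
        cell_mass[p.bin] += p.mass;
        cell_energy[p.bin] += 0.5 * p.mass * (p.vx * p.vx + p.vy * p.vy + p.vz * p.vz);
        ++cell_np[p.bin];
        }

    for (unsigned int c = 0; c < ncells; ++c)
        {
        std::printf("cell %u: np=%u mass=%g energy=%g\n", c, cell_np[c], cell_mass[c], cell_energy[c]);
        }
    return 0;
    }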
- * \param timestep Timestep that the sorting occurred - * \param order Mapping of sorted particle indexes onto old particle indexes - * \param rorder Mapping of old particle indexes onto sorted particle indexes - */ -void mpcd::CellList::sort(uint64_t timestep, - const GPUArray& order, - const GPUArray& rorder) +void mpcd::CellList::finishComputeProperties() { - // no need to do any sorting if we can still be called at the current timestep - if (peekCompute(timestep)) - return; - - // if mapping is not valid, signal that we need to force a recompute next time - // that the cell list is needed. We don't call forceCompute() directly because this - // always runs compute(), and we just want to defer to the next compute() call. - if (rorder.isNull()) + const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; + if (need_energy) { - m_force_compute = true; - return; + m_cell_temp.zeroFill(); } - // iterate through particles in cell list, and update their indexes using reverse - // mapping - ArrayHandle h_rorder(rorder, access_location::host, access_mode::read); ArrayHandle h_cell_np(m_cell_np, access_location::host, access_mode::read); - ArrayHandle h_cell_list(m_cell_list, - access_location::host, - access_mode::readwrite); - const unsigned int N_mpcd = m_mpcd_pdata->getN(); - + ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::readwrite); + ArrayHandle h_cell_energy(m_cell_energy, access_location::host, access_mode::read); + ArrayHandle h_cell_temp(m_cell_temp, access_location::host, access_mode::overwrite); + // iterate over all cells and compute average velocity, energy, temperature for (unsigned int idx = 0; idx < getNCells(); ++idx) { - const unsigned int np = h_cell_np.data[idx]; - for (unsigned int offset = 0; offset < np; ++offset) + // average cell properties if the cell has mass + const double4 cell_vel = h_cell_vel.data[idx]; + double3 vel_cm = make_double3(cell_vel.x, cell_vel.y, cell_vel.z); + const double mass = cell_vel.w; + + // get center of mass velocity from momentum + if (mass > 0.) + { + // average velocity is only defined when there is some mass in the cell + vel_cm.x /= mass; + vel_cm.y /= mass; + vel_cm.z /= mass; + } + h_cell_vel.data[idx] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); + + if (need_energy) { - const unsigned int cl_idx = m_cell_list_indexer(offset, idx); - const unsigned int pid = h_cell_list.data[cl_idx]; - // only update indexes of MPCD particles, not virtual or embedded particles - if (pid < N_mpcd) + const double cell_energy = h_cell_energy.data[idx]; + double temp(0.0); + const unsigned int np = h_cell_np.data[idx]; + // temperature is only defined for 2 or more particles + if (np > 1) { - h_cell_list.data[cl_idx] = h_rorder.data[pid]; + const double ke_cm + = 0.5 * mass + * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); + temp = 2. 
* (cell_energy - ke_cm) / (m_sysdef->getNDimensions() * (np - 1)); } + h_cell_temp.data[idx] = temp; } } } +void mpcd::CellList::computeNetProperties() + { + unsigned int n_temp_cells = 0; + { + const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; + ArrayHandle h_cell_np(m_cell_np, access_location::host, access_mode::read); + ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::read); + ArrayHandle h_cell_energy(m_cell_energy, access_location::host, access_mode::read); + ArrayHandle h_cell_temp(m_cell_temp, access_location::host, access_mode::read); + double3 net_momentum = make_double3(0, 0, 0); + double net_energy(0.0), net_temp(0.0); + + // iterate over all cells and compute average velocity, energy, temperature + for (unsigned int idx = 0; idx < getNCells(); ++idx) + { + // average cell properties if the cell has mass + const double4 cell_vel = h_cell_vel.data[idx]; + const double3 vel_cm = make_double3(cell_vel.x, cell_vel.y, cell_vel.z); + const double mass = cell_vel.w; + + // add accumulated momentum to total net momentum + net_momentum.x += vel_cm.x * mass; + net_momentum.y += vel_cm.y * mass; + net_momentum.z += vel_cm.z * mass; + + if (need_energy) + { + const double cell_energy = h_cell_energy.data[idx]; + const double cell_temp = h_cell_temp.data[idx]; + // temperature is only defined for 2 or more particles + if (h_cell_np.data[idx] > 1) + { + ++n_temp_cells; + } + // accumulate + net_energy += cell_energy; + net_temp += cell_temp; + } + } + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::overwrite); + + h_net_properties.data[mpcd::detail::thermo_index::momentum_x] = net_momentum.x; + h_net_properties.data[mpcd::detail::thermo_index::momentum_y] = net_momentum.y; + h_net_properties.data[mpcd::detail::thermo_index::momentum_z] = net_momentum.z; + h_net_properties.data[mpcd::detail::thermo_index::energy] = net_energy; + h_net_properties.data[mpcd::detail::thermo_index::temperature] = net_temp; + } + +#ifdef ENABLE_MPI + if (m_decomposition) + { + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::readwrite); + MPI_Allreduce(MPI_IN_PLACE, + h_net_properties.data, + mpcd::detail::thermo_index::num_quantities, + MPI_DOUBLE, + MPI_SUM, + m_exec_conf->getMPICommunicator()); + + MPI_Allreduce(MPI_IN_PLACE, + &n_temp_cells, + 1, + MPI_UNSIGNED, + MPI_SUM, + m_exec_conf->getMPICommunicator()); + } +#endif // ENABLE_MPI + if (n_temp_cells > 0) + { + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::readwrite); + h_net_properties.data[mpcd::detail::thermo_index::temperature] /= (double)n_temp_cells; + } + // ensure that properties will not be normalized twice + m_needs_net_reduce = false; + } + #ifdef ENABLE_MPI bool mpcd::CellList::needsEmbedMigrate(uint64_t timestep) { @@ -609,17 +721,24 @@ bool mpcd::CellList::needsEmbedMigrate(uint64_t timestep) } #endif // ENABLE_MPI +void mpcd::CellList::updateFlags() + { + mpcd::detail::ThermoFlags flags; + + if (!m_flag_signal.empty()) + { + m_flag_signal.emit_accumulate([&](mpcd::detail::ThermoFlags f) { flags |= f; }); + } + + m_flags = flags; + } + bool mpcd::CellList::checkConditions() { bool result = false; uint3 conditions = m_conditions.readFlags(); - if (conditions.x > m_cell_np_max) - { - m_cell_np_max = conditions.x; - result = true; - } if (conditions.y) { unsigned int n = conditions.y - 1; diff --git a/hoomd/mpcd/CellList.h b/hoomd/mpcd/CellList.h index f63c37d40f..644c9ff273 100644 --- 
a/hoomd/mpcd/CellList.h +++ b/hoomd/mpcd/CellList.h @@ -13,6 +13,7 @@ #error This header cannot be compiled by nvcc #endif +#include "CellThermoTypes.h" #include "CommunicatorUtilities.h" #include "ParticleData.h" @@ -48,11 +49,11 @@ class PYBIND11_EXPORT CellList : public Compute //! Sizes the cell list based on the box void computeDimensions(); - //! Get the cell list data - const GPUArray& getCellList() const - { - return m_cell_list; - } + //! Start autotuning kernel launch parameters + void startAutotuning() override; + + //! Check if kernel autotuning is complete + bool isAutotuningComplete() override; //! Get the number of particles per cell const GPUArray& getCellSizeArray() const @@ -60,6 +61,24 @@ class PYBIND11_EXPORT CellList : public Compute return m_cell_np; } + //! Get the cell velocities from the last call to compute + const GPUArray& getCellVelocities() const + { + return m_cell_vel; + } + + //! Get the cell energies from the last call to compute + const GPUArray& getCellEnergies() const + { + return m_cell_energy; + } + + //! Get the cell temperature from the last call to compute + const GPUArray& getCellTemperature() const + { + return m_cell_temp; + } + //! Get the total number of cells in the list const unsigned int getNCells() const { @@ -78,12 +97,6 @@ class PYBIND11_EXPORT CellList : public Compute return m_global_cell_indexer; } - //! Get the cell list indexer - const Index2D& getCellListIndexer() const - { - return m_cell_list_indexer; - } - //! Get the number of cells in each dimension const uint3& getDim() const { @@ -112,12 +125,6 @@ class PYBIND11_EXPORT CellList : public Compute //! Wrap a cell into a global cell const int3 wrapGlobalCell(const int3& cell); - //! Get the maximum number of particles in a cell - const unsigned int getNmax() const - { - return m_cell_np_max; - } - //! Get the MPCD cell size (deprecated) Scalar3 getCellSize(); @@ -242,6 +249,71 @@ class PYBIND11_EXPORT CellList : public Compute { return m_dim_signal; } + //! Get the net momentum of the particles from the last call to compute + Scalar3 getNetMomentum() + { + if (m_needs_net_reduce) + computeNetProperties(); + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::read); + const Scalar3 net_momentum + = make_scalar3(h_net_properties.data[mpcd::detail::thermo_index::momentum_x], + h_net_properties.data[mpcd::detail::thermo_index::momentum_y], + h_net_properties.data[mpcd::detail::thermo_index::momentum_z]); + return net_momentum; + } + + //! Get the net energy of the particles from the last call to compute + Scalar getNetEnergy() + { + if (!m_flags[mpcd::detail::thermo_options::energy]) + { + m_exec_conf->msg->error() + << "Energy requested from CellList, but was not computed." << std::endl; + throw std::runtime_error("Net cell energy not available"); + } + if (m_needs_net_reduce) + computeNetProperties(); + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::read); + return h_net_properties.data[mpcd::detail::thermo_index::energy]; + } + + //! Get the average cell temperature from the last call to compute + Scalar getTemperature() + { + if (!m_flags[mpcd::detail::thermo_options::energy]) + { + m_exec_conf->msg->error() + << "Temperature requested from CellList, but was not computed." 
<< std::endl; + throw std::runtime_error("Net cell temperature not available"); + } + if (m_needs_net_reduce) + computeNetProperties(); + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::read); + return h_net_properties.data[mpcd::detail::thermo_index::temperature]; + } + + //! Get the signal for requested thermo flags + /*! + * \returns A signal that subscribers can attach a callback to in order + * to request certain data. + * + * For performance reasons, the CellList should be able to + * supply many related cell-level quantities from a single kernel launch. + * However, sometimes these quantities are not needed, and it is better + * to skip calculating them. Subscribing classes can optionally request + * some of these quantities via a callback return mpcd::detail::ThermoFlags + * with the appropriate bits set. + */ + Nano::Signal& getFlagsSignal() + { + return m_flag_signal; + } protected: std::shared_ptr m_mpcd_pdata; //!< MPCD particle data @@ -259,15 +331,21 @@ class PYBIND11_EXPORT CellList : public Compute Scalar3 m_global_cell_dim_inv; //!< Inverse of number of cells in each direction of global box Index3D m_cell_indexer; //!< Indexer from 3D into cell list 1D Index3D m_global_cell_indexer; //!< Indexer from 3D into 1D for global cell indexes - Index2D m_cell_list_indexer; //!< Indexer into cell list members - unsigned int m_cell_np_max; //!< Maximum number of particles per cell GPUVector m_cell_np; //!< Number of particles per cell - GPUVector m_cell_list; //!< Cell list of particles GPUVector m_embed_cell_ids; //!< Cell ids of the embedded particles GPUFlags m_conditions; //!< Detect conditions that might fail building cell list int3 m_origin_idx; //!< Origin as a global index + GPUVector m_cell_vel; //!< Average velocity of a cell + cell mass + GPUVector m_cell_energy; //!< Kinetic energy + GPUVector m_cell_temp; //!< Unscaled temperature + GPUArray m_net_properties; //!< Scalar properties of the system + bool m_needs_net_reduce; //!< Flag if a net reduction is necessary + + Nano::Signal m_flag_signal; //!< Signal for requested flags + mpcd::detail::ThermoFlags m_flags; //!< Requested thermo flags + #ifdef ENABLE_MPI unsigned int m_num_extra; //!< Number of extra cells to communicate over std::array m_num_comm; //!< Number of cells to communicate on each face @@ -277,6 +355,9 @@ class PYBIND11_EXPORT CellList : public Compute virtual bool needsEmbedMigrate(uint64_t timestep); #endif // ENABLE_MPI + //! Updates the requested optional flags + void updateFlags(); + //! Check the condition flags bool checkConditions(); @@ -286,10 +367,11 @@ class PYBIND11_EXPORT CellList : public Compute //! Builds the cell list and handles cell list memory virtual void buildCellList(); - //! Callback to sort cell list when particle data is sorted - virtual void sort(uint64_t timestep, - const GPUArray& order, - const GPUArray& rorder); + //! Do final cell property calculation + virtual void finishComputeProperties(); + + //! 
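Note (not part of the patch): the new getFlagsSignal() accessor lets consumers opt in to the optional per-cell quantities before compute() runs, and updateFlags() ORs together every callback's ThermoFlags. The real signal uses HOOMD's Nano::Signal, which is not reproduced here; the sketch below models the same opt-in pattern with std::function callbacks and a hypothetical bitmask. Names like ENERGY_BIT and requested_flags are illustrative only.

#include <cstdio>
#include <functional>
#include <vector>

// Hypothetical stand-in for mpcd::detail::ThermoFlags: a plain bitmask.
enum ThermoBits : unsigned int
    {
    ENERGY_BIT = 1u << 0
    };

int main()
    {
    // subscribers register callbacks that report which optional quantities they need
    std::vector<std::function<unsigned int()>> flag_callbacks;

    // e.g., a thermostat-like consumer that needs per-cell energy/temperature
    flag_callbacks.push_back([]() { return static_cast<unsigned int>(ENERGY_BIT); });

    // e.g., a consumer that only needs cell momentum, so it requests nothing extra
    flag_callbacks.push_back([]() { return 0u; });

    // analogous to updateFlags(): accumulate all requests before building the cell list
    unsigned int requested_flags = 0;
    for (const auto& cb : flag_callbacks)
        {
        requested_flags |= cb();
        }

    const bool need_energy = (requested_flags & ENERGY_BIT) != 0;
    std::printf("energy requested: %s\n", need_energy ? "yes" : "no");
    return 0;
    }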
Compute the net properties of all the cells + virtual void computeNetProperties(); private: bool m_needs_compute_dim; //!< True if the dimensions need to be (re-)computed diff --git a/hoomd/mpcd/CellListGPU.cc b/hoomd/mpcd/CellListGPU.cc index ce51583a09..fb5d260dd9 100644 --- a/hoomd/mpcd/CellListGPU.cc +++ b/hoomd/mpcd/CellListGPU.cc @@ -7,22 +7,24 @@ */ #include "CellListGPU.h" -#include "CellListGPU.cuh" namespace hoomd { mpcd::CellListGPU::CellListGPU(std::shared_ptr sysdef, Scalar cell_size, bool shift) - : mpcd::CellList(sysdef, cell_size, shift) + : mpcd::CellList(sysdef, cell_size, shift), m_tmp_thermo(m_exec_conf), m_reduced(m_exec_conf) { m_tuner_cell.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, m_exec_conf, "mpcd_cell")); - m_tuner_sort.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_sort")); - m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_sort}); + m_tuner_property.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, + m_exec_conf, + "mpcd_cell_property")); + m_tuner_net.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, + m_exec_conf, + "mpcd_cell_net_property")); + m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_property, m_tuner_net}); #ifdef ENABLE_MPI m_tuner_embed_migrate.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, @@ -38,15 +40,19 @@ mpcd::CellListGPU::CellListGPU(std::shared_ptr sysdef, mpcd::CellListGPU::CellListGPU(std::shared_ptr sysdef, const uint3& global_cell_dim, bool shift) - : mpcd::CellList(sysdef, global_cell_dim, shift) + : mpcd::CellList(sysdef, global_cell_dim, shift), m_tmp_thermo(m_exec_conf), + m_reduced(m_exec_conf) { m_tuner_cell.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, m_exec_conf, "mpcd_cell")); - m_tuner_sort.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_sort")); - m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_sort}); + m_tuner_property.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, + m_exec_conf, + "mpcd_cell_property")); + m_tuner_net.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, + m_exec_conf, + "mpcd_cell_net_property")); + m_autotuners.insert(m_autotuners.end(), {m_tuner_cell, m_tuner_property, m_tuner_net}); #ifdef ENABLE_MPI m_tuner_embed_migrate.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, @@ -63,10 +69,11 @@ mpcd::CellListGPU::~CellListGPU() { } void mpcd::CellListGPU::buildCellList() { - ArrayHandle d_cell_list(m_cell_list, - access_location::device, - access_mode::overwrite); ArrayHandle d_cell_np(m_cell_np, access_location::device, access_mode::overwrite); + ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::overwrite); + ArrayHandle d_cell_energy(m_cell_energy, + access_location::device, + access_mode::overwrite); ArrayHandle d_pos(m_mpcd_pdata->getPositions(), access_location::device, access_mode::read); @@ -97,6 +104,9 @@ void mpcd::CellListGPU::buildCellList() ArrayHandle d_pos_embed(m_pdata->getPositions(), access_location::device, access_mode::read); + ArrayHandle d_vel_embed(m_pdata->getVelocities(), + access_location::device, + access_mode::read); ArrayHandle d_embed_member_idx(m_embed_group->getIndexArray(), access_location::device, access_mode::read); @@ -104,12 +114,15 @@ void mpcd::CellListGPU::buildCellList() m_tuner_cell->begin(); 
mpcd::gpu::compute_cell_list(d_cell_np.data, - d_cell_list.data, + d_cell_vel.data, + d_cell_energy.data, m_conditions.getDeviceFlags(), d_vel.data, + m_mpcd_pdata->getMass(), d_embed_cell_ids.data, d_pos.data, d_pos_embed.data, + d_vel_embed.data, d_embed_member_idx.data, m_pdata->getBox().getPeriodic(), m_origin_idx, @@ -117,11 +130,10 @@ void mpcd::CellListGPU::buildCellList() m_pdata->getGlobalBox(), n_global_cells, m_global_cell_dim, - m_cell_np_max, m_cell_indexer, - m_cell_list_indexer, N_mpcd, N_tot, + m_flags[mpcd::detail::thermo_options::energy], m_tuner_cell->getParam()[0]); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); @@ -131,24 +143,26 @@ void mpcd::CellListGPU::buildCellList() { m_tuner_cell->begin(); mpcd::gpu::compute_cell_list(d_cell_np.data, - d_cell_list.data, + d_cell_vel.data, + d_cell_energy.data, m_conditions.getDeviceFlags(), d_vel.data, + m_mpcd_pdata->getMass(), NULL, d_pos.data, NULL, NULL, + NULL, m_pdata->getBox().getPeriodic(), m_origin_idx, m_grid_shift, m_pdata->getGlobalBox(), n_global_cells, m_global_cell_dim, - m_cell_np_max, m_cell_indexer, - m_cell_list_indexer, N_mpcd, N_tot, + m_flags[mpcd::detail::thermo_options::energy], m_tuner_cell->getParam()[0]); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); @@ -156,43 +170,127 @@ void mpcd::CellListGPU::buildCellList() } } -/*! - * \param timestep Timestep that the sorting occurred - * \param order Mapping of sorted particle indexes onto old particle indexes - * \param rorder Mapping of old particle indexes onto sorted particle indexes - */ -void mpcd::CellListGPU::sort(uint64_t timestep, - const GPUArray& order, - const GPUArray& rorder) +void mpcd::CellListGPU::finishComputeProperties() { - // no need to do any sorting if we can still be called at the current timestep - if (peekCompute(timestep)) - return; + ArrayHandle d_cell_np(m_cell_np, access_location::device, access_mode::read); + ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::readwrite); + ArrayHandle d_cell_energy(m_cell_energy, access_location::device, access_mode::read); + ArrayHandle d_cell_temp(m_cell_temp, access_location::device, access_mode::overwrite); - // force a recompute if mapping is invalid - if (rorder.isNull()) + m_tuner_property->begin(); + mpcd::gpu::finish_cell_properties(d_cell_np.data, + d_cell_vel.data, + d_cell_energy.data, + d_cell_temp.data, + getNCells(), + m_sysdef->getNDimensions(), + m_flags[mpcd::detail::thermo_options::energy], + m_tuner_property->getParam()[0]); + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); + m_tuner_property->end(); + } + +void mpcd::CellListGPU::computeNetProperties() + { { - m_force_compute = true; - return; + ArrayHandle d_cell_np(m_cell_np, access_location::device, access_mode::read); + ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::read); + ArrayHandle d_cell_energy(m_cell_energy, + access_location::device, + access_mode::read); + ArrayHandle d_cell_temp(m_cell_temp, access_location::device, access_mode::read); + + m_tmp_thermo.resize(getNCells()); + + ArrayHandle d_tmp_thermo(m_tmp_thermo, + access_location::device, + access_mode::overwrite); + + m_tuner_net->begin(); + mpcd::gpu::stage_net_cell_thermo(d_tmp_thermo.data, + d_cell_np.data, + d_cell_vel.data, + d_cell_energy.data, + d_cell_temp.data, + getNCells(), + m_flags[mpcd::detail::thermo_options::energy], + m_tuner_property->getParam()[0]); + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); + 
m_tuner_net->end(); + + // use cub to reduce the properties on the gpu + void* d_tmp = NULL; + size_t tmp_bytes = 0; + mpcd::gpu::reduce_net_cell_thermo(m_reduced.getDeviceFlags(), + d_tmp, + tmp_bytes, + d_tmp_thermo.data, + m_tmp_thermo.size()); + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); + + ScopedAllocation d_tmp_alloc(m_exec_conf->getCachedAllocator(), + (tmp_bytes > 0) ? tmp_bytes : 1); + d_tmp = (void*)d_tmp_alloc(); + + mpcd::gpu::reduce_net_cell_thermo(m_reduced.getDeviceFlags(), + d_tmp, + tmp_bytes, + d_tmp_thermo.data, + m_tmp_thermo.size()); + if (m_exec_conf->isCUDAErrorCheckingEnabled()) + CHECK_CUDA_ERROR(); } - // iterate through particles in cell list, and update their indexes using reverse mapping - ArrayHandle d_cell_list(m_cell_list, - access_location::device, - access_mode::readwrite); - ArrayHandle d_rorder(rorder, access_location::device, access_mode::read); - ArrayHandle d_cell_np(m_cell_np, access_location::device, access_mode::read); + // now copy the net properties back to host from the flags + unsigned int n_temp_cells = 0; + { + const mpcd::detail::cell_thermo_element reduced = m_reduced.readFlags(); - m_tuner_sort->begin(); - mpcd::gpu::cell_apply_sort(d_cell_list.data, - d_rorder.data, - d_cell_np.data, - m_cell_list_indexer, - m_mpcd_pdata->getN(), - m_tuner_sort->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_tuner_sort->end(); + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::overwrite); + h_net_properties.data[mpcd::detail::thermo_index::momentum_x] = reduced.momentum.x; + h_net_properties.data[mpcd::detail::thermo_index::momentum_y] = reduced.momentum.y; + h_net_properties.data[mpcd::detail::thermo_index::momentum_z] = reduced.momentum.z; + + h_net_properties.data[mpcd::detail::thermo_index::energy] = reduced.energy; + h_net_properties.data[mpcd::detail::thermo_index::temperature] = reduced.temperature; + + n_temp_cells = reduced.flag; + } + +#ifdef ENABLE_MPI + { + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::readwrite); + MPI_Allreduce(MPI_IN_PLACE, + h_net_properties.data, + mpcd::detail::thermo_index::num_quantities, + MPI_DOUBLE, + MPI_SUM, + m_exec_conf->getMPICommunicator()); + + MPI_Allreduce(MPI_IN_PLACE, + &n_temp_cells, + 1, + MPI_UNSIGNED, + MPI_SUM, + m_exec_conf->getMPICommunicator()); + } +#endif // ENABLE_MPI + + if (n_temp_cells > 0) + { + ArrayHandle h_net_properties(m_net_properties, + access_location::host, + access_mode::readwrite); + h_net_properties.data[mpcd::detail::thermo_index::temperature] /= (double)n_temp_cells; + } + m_needs_net_reduce = false; } #ifdef ENABLE_MPI diff --git a/hoomd/mpcd/CellListGPU.cu b/hoomd/mpcd/CellListGPU.cu index 40435b7946..c77580da2a 100644 --- a/hoomd/mpcd/CellListGPU.cu +++ b/hoomd/mpcd/CellListGPU.cu @@ -5,6 +5,7 @@ * \file mpcd/CellListGPU.cu * \brief Defines GPU functions and kernels used by mpcd::CellListGPU */ +#include #include "CellListGPU.cuh" @@ -19,12 +20,15 @@ namespace kernel //! Kernel to compute the MPCD cell list on the GPU /*! 
 * \param d_cell_np Array of number of particles per cell
- * \param d_cell_list 2D array of MPCD particles in each cell
+ * \param d_cell_vel Array of center of mass velocity + cell mass per cell
+ * \param d_cell_energy Array of per cell kinetic energy
 * \param d_conditions Conditions flags for error reporting
 * \param d_vel MPCD particle velocities
+ * \param mpcd_mass MPCD particle mass
 * \param d_embed_cell_ids Cell indexes of embedded particles
 * \param d_pos MPCD particle positions
 * \param d_pos_embed Particle positions
+ * \param d_vel_embed Particle velocities
 * \param d_embed_member_idx Indexes of embedded particles in \a d_pos_embed
 * \param periodic Flags if local simulation is periodic
 * \param origin_idx Global origin index for the local box
@@ -32,26 +36,29 @@ namespace kernel
 * \param global_box Global simulation box
 * \param n_global_cell Global dimensions of the cell list, including padding
 * \param global_cell_dim Global cell dimensions, no padding
- * \param cell_np_max Maximum number of particles per cell
 * \param cell_indexer 3D indexer for cell id
- * \param cell_list_indexer 2D indexer for particle position in cell
 * \param N_mpcd Number of MPCD particles
 * \param N_tot Total number of particle (MPCD + embedded)
+ * \param need_energy True if computing the energy
 *
 * \b Implementation
 * One thread is launched per particle. The particle is floored into a bin subject to a random grid
- * shift. The number of particles in that bin is atomically incremented. If the addition of the
- * particle will not overflow the allocated memory, the particle is written into that bin.
- * Otherwise, a flag is set to resize the cell list and recompute. The MPCD particle's cell id is
- * stashed into the velocity array.
+ * shift. The number of particles in that bin is atomically incremented, and the contribution of
+ * the particle's properties (mass, momentum, and optionally kinetic energy) is added to the
+ * running sums for that cell. The MPCD particle's cell id is stashed into the velocity array.
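+ *
+ * As a rough illustrative sketch (not the full kernel; the names mirror the arguments and
+ * locals used below), the per-particle accumulation amounts to:
+ *
+ * \code
+ * const unsigned int bin_idx = cell_indexer(bin.x, bin.y, bin.z);
+ * atomicInc(&d_cell_np[bin_idx], 0xffffffff);
+ * atomicAdd(&d_cell_vel[bin_idx].x, mass_i * vel_i.x);
+ * atomicAdd(&d_cell_vel[bin_idx].y, mass_i * vel_i.y);
+ * atomicAdd(&d_cell_vel[bin_idx].z, mass_i * vel_i.z);
+ * atomicAdd(&d_cell_vel[bin_idx].w, mass_i);
+ * if (need_energy)
+ *     atomicAdd(&d_cell_energy[bin_idx],
+ *               0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z));
+ * \endcode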
*/ __global__ void compute_cell_list(unsigned int* d_cell_np, - unsigned int* d_cell_list, + double4* d_cell_vel, + double* d_cell_energy, uint3* d_conditions, Scalar4* d_vel, + double mpcd_mass, unsigned int* d_embed_cell_ids, const Scalar4* d_pos, const Scalar4* d_pos_embed, + const Scalar4* d_vel_embed, const unsigned int* d_embed_member_idx, const uchar3 periodic, const int3 origin_idx, @@ -59,11 +66,10 @@ __global__ void compute_cell_list(unsigned int* d_cell_np, const BoxDim global_box, const uint3 n_global_cell, const uint3 global_cell_dim, - const unsigned int cell_np_max, const Index3D cell_indexer, - const Index2D cell_list_indexer, const unsigned int N_mpcd, - const unsigned int N_tot) + const unsigned int N_tot, + const bool need_energy) { // one thread per particle unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -71,15 +77,22 @@ __global__ void compute_cell_list(unsigned int* d_cell_np, return; Scalar4 postype_i; + Scalar4 vel_mass_i; + double mass_i; if (idx < N_mpcd) { postype_i = d_pos[idx]; + vel_mass_i = d_vel[idx]; + mass_i = mpcd_mass; } else { postype_i = d_pos_embed[d_embed_member_idx[idx - N_mpcd]]; + vel_mass_i = d_vel_embed[d_embed_member_idx[idx - N_mpcd]]; + mass_i = vel_mass_i.w; } const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z); + const double3 vel_i = make_double3(vel_mass_i.x, vel_mass_i.y, vel_mass_i.z); if (isnan(pos_i.x) || isnan(pos_i.y) || isnan(pos_i.z)) { @@ -156,25 +169,147 @@ __global__ void compute_cell_list(unsigned int* d_cell_np, const unsigned int bin_idx = cell_indexer(bin.x, bin.y, bin.z); const unsigned int offset = atomicInc(&d_cell_np[bin_idx], 0xffffffff); - if (offset < cell_np_max) + + // stash the current particle bin into the velocity array + if (idx < N_mpcd) { - d_cell_list[cell_list_indexer(offset, bin_idx)] = idx; + d_vel[idx].w = __int_as_scalar(bin_idx); } else { - // overflow - atomicMax(&(*d_conditions).x, offset + 1); + d_embed_cell_ids[idx - N_mpcd] = bin_idx; } - // stash the current particle bin into the velocity array - if (idx < N_mpcd) + // compute the contribution of the particle to cell properties + double4& cell_vel = d_cell_vel[bin_idx]; + atomicAdd(&cell_vel.x, vel_i.x * mass_i); + atomicAdd(&cell_vel.y, vel_i.y * mass_i); + atomicAdd(&cell_vel.z, vel_i.z * mass_i); + atomicAdd(&cell_vel.w, mass_i); + + if (need_energy) { - d_vel[idx].w = __int_as_scalar(bin_idx); + double ke = 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z); + atomicAdd(&d_cell_energy[bin_idx], ke); + } + } + +/*! + * \param d_cell_np Array of number of particles per cell + * \param d_cell_vel Cell velocity to finish computing + * \param d_cell_energy Cell kinetic energy + * \param d_cell_temp Cell temperature to finish computing + * \param N_cells Number of cells + * \param N_dim Number of dimensions + * \param need_energy If true, compute the cell-level energy properties. + * + * \b Implementation details: + * Using one thread per cell, the cell properties are normalized from the + * additive contributions of the particles to their final values, e.g. the cell + * velocities are computed from the net momentum of the cell divided by the final + * mass. The temperature of each cell is calculated. 
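+ *
+ * As a sketch in the kernel's variable names (here \a momentum denotes the per-cell sum
+ * accumulated into \a d_cell_vel by compute_cell_list), the normalization is roughly:
+ *
+ * \code
+ * vel_cm.x = momentum.x / mass;      // similarly for y and z; only when mass > 0
+ * ke_cm = 0.5 * mass * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z);
+ * temp = 2. * (cell_energy - ke_cm) / (N_dim * (np - 1));   // only when np > 1
+ * \endcode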
+ */ +__global__ void finish_cell_properties(const unsigned int* d_cell_np, + double4* d_cell_vel, + const double* d_cell_energy, + double* d_cell_temp, + const unsigned int N_cells, + const unsigned int N_dim, + const bool need_energy) + { + // one thread per cell + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N_cells) + return; + + const double4 vel_cell = d_cell_vel[idx]; + double3 vel_cm = make_double3(vel_cell.x, vel_cell.y, vel_cell.z); + const double mass = vel_cell.w; + + // get center of mass velocity from momentum + if (mass > 0.) + { + // average velocity is only defined when there is some mass in the cell + vel_cm.x /= mass; + vel_cm.y /= mass; + vel_cm.z /= mass; + } + d_cell_vel[idx] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); + + if (need_energy) + { + const double cell_energy = d_cell_energy[idx]; + double temp(0.0); + const unsigned int np = d_cell_np[idx]; + + if (np > 1) + { + const double ke_cm + = 0.5 * mass * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); + temp = 2. * (cell_energy - ke_cm) / (N_dim * (np - 1)); + } + d_cell_temp[idx] = temp; + } + } + +/*! + * \param d_tmp_thermo Temporary cell packed thermo element + * \param d_cell_np Number of particles per cell + * \param d_cell_vel Cell velocity + * \param d_cell_energy Cell kinetic energy + * \param d_cell_temperature Cell temperature + * \param N_cells Number of cells + * \tparam need_energy If true, compute the cell-level energy properties. + * + * \b Implementation details: + * Using one thread per \a temporary cell, the cell properties are normalized + * in a way suitable for reduction of net properties, e.g. the cell velocities + * are converted to momentum. The temperature is set to the cell energy, and a + * flag is set to 1 or 0 to indicate whether this cell has an energy that should + * be used in averaging the total temperature. + */ +template +__global__ void stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, + const unsigned int* d_cell_np, + const double4* d_cell_vel, + const double* d_cell_energy, + const double* d_cell_temp, + const unsigned int N_cells) + { + // one thread per cell + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N_cells) + return; + + const double4 vel_cell = d_cell_vel[idx]; + const double mass = vel_cell.w; + + // add accumulated momentum to net momentum + mpcd::detail::cell_thermo_element thermo; + thermo.momentum = make_double3(vel_cell.x * mass, vel_cell.y * mass, vel_cell.z * mass); + + if (need_energy) + { + if (d_cell_np[idx] > 1) + { + thermo.temperature = d_cell_temp[idx]; + thermo.flag = 1; + } + else + { + thermo.temperature = 0.; + thermo.flag = 0; + } + thermo.energy = d_cell_energy[idx]; } else { - d_embed_cell_ids[idx - N_mpcd] = bin_idx; + thermo.energy = 0.; + thermo.temperature = 0.; + thermo.flag = 0; } + + d_tmp_thermo[idx] = thermo; } /*! 
@@ -217,48 +352,21 @@ __global__ void cell_check_migrate_embed(unsigned int* d_migrate_flag, atomicMax(d_migrate_flag, 1); } } - -__global__ void cell_apply_sort(unsigned int* d_cell_list, - const unsigned int* d_rorder, - const unsigned int* d_cell_np, - const Index2D cli, - const unsigned int N_mpcd, - const unsigned int N_cli) - { - // one thread per cell-list entry - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N_cli) - return; - - // convert the entry 1D index into a 2D index - const unsigned int cell = idx / cli.getW(); - const unsigned int offset = idx - (cell * cli.getW()); - - /* here comes some terrible execution divergence */ - // check if the cell is filled - const unsigned int np = d_cell_np[cell]; - if (offset < np) - { - // check if this is an MPCD particle - const unsigned int pid = d_cell_list[idx]; - if (pid < N_mpcd) - { - d_cell_list[idx] = d_rorder[pid]; - } - } - } } // end namespace kernel } // end namespace gpu } // end namespace mpcd /*! * \param d_cell_np Array of number of particles per cell - * \param d_cell_list 2D array of MPCD particles in each cell + * \param d_cell_vel Array of center of mass velocity + cell mass per cell + * \param d_cell_energy Array of per cell kinetic energy, temperature, and dof * \param d_conditions Conditions flags for error reporting * \param d_vel MPCD particle velocities + * \param mpcd_mass MPCD particle mass * \param d_embed_cell_ids Cell indexes of embedded particles * \param d_pos MPCD particle positions * \param d_pos_embed Particle positions + * \param d_vel_embed Particle velocities * \param d_embed_member_idx Indexes of embedded particles in \a d_pos_embed * \param periodic Flags if local simulation is periodic * \param origin_idx Global origin index for the local box @@ -266,22 +374,24 @@ __global__ void cell_apply_sort(unsigned int* d_cell_list, * \param global_box Global simulation box * \param n_global_cell Global dimensions of the cell list, including padding * \param global_cell_dim Global cell dimensions, no padding - * \param cell_np_max Maximum number of particles per cell * \param cell_indexer 3D indexer for cell id - * \param cell_list_indexer 2D indexer for particle position in cell * \param N_mpcd Number of MPCD particles * \param N_tot Total number of particle (MPCD + embedded) + * \param need_energy True if computing the energy * \param block_size Number of threads per block * * \returns cudaSuccess on completion, or an error on failure */ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np, - unsigned int* d_cell_list, + double4* d_cell_vel, + double* d_cell_energy, uint3* d_conditions, Scalar4* d_vel, + double mpcd_mass, unsigned int* d_embed_cell_ids, const Scalar4* d_pos, const Scalar4* d_pos_embed, + const Scalar4* d_vel_embed, const unsigned int* d_embed_member_idx, const uchar3& periodic, const int3& origin_idx, @@ -289,11 +399,10 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np, const BoxDim& global_box, const uint3& n_global_cell, const uint3& global_cell_dim, - const unsigned int cell_np_max, const Index3D& cell_indexer, - const Index2D& cell_list_indexer, const unsigned int N_mpcd, const unsigned int N_tot, + const bool need_energy, const unsigned int block_size) { // set the number of particles in each cell to zero @@ -301,6 +410,17 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np, = cudaMemset(d_cell_np, 0, sizeof(unsigned int) * cell_indexer.getNumElements()); if (error != cudaSuccess) return error; + cudaError_t 
error_vel + = cudaMemset(d_cell_vel, 0, sizeof(double4) * cell_indexer.getNumElements()); + if (error_vel != cudaSuccess) + return error_vel; + if (need_energy) + { + cudaError_t error_energy + = cudaMemset(d_cell_energy, 0, sizeof(double) * cell_indexer.getNumElements()); + if (error_energy != cudaSuccess) + return error_energy; + } unsigned int max_block_size; cudaFuncAttributes attr; @@ -310,12 +430,15 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np, unsigned int run_block_size = min(block_size, max_block_size); dim3 grid(N_tot / run_block_size + 1); mpcd::gpu::kernel::compute_cell_list<<>>(d_cell_np, - d_cell_list, + d_cell_vel, + d_cell_energy, d_conditions, d_vel, + mpcd_mass, d_embed_cell_ids, d_pos, d_pos_embed, + d_vel_embed, d_embed_member_idx, periodic, origin_idx, @@ -323,12 +446,144 @@ cudaError_t mpcd::gpu::compute_cell_list(unsigned int* d_cell_np, global_box, n_global_cell, global_cell_dim, - cell_np_max, cell_indexer, - cell_list_indexer, N_mpcd, - N_tot); + N_tot, + need_energy); + + return cudaSuccess; + } + +/*! + * \param d_cell_np Number of particles in cells + * \param d_cell_vel Cell velocity to reduce + * \param d_cell_energy Cell kinetic energy + * \param d_cell_temp Cell temperature to finish computing + * \param N_cells Number of total cells + * \param N_dim Number of dimensions + * \param need_energy If true, compute the cell-level energy properties + * \param block_size Number of threads per block + * + * \returns cudaSuccess on completion + * + * \sa mpcd::gpu::kernel::finish_cell_properties + */ +cudaError_t mpcd::gpu::finish_cell_properties(const unsigned int* d_cell_np, + double4* d_cell_vel, + const double* d_cell_energy, + double* d_cell_temp, + const unsigned int N_cells, + const unsigned int N_dim, + const bool need_energy, + const unsigned int block_size) + { + // set all temperatures to zero + if (need_energy) + { + cudaError_t error_temp = cudaMemset(d_cell_temp, 0, sizeof(double) * N_cells); + if (error_temp != cudaSuccess) + return error_temp; + } + + unsigned int max_block_size; + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::finish_cell_properties); + max_block_size = attr.maxThreadsPerBlock; + + unsigned int run_block_size = min(block_size, max_block_size); + dim3 grid(N_cells / run_block_size + 1); + mpcd::gpu::kernel::finish_cell_properties<<>>(d_cell_np, + d_cell_vel, + d_cell_energy, + d_cell_temp, + N_cells, + N_dim, + need_energy); + + return cudaSuccess; + } + +/*! 
+ * \param d_tmp_thermo Temporary cell packed thermo element + * \param d_cell_np Number of particles per cell + * \param d_cell_vel Cell velocity + * \param d_cell_energy Cell kinetic energy + * \param d_cell_temp Cell temperature + * \param N_cells Number of total cells + * \param need_energy If true, compute the cell-level energy properties + * \param block_size Number of threads per block + * + * \returns cudaSuccess on completion + * + * \sa mpcd::gpu::kernel::stage_net_cell_thermo + */ +cudaError_t mpcd::gpu::stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, + const unsigned int* d_cell_np, + const double4* d_cell_vel, + const double* d_cell_energy, + const double* d_cell_temp, + const unsigned int N_cells, + const bool need_energy, + const unsigned int block_size) + { + if (need_energy) + { + unsigned int max_block_size_energy; + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::stage_net_cell_thermo); + max_block_size_energy = attr.maxThreadsPerBlock; + + unsigned int run_block_size = min(block_size, max_block_size_energy); + dim3 grid(N_cells / run_block_size + 1); + mpcd::gpu::kernel::stage_net_cell_thermo<<>>(d_tmp_thermo, + d_cell_np, + d_cell_vel, + d_cell_energy, + d_cell_temp, + N_cells); + } + else + { + unsigned int max_block_size_noenergy; + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::stage_net_cell_thermo); + max_block_size_noenergy = attr.maxThreadsPerBlock; + unsigned int run_block_size = min(block_size, max_block_size_noenergy); + dim3 grid(N_cells / run_block_size + 1); + mpcd::gpu::kernel::stage_net_cell_thermo<<>>(d_tmp_thermo, + d_cell_np, + d_cell_vel, + d_cell_energy, + d_cell_temp, + N_cells); + } + return cudaSuccess; + } + +/*! + * \param d_reduced Cell thermo properties reduced across all cells (output on second call) + * \param d_tmp Temporary storage for reduction (output on first call) + * \param tmp_bytes Number of bytes allocated for temporary storage (output on first call) + * \param d_tmp_thermo Cell thermo properties to reduce + * \param N_cells The number of cells to reduce across + * + * \returns cudaSuccess on completion + * + * \b Implementation details: + * CUB DeviceReduce is used to perform the reduction. Hence, this function requires + * two calls to perform the reduction. The first call sizes the temporary storage, + * which is returned in \a d_tmp and \a tmp_bytes. The caller must then allocate + * the required bytes, and call the function a second time. This performs the + * reduction and returns the result in \a d_reduced. 
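+ *
+ * A minimal usage sketch (mirroring CellListGPU::computeNetProperties above; the device
+ * allocation of \a d_tmp between the two calls is only indicated by a comment):
+ *
+ * \code
+ * void* d_tmp = NULL;
+ * size_t tmp_bytes = 0;
+ * // first call: query the required temporary storage size into tmp_bytes
+ * mpcd::gpu::reduce_net_cell_thermo(d_reduced, d_tmp, tmp_bytes, d_tmp_thermo, N_cells);
+ * // ... allocate tmp_bytes of device memory and point d_tmp at it ...
+ * // second call: perform the reduction into d_reduced
+ * mpcd::gpu::reduce_net_cell_thermo(d_reduced, d_tmp, tmp_bytes, d_tmp_thermo, N_cells);
+ * \endcode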
+ */ +cudaError_t mpcd::gpu::reduce_net_cell_thermo(mpcd::detail::cell_thermo_element* d_reduced, + void* d_tmp, + size_t& tmp_bytes, + const mpcd::detail::cell_thermo_element* d_tmp_thermo, + const size_t N_cells) + { + cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_tmp_thermo, d_reduced, (unsigned int)N_cells); return cudaSuccess; } @@ -370,30 +625,4 @@ cudaError_t mpcd::gpu::cell_check_migrate_embed(unsigned int* d_migrate_flag, return cudaSuccess; } -cudaError_t mpcd::gpu::cell_apply_sort(unsigned int* d_cell_list, - const unsigned int* d_rorder, - const unsigned int* d_cell_np, - const Index2D& cli, - const unsigned int N_mpcd, - const unsigned int block_size) - { - unsigned int max_block_size; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::cell_apply_sort); - max_block_size = attr.maxThreadsPerBlock; - - const unsigned int N_cli = cli.getNumElements(); - - unsigned int run_block_size = min(block_size, max_block_size); - dim3 grid(N_cli / run_block_size + 1); - mpcd::gpu::kernel::cell_apply_sort<<>>(d_cell_list, - d_rorder, - d_cell_np, - cli, - N_mpcd, - N_cli); - - return cudaSuccess; - } - } // end namespace hoomd diff --git a/hoomd/mpcd/CellListGPU.cuh b/hoomd/mpcd/CellListGPU.cuh index e74440ba81..d7609ba62c 100644 --- a/hoomd/mpcd/CellListGPU.cuh +++ b/hoomd/mpcd/CellListGPU.cuh @@ -19,16 +19,49 @@ namespace hoomd { namespace mpcd { +namespace detail + { +#ifdef __HIPCC__ +#define HOSTDEVICE __host__ __device__ +#else +#define HOSTDEVICE +#endif +//! Custom cell thermo element for reductions on the gpu +struct cell_thermo_element + { + double3 momentum; //!< Momentum of the cell + double energy; //!< Energy of the cell + double temperature; //!< Temperature of the cell (0 if < 2 particles) + unsigned int flag; //!< Flag to be used to compute filled cells + + //! Addition operator for summed reduction + HOSTDEVICE cell_thermo_element operator+(const cell_thermo_element& other) const + { + cell_thermo_element sum; + sum.momentum.x = momentum.x + other.momentum.x; + sum.momentum.y = momentum.y + other.momentum.y; + sum.momentum.z = momentum.z + other.momentum.z; + sum.energy = energy + other.energy; + sum.temperature = temperature + other.temperature; + sum.flag = flag + other.flag; + + return sum; + } + }; + } // namespace detail namespace gpu { //! Kernel driver to compute mpcd cell list cudaError_t compute_cell_list(unsigned int* d_cell_np, - unsigned int* d_cell_list, + double4* d_cell_vel, + double* d_cell_energy, uint3* d_conditions, Scalar4* d_vel, + double mpcd_mass, unsigned int* d_embed_cell_ids, const Scalar4* d_pos, const Scalar4* d_pos_embed, + const Scalar4* d_vel_embed, const unsigned int* d_embed_member_idx, const uchar3& periodic, const int3& origin_idx, @@ -36,13 +69,39 @@ cudaError_t compute_cell_list(unsigned int* d_cell_np, const BoxDim& global_box, const uint3& n_global_cell, const uint3& global_cell_dim, - const unsigned int cell_np_max, const Index3D& cell_indexer, - const Index2D& cell_list_indexer, const unsigned int N_mpcd, const unsigned int N_tot, + const bool need_energy, const unsigned int block_size); +//! Kernel driver to finalize cell property calculation +cudaError_t finish_cell_properties(const unsigned int* d_cell_np, + double4* d_cell_vel, + const double* d_cell_energy, + double* d_cell_temp, + const unsigned int N_cells, + const unsigned int N_dim, + const bool need_energy, + const unsigned int block_size); + +//! 
Kernel driver to normalize center of mass velocity and compute net properties +cudaError_t stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, + const unsigned int* d_cell_np, + const double4* d_cell_vel, + const double* d_cell_energy, + const double* d_cell_temp, + const unsigned int N_cells, + const bool need_energy, + const unsigned int block_size); + +//! Wrapper to cub device reduce for cell thermo properties +cudaError_t reduce_net_cell_thermo(mpcd::detail::cell_thermo_element* d_reduced, + void* d_tmp, + size_t& tmp_bytes, + const mpcd::detail::cell_thermo_element* d_tmp_thermo, + const size_t N_cells); + //! Kernel driver to check if any embedded particles require migration cudaError_t cell_check_migrate_embed(unsigned int* d_migrate_flag, const Scalar4* d_pos, @@ -52,14 +111,6 @@ cudaError_t cell_check_migrate_embed(unsigned int* d_migrate_flag, const unsigned int N, const unsigned int block_size); -//! Kernel drive to apply sorted order to MPCD particles in cell list -cudaError_t cell_apply_sort(unsigned int* d_cell_list, - const unsigned int* d_rorder, - const unsigned int* d_cell_np, - const Index2D& cli, - const unsigned int N_mpcd, - const unsigned int block_size); - } // end namespace gpu } // end namespace mpcd } // end namespace hoomd diff --git a/hoomd/mpcd/CellListGPU.h b/hoomd/mpcd/CellListGPU.h index f9f78f8779..a1e3e3d494 100644 --- a/hoomd/mpcd/CellListGPU.h +++ b/hoomd/mpcd/CellListGPU.h @@ -14,6 +14,7 @@ #endif #include "CellList.h" +#include "CellListGPU.cuh" #include "hoomd/Autotuner.h" namespace hoomd @@ -36,10 +37,11 @@ class PYBIND11_EXPORT CellListGPU : public mpcd::CellList //! Compute the cell list of particles on the GPU void buildCellList() override; - //! Callback to sort cell list on the GPU when particle data is sorted - virtual void sort(uint64_t timestep, - const GPUArray& order, - const GPUArray& rorder); + //! Do final cell property calculation + void finishComputeProperties() override; + + //! Compute the net properties of all the cells + void computeNetProperties() override; #ifdef ENABLE_MPI //! Determine if embedded particles require migration on the gpu @@ -48,11 +50,19 @@ class PYBIND11_EXPORT CellListGPU : public mpcd::CellList #endif // ENABLE_MPI private: + GPUVector + m_tmp_thermo; //!< Temporary array for holding cell data + GPUFlags m_reduced; //!< Flags to hold reduced sum + /// Autotuner for the cell list calculation. std::shared_ptr> m_tuner_cell; - /// Autotuner for sorting the cell list. - std::shared_ptr> m_tuner_sort; + /// Autotuner for finishing cell property calculation. + std::shared_ptr> m_tuner_property; + + /// Autotuner for the net property calculation. + std::shared_ptr> m_tuner_net; + #ifdef ENABLE_MPI /// Autotuner for checking embedded migration. std::shared_ptr> m_tuner_embed_migrate; diff --git a/hoomd/mpcd/CellThermoCompute.cc b/hoomd/mpcd/CellThermoCompute.cc deleted file mode 100644 index c77285e3f7..0000000000 --- a/hoomd/mpcd/CellThermoCompute.cc +++ /dev/null @@ -1,617 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellThermoCompute.cc - * \brief Definition of mpcd::CellThermoCompute - */ - -#include "CellThermoCompute.h" -#include "ReductionOperators.h" - -namespace hoomd - { -/*! 
- * \param sysdef System definition - */ -mpcd::CellThermoCompute::CellThermoCompute(std::shared_ptr sysdef, - std::shared_ptr cl) - : Compute(sysdef), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_cl(cl), - m_needs_net_reduce(true), m_cell_vel(m_exec_conf), m_cell_energy(m_exec_conf), - m_ncells_alloc(0) - { - m_exec_conf->msg->notice(5) << "Constructing MPCD CellThermoCompute" << std::endl; - - GPUArray net_properties(mpcd::detail::thermo_index::num_quantities, m_exec_conf); - m_net_properties.swap(net_properties); - -#ifdef ENABLE_MPI - if (m_exec_conf->getNRanks() > 1) - { - m_use_mpi = true; - m_vel_comm = std::make_shared(m_sysdef, m_cl); - m_energy_comm = std::make_shared(m_sysdef, m_cl); - } - else - { - m_use_mpi = false; - } -#endif // ENABLE_MPI - - // the thermo properties need to be recomputed if the virtual particles change - m_mpcd_pdata->getNumVirtualSignal() - .connect(this); - } - -mpcd::CellThermoCompute::~CellThermoCompute() - { - m_exec_conf->msg->notice(5) << "Destroying MPCD CellThermoCompute" << std::endl; - m_mpcd_pdata->getNumVirtualSignal() - .disconnect(this); - } - -void mpcd::CellThermoCompute::compute(uint64_t timestep) - { - Compute::compute(timestep); - // check if computation should proceed, and always mark the calculation as occurring at this - // timestep, even if forced - if (!shouldCompute(timestep)) - return; - m_last_computed = timestep; - - // cell list needs to be up to date first - m_cl->compute(timestep); - - // ensure optional flags are up to date - updateFlags(); - - const unsigned int ncells = m_cl->getNCells(); - if (ncells != m_ncells_alloc) - { - reallocate(ncells); - } - - computeCellProperties(timestep); - m_needs_net_reduce = true; - } - -void mpcd::CellThermoCompute::startAutotuning() - { - Compute::startAutotuning(); -#ifdef ENABLE_MPI - if (m_vel_comm) - m_vel_comm->startAutotuning(); - if (m_energy_comm) - m_energy_comm->startAutotuning(); -#endif // ENABLE_MPI - } - -bool mpcd::CellThermoCompute::isAutotuningComplete() - { - bool result = Compute::isAutotuningComplete(); -#ifdef ENABLE_MPI - if (m_vel_comm) - result = result && m_vel_comm->isAutotuningComplete(); - if (m_energy_comm) - result = result && m_energy_comm->isAutotuningComplete(); -#endif // ENABLE_MPI - return result; - } - -void mpcd::CellThermoCompute::computeCellProperties(uint64_t timestep) - { -/* - * In MPI simulations, begin by calculating the velocities and energies of - * cells that lie along the boundaries. These values will then be communicated - * while calculations proceed on the inner cells. - */ -#ifdef ENABLE_MPI - if (m_use_mpi) - { - beginOuterCellProperties(); - - m_vel_comm->begin(m_cell_vel, mpcd::detail::CellVelocityPackOp()); - if (m_flags[mpcd::detail::thermo_options::energy]) - m_energy_comm->begin(m_cell_energy, mpcd::detail::CellEnergyPackOp()); - } -#endif // ENABLE_MPI - - /* - * While communication is occurring on the outer cells, do the full calculation - * on the inner cells. In non-MPI simulations, only this part happens. - */ - calcInnerCellProperties(); - - /* - * Execute any additional callbacks that can be overlapped with outer communication. - */ - if (!m_callbacks.empty()) - m_callbacks.emit(timestep); - -/* - * In MPI simulations, we need to finalize the communication on the outer cells - * and normalize the communicated data. 
- */ -#ifdef ENABLE_MPI - if (m_use_mpi) - { - if (m_flags[mpcd::detail::thermo_options::energy]) - m_energy_comm->finalize(m_cell_energy, mpcd::detail::CellEnergyPackOp()); - m_vel_comm->finalize(m_cell_vel, mpcd::detail::CellVelocityPackOp()); - - finishOuterCellProperties(); - } -#endif // ENABLE_MPI - } - -namespace mpcd - { -namespace detail - { -//! Sums properties of an MPCD cell on the CPU -/*! - * This lightweight class is used in both beginOuterCellProperties() and - * calcInnerCellProperties(). The code has been consolidated into one place - * here to avoid some duplication. - */ -struct CellPropertySum - { - //! Constructor - /*! - * \param cell_list_ Cell list - * \param cell_np_ Number of particles per cell - * \param cli_ Cell list indexer - * \param vel_ MPCD particle velocities - * \param mass_ MPCD mass - * \param embed_vel_ Embedded particle velocities - * \param embed_idx_ Embedded particle indexes - * \param N_mpcd_ Number of MPCD particles - */ - CellPropertySum(const unsigned int* cell_list_, - const unsigned int* cell_np_, - const Index2D& cli_, - const Scalar4* vel_, - const Scalar mass_, - const Scalar4* embed_vel_, - const unsigned int* embed_idx_, - const unsigned int N_mpcd_) - : cell_list(cell_list_), cell_np(cell_np_), cli(cli_), vel(vel_), mass(mass_), - embed_vel(embed_vel_), embed_idx(embed_idx_), N_mpcd(N_mpcd_) - { - } - - //! Computes the total momentum, kinetic energy, and number of particles in a cell - /*! - * \param momentum Cell momentum (output) - * \param ke Cell kinetic energy (output) - * \param np Number of particles in cell (output) - * \param cell Index of cell to evaluate - * \param energy If true, then the kinetic energy is evaluated into \a ke - */ - inline void compute(double4& momentum, - double& ke, - unsigned int& np, - const unsigned int cell, - const bool energy) - { - momentum = make_double4(0.0, 0.0, 0.0, 0.0); - ke = 0.0; - np = cell_np[cell]; - - for (unsigned int offset = 0; offset < np; ++offset) - { - // Load particle data - const unsigned int cur_p = cell_list[cli(offset, cell)]; - double3 vel_i; - double mass_i; - if (cur_p < N_mpcd) - { - Scalar4 vel_cell = vel[cur_p]; - vel_i = make_double3(vel_cell.x, vel_cell.y, vel_cell.z); - mass_i = mass; - } - else - { - Scalar4 vel_m = embed_vel[embed_idx[cur_p - N_mpcd]]; - vel_i = make_double3(vel_m.x, vel_m.y, vel_m.z); - mass_i = vel_m.w; - } - - // add momentum - momentum.x += mass_i * vel_i.x; - momentum.y += mass_i * vel_i.y; - momentum.z += mass_i * vel_i.z; - momentum.w += mass_i; - - // also compute ke of the particle - if (energy) - ke += 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z); - } - } - - const unsigned int* cell_list; //!< Cell list - const unsigned int* cell_np; //!< Number of particles per cell - const Index2D cli; //!< Cell list indexer - - const Scalar4* vel; //!< MPCD particle velocities - const Scalar mass; //!< MPCD particle mass - const Scalar4* embed_vel; //!< Embedded particle velocities - const unsigned int* embed_idx; //!< Embedded particle indexes - const unsigned int N_mpcd; //!< Number of MPCD particles - }; - } // end namespace detail - } // end namespace mpcd - -#ifdef ENABLE_MPI -void mpcd::CellThermoCompute::beginOuterCellProperties() - { - // Cell list - ArrayHandle h_cell_list(m_cl->getCellList(), - access_location::host, - access_mode::read); - ArrayHandle h_cell_np(m_cl->getCellSizeArray(), - access_location::host, - access_mode::read); - const Index2D& cli = m_cl->getCellListIndexer(); - - // MPCD particle 
data - ArrayHandle h_vel(m_mpcd_pdata->getVelocities(), - access_location::host, - access_mode::read); - const Scalar mpcd_mass = m_mpcd_pdata->getMass(); - const unsigned int N_mpcd = m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(); - - // Embedded particle data - std::unique_ptr> h_embed_vel; - std::unique_ptr> h_embed_member_idx; - if (m_cl->getEmbeddedGroup()) - { - h_embed_vel.reset(new ArrayHandle(m_pdata->getVelocities(), - access_location::host, - access_mode::read)); - h_embed_member_idx.reset( - new ArrayHandle(m_cl->getEmbeddedGroup()->getIndexArray(), - access_location::host, - access_mode::read)); - } - - // Cell properties - ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::overwrite); - ArrayHandle h_cell_energy(m_cell_energy, - access_location::host, - access_mode::overwrite); - ArrayHandle h_cells(m_vel_comm->getCells(), - access_location::host, - access_mode::read); - mpcd::detail::CellPropertySum summer(h_cell_list.data, - h_cell_np.data, - cli, - h_vel.data, - mpcd_mass, - (m_cl->getEmbeddedGroup()) ? h_embed_vel->data : NULL, - (m_cl->getEmbeddedGroup()) ? h_embed_member_idx->data - : NULL, - N_mpcd); - - // Loop over all outer cells and compute total momentum, mass, energy - const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; - for (unsigned int idx = 0; idx < m_vel_comm->getNCells(); ++idx) - { - const unsigned int cur_cell = h_cells.data[idx]; - - // compute the cell properties - double4 momentum; - double ke(0.0); - unsigned int np(0); - summer.compute(momentum, ke, np, cur_cell, need_energy); - - h_cell_vel.data[cur_cell] = make_double4(momentum.x, momentum.y, momentum.z, momentum.w); - if (need_energy) - h_cell_energy.data[cur_cell] = make_double3(ke, 0.0, __int_as_double(np)); - } - } - -void mpcd::CellThermoCompute::finishOuterCellProperties() - { - ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::readwrite); - ArrayHandle h_cell_energy(m_cell_energy, - access_location::host, - access_mode::readwrite); - ArrayHandle h_cells(m_vel_comm->getCells(), - access_location::host, - access_mode::read); - - // Loop over all outer cells and normalize the summed quantities - const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; - for (unsigned int idx = 0; idx < m_vel_comm->getNCells(); ++idx) - { - const unsigned int cur_cell = h_cells.data[idx]; - - // average cell properties if the cell has mass - const double4 cell_vel = h_cell_vel.data[cur_cell]; - double3 vel_cm = make_double3(cell_vel.x, cell_vel.y, cell_vel.z); - const double mass = cell_vel.w; - - if (mass > 0.) - { - // average velocity is only defined when there is some mass in the cell - vel_cm.x /= mass; - vel_cm.y /= mass; - vel_cm.z /= mass; - } - h_cell_vel.data[cur_cell] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); - - if (need_energy) - { - const double3 cell_energy = h_cell_energy.data[cur_cell]; - const double ke = cell_energy.x; - double temp(0.0); - const unsigned int np = __double_as_int(cell_energy.z); - // temperature is only defined for 2 or more particles - if (np > 1) - { - const double ke_cm - = 0.5 * mass - * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); - temp = 2. 
* (ke - ke_cm) / (m_sysdef->getNDimensions() * (np - 1)); - } - h_cell_energy.data[cur_cell] = make_double3(ke, temp, __int_as_double(np)); - } - } - } -#endif // ENABLE_MPI - -void mpcd::CellThermoCompute::calcInnerCellProperties() - { - // Cell list - const Index2D& cli = m_cl->getCellListIndexer(); - ArrayHandle h_cell_list(m_cl->getCellList(), - access_location::host, - access_mode::read); - ArrayHandle h_cell_np(m_cl->getCellSizeArray(), - access_location::host, - access_mode::read); - - // MPCD particle data - const unsigned int N_mpcd = m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(); - const Scalar mpcd_mass = m_mpcd_pdata->getMass(); - ArrayHandle h_vel(m_mpcd_pdata->getVelocities(), - access_location::host, - access_mode::read); - - // Embedded particle data - std::unique_ptr> h_embed_vel; - std::unique_ptr> h_embed_member_idx; - if (m_cl->getEmbeddedGroup()) - { - h_embed_vel.reset(new ArrayHandle(m_pdata->getVelocities(), - access_location::host, - access_mode::read)); - h_embed_member_idx.reset( - new ArrayHandle(m_cl->getEmbeddedGroup()->getIndexArray(), - access_location::host, - access_mode::read)); - } - - // Cell properties - ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::readwrite); - ArrayHandle h_cell_energy(m_cell_energy, - access_location::host, - access_mode::readwrite); - mpcd::detail::CellPropertySum summer(h_cell_list.data, - h_cell_np.data, - cli, - h_vel.data, - mpcd_mass, - (m_cl->getEmbeddedGroup()) ? h_embed_vel->data : NULL, - (m_cl->getEmbeddedGroup()) ? h_embed_member_idx->data - : NULL, - N_mpcd); - - // determine which cells are inner - uint3 lo, hi; - const Index3D& ci = m_cl->getCellIndexer(); -#ifdef ENABLE_MPI - if (m_use_mpi) - { - auto num_comm_cells = m_cl->getNComm(); - lo = make_uint3(num_comm_cells[static_cast(mpcd::detail::face::west)], - num_comm_cells[static_cast(mpcd::detail::face::south)], - num_comm_cells[static_cast(mpcd::detail::face::down)]); - hi = make_uint3( - ci.getW() - num_comm_cells[static_cast(mpcd::detail::face::east)], - ci.getH() - num_comm_cells[static_cast(mpcd::detail::face::north)], - ci.getD() - num_comm_cells[static_cast(mpcd::detail::face::up)]); - } - else -#endif // ENABLE_MPI - { - lo = make_uint3(0, 0, 0); - hi = m_cl->getDim(); - } - - // iterate over all of the inner cells and compute average velocity, energy, temperature - const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; - for (unsigned int k = lo.z; k < hi.z; ++k) - { - for (unsigned int j = lo.y; j < hi.y; ++j) - { - for (unsigned int i = lo.x; i < hi.x; ++i) - { - const unsigned int cur_cell = ci(i, j, k); - - // compute the cell properties - double4 momentum; - double ke(0.0); - unsigned int np(0); - summer.compute(momentum, ke, np, cur_cell, need_energy); - - const double mass = momentum.w; - double3 vel_cm = make_double3(0.0, 0.0, 0.0); - if (mass > 0.) - { - vel_cm.x = momentum.x / mass; - vel_cm.y = momentum.y / mass; - vel_cm.z = momentum.z / mass; - } - - h_cell_vel.data[cur_cell] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); - if (need_energy) - { - double temp(0.0); - if (np > 1) - { - const double ke_cm - = 0.5 * mass - * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); - temp = 2. 
* (ke - ke_cm) / (m_sysdef->getNDimensions() * (np - 1)); - } - h_cell_energy.data[cur_cell] = make_double3(ke, temp, __int_as_double(np)); - } - } // i - } // j - } // k - } - -void mpcd::CellThermoCompute::computeNetProperties() - { - // first reduce the properties on the rank - unsigned int n_temp_cells = 0; - { - const Index3D& ci = m_cl->getCellIndexer(); - uint3 upper = make_uint3(ci.getW(), ci.getH(), ci.getD()); -#ifdef ENABLE_MPI - // in MPI, remove duplicate cells along direction of communication - if (m_use_mpi) - { - auto num_comm = m_cl->getNComm(); - upper.x -= num_comm[static_cast(mpcd::detail::face::east)]; - upper.y -= num_comm[static_cast(mpcd::detail::face::north)]; - upper.z -= num_comm[static_cast(mpcd::detail::face::up)]; - } -#endif // ENABLE_MPI - - ArrayHandle h_cell_vel(m_cell_vel, access_location::host, access_mode::read); - ArrayHandle h_cell_energy(m_cell_energy, access_location::host, access_mode::read); - - const bool need_energy = m_flags[mpcd::detail::thermo_options::energy]; - - double3 net_momentum = make_double3(0, 0, 0); - double energy(0.0), temp(0.0); - for (unsigned int k = 0; k < upper.z; ++k) - { - for (unsigned int j = 0; j < upper.y; ++j) - { - for (unsigned int i = 0; i < upper.x; ++i) - { - const unsigned int idx = ci(i, j, k); - - const double4 cell_vel_mass = h_cell_vel.data[idx]; - const double3 cell_vel - = make_double3(cell_vel_mass.x, cell_vel_mass.y, cell_vel_mass.z); - const double cell_mass = cell_vel_mass.w; - - net_momentum.x += cell_mass * cell_vel.x; - net_momentum.y += cell_mass * cell_vel.y; - net_momentum.z += cell_mass * cell_vel.z; - - if (need_energy) - { - const double3 cell_energy = h_cell_energy.data[idx]; - energy += cell_energy.x; - - if (__double_as_int(cell_energy.z) > 1) - { - temp += cell_energy.y; - ++n_temp_cells; - } - } - } - } - } - - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::overwrite); - h_net_properties.data[mpcd::detail::thermo_index::momentum_x] = net_momentum.x; - h_net_properties.data[mpcd::detail::thermo_index::momentum_y] = net_momentum.y; - h_net_properties.data[mpcd::detail::thermo_index::momentum_z] = net_momentum.z; - - h_net_properties.data[mpcd::detail::thermo_index::energy] = energy; - h_net_properties.data[mpcd::detail::thermo_index::temperature] = temp; - } - -#ifdef ENABLE_MPI - if (m_use_mpi) - { - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::readwrite); - MPI_Allreduce(MPI_IN_PLACE, - h_net_properties.data, - mpcd::detail::thermo_index::num_quantities, - MPI_DOUBLE, - MPI_SUM, - m_exec_conf->getMPICommunicator()); - - MPI_Allreduce(MPI_IN_PLACE, - &n_temp_cells, - 1, - MPI_UNSIGNED, - MPI_SUM, - m_exec_conf->getMPICommunicator()); - } -#endif // ENABLE_MPI - - if (n_temp_cells > 0) - { - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::readwrite); - h_net_properties.data[mpcd::detail::thermo_index::temperature] /= (double)n_temp_cells; - } - - m_needs_net_reduce = false; - } - -void mpcd::CellThermoCompute::updateFlags() - { - mpcd::detail::ThermoFlags flags; - - if (!m_flag_signal.empty()) - { - m_flag_signal.emit_accumulate([&](mpcd::detail::ThermoFlags f) { flags |= f; }); - } - - m_flags = flags; - } - -/*! 
- * \param ncells Number of cells - */ -void mpcd::CellThermoCompute::reallocate(unsigned int ncells) - { - // Grow arrays to match the size if necessary - m_cell_vel.resize(ncells); - m_cell_energy.resize(ncells); - - m_ncells_alloc = ncells; - } - -namespace mpcd - { -namespace detail - { -/*! - * \param m Python module to export to - */ -void export_CellThermoCompute(pybind11::module& m) - { - pybind11::class_>( - m, - "CellThermoCompute") - .def(pybind11::init, std::shared_ptr>()); - } - } // namespace detail - } // namespace mpcd - } // end namespace hoomd diff --git a/hoomd/mpcd/CellThermoCompute.h b/hoomd/mpcd/CellThermoCompute.h deleted file mode 100644 index 0f7c60923d..0000000000 --- a/hoomd/mpcd/CellThermoCompute.h +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellThermoCompute.h - * \brief Declaration of mpcd::CellThermoCompute - */ - -#ifndef MPCD_CELL_THERMO_COMPUTE_H_ -#define MPCD_CELL_THERMO_COMPUTE_H_ - -#ifdef __HIPCC__ -#error This header cannot be compiled by nvcc -#endif - -#include "CellList.h" -#include "CellThermoTypes.h" -#ifdef ENABLE_MPI -#include "CellCommunicator.h" -#endif // ENABLE_MPI - -#include "hoomd/Compute.h" -#include "hoomd/SystemDefinition.h" -#include "hoomd/extern/nano-signal-slot/nano_signal_slot.hpp" -#include - -namespace hoomd - { -namespace mpcd - { -//! Computes the cell (thermodynamic) properties -class PYBIND11_EXPORT CellThermoCompute : public Compute - { - public: - //! Constructor - CellThermoCompute(std::shared_ptr sysdef, std::shared_ptr cl); - - //! Destructor - virtual ~CellThermoCompute(); - - //! Compute the cell thermodynamic properties - void compute(uint64_t timestep) override; - - //! Start autotuning kernel launch parameters - void startAutotuning() override; - - //! Check if kernel autotuning is complete - bool isAutotuningComplete() override; - - //! Get the cell indexer for the attached cell list - const Index3D& getCellIndexer() const - { - return m_cl->getCellIndexer(); - } - - //! Get the cell velocities from the last call to compute - const GPUArray& getCellVelocities() const - { - return m_cell_vel; - } - - //! Get the cell energies from the last call to compute - const GPUArray& getCellEnergies() const - { - return m_cell_energy; - } - - //! Get the net momentum of the particles from the last call to compute - Scalar3 getNetMomentum() - { - if (m_needs_net_reduce) - computeNetProperties(); - - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::read); - const Scalar3 net_momentum - = make_scalar3(h_net_properties.data[mpcd::detail::thermo_index::momentum_x], - h_net_properties.data[mpcd::detail::thermo_index::momentum_y], - h_net_properties.data[mpcd::detail::thermo_index::momentum_z]); - return net_momentum; - } - - //! Get the net energy of the particles from the last call to compute - Scalar getNetEnergy() - { - if (!m_flags[mpcd::detail::thermo_options::energy]) - { - m_exec_conf->msg->error() - << "Energy requested from CellThermoCompute, but was not computed." << std::endl; - throw std::runtime_error("Net cell energy not available"); - } - - if (m_needs_net_reduce) - computeNetProperties(); - - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::read); - return h_net_properties.data[mpcd::detail::thermo_index::energy]; - } - - //! 
Get the average cell temperature from the last call to compute - Scalar getTemperature() - { - if (!m_flags[mpcd::detail::thermo_options::energy]) - { - m_exec_conf->msg->error() - << "Temperature requested from CellThermoCompute, but was not computed." - << std::endl; - throw std::runtime_error("Net cell temperature not available"); - } - if (m_needs_net_reduce) - computeNetProperties(); - - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::read); - return h_net_properties.data[mpcd::detail::thermo_index::temperature]; - } - - //! Get the signal for requested thermo flags - /*! - * \returns A signal that subscribers can attach a callback to in order - * to request certain data. - * - * For performance reasons, the CellThermoCompute should be able to - * supply many related cell-level quantities from a single kernel launch. - * However, sometimes these quantities are not needed, and it is better - * to skip calculating them. Subscribing classes can optionally request - * some of these quantities via a callback return mpcd::detail::ThermoFlags - * with the appropriate bits set. - */ - Nano::Signal& getFlagsSignal() - { - return m_flag_signal; - } - - //! Get the signal to attach a callback function - Nano::Signal& getCallbackSignal() - { - return m_callbacks; - } - - protected: - //! Compute the cell properties - void computeCellProperties(uint64_t timestep); - -#ifdef ENABLE_MPI - //! Begin the calculation of outer cell properties - virtual void beginOuterCellProperties(); - - //! Finish the calculation of outer cell properties - virtual void finishOuterCellProperties(); -#endif // ENABLE_MPI - - //! Calculate the inner cell properties - virtual void calcInnerCellProperties(); - - //! Compute the net properties from the cell properties - virtual void computeNetProperties(); - - std::shared_ptr m_mpcd_pdata; //!< MPCD particle data - std::shared_ptr m_cl; //!< MPCD cell list -#ifdef ENABLE_MPI - bool m_use_mpi; //!< Flag if communication is required - std::shared_ptr m_vel_comm; //!< Cell velocity communicator - std::shared_ptr m_energy_comm; //!< Cell energy communicator -#endif // ENABLE_MPI - - bool m_needs_net_reduce; //!< Flag if a net reduction is necessary - GPUArray m_net_properties; //!< Scalar properties of the system - - GPUVector m_cell_vel; //!< Average velocity of a cell + cell mass - GPUVector m_cell_energy; //!< Kinetic energy, unscaled temperature, dof in each cell - unsigned int m_ncells_alloc; //!< Number of cells allocated for - - Nano::Signal m_flag_signal; //!< Signal for requested flags - mpcd::detail::ThermoFlags m_flags; //!< Requested thermo flags - //! Updates the requested optional flags - void updateFlags(); - - Nano::Signal m_callbacks; //!< Signal for callback functions - - private: - //! Allocate memory per cell - void reallocate(unsigned int ncells); - - //! Slot for the number of virtual particles changing - /*! - * All thermo properties should be recomputed if the number of virtual particles changes. - * The cases where the thermo is first used without virtual particles - * followed by with virtual particles are small, and so it is easiest to just recompute. 
- */ - void slotNumVirtual() - { - m_force_compute = true; - } - }; - } // end namespace mpcd - } // end namespace hoomd -#endif // #define MPCD_CELL_THERMO_COMPUTE_H_ diff --git a/hoomd/mpcd/CellThermoComputeGPU.cc b/hoomd/mpcd/CellThermoComputeGPU.cc deleted file mode 100644 index bd75883cac..0000000000 --- a/hoomd/mpcd/CellThermoComputeGPU.cc +++ /dev/null @@ -1,374 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellThermoComputeGPU.cc - * \brief Definition of mpcd::CellThermoComputeGPU - */ - -#include "CellThermoComputeGPU.h" -#include "ReductionOperators.h" - -namespace hoomd - { -/*! - * \param sysdef System definition - * \param cl MPCD cell list - */ -mpcd::CellThermoComputeGPU::CellThermoComputeGPU(std::shared_ptr sysdef, - std::shared_ptr cl) - : mpcd::CellThermoCompute(sysdef, cl), m_tmp_thermo(m_exec_conf), m_reduced(m_exec_conf) - { - m_begin_tuner.reset(new Autotuner<2>({AutotunerBase::makeBlockSizeRange(m_exec_conf), - AutotunerBase::getTppListPow2(this->m_exec_conf)}, - m_exec_conf, - "mpcd_cell_thermo_begin")); - m_end_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_thermo_end")); - m_inner_tuner.reset(new Autotuner<2>({AutotunerBase::makeBlockSizeRange(m_exec_conf), - AutotunerBase::getTppListPow2(this->m_exec_conf)}, - m_exec_conf, - "mpcd_cell_thermo_inner")); - m_stage_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_cell_thermo_stage")); - - m_autotuners.insert(m_autotuners.end(), - {m_begin_tuner, m_end_tuner, m_inner_tuner, m_stage_tuner}); - } - -mpcd::CellThermoComputeGPU::~CellThermoComputeGPU() { } - -#ifdef ENABLE_MPI -void mpcd::CellThermoComputeGPU::beginOuterCellProperties() - { - ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::overwrite); - ArrayHandle d_cell_energy(m_cell_energy, - access_location::device, - access_mode::overwrite); - - ArrayHandle d_cells(m_vel_comm->getCells(), - access_location::device, - access_mode::read); - ArrayHandle d_cell_np(m_cl->getCellSizeArray(), - access_location::device, - access_mode::read); - ArrayHandle d_cell_list(m_cl->getCellList(), - access_location::device, - access_mode::read); - - ArrayHandle d_vel(m_mpcd_pdata->getVelocities(), - access_location::device, - access_mode::read); - - if (m_cl->getEmbeddedGroup()) - { - // Embedded particle data - ArrayHandle d_embed_vel(m_pdata->getVelocities(), - access_location::device, - access_mode::read); - ArrayHandle d_embed_cell(m_cl->getEmbeddedGroup()->getIndexArray(), - access_location::device, - access_mode::read); - - mpcd::detail::thermo_args_t args(d_cell_vel.data, - d_cell_energy.data, - d_cell_np.data, - d_cell_list.data, - m_cl->getCellListIndexer(), - d_vel.data, - m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(), - m_mpcd_pdata->getMass(), - d_embed_vel.data, - d_embed_cell.data, - m_flags[mpcd::detail::thermo_options::energy]); - - m_begin_tuner->begin(); - auto param = m_begin_tuner->getParam(); - const unsigned int block_size = param[0]; - const unsigned int tpp = param[1]; - gpu::begin_cell_thermo(args, d_cells.data, m_vel_comm->getNCells(), block_size, tpp); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_begin_tuner->end(); - } - else - { - mpcd::detail::thermo_args_t args(d_cell_vel.data, - d_cell_energy.data, - d_cell_np.data, - d_cell_list.data, - m_cl->getCellListIndexer(), 
- d_vel.data, - m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(), - m_mpcd_pdata->getMass(), - NULL, - NULL, - m_flags[mpcd::detail::thermo_options::energy]); - - m_begin_tuner->begin(); - auto param = m_begin_tuner->getParam(); - const unsigned int block_size = param[0]; - const unsigned int tpp = param[1]; - gpu::begin_cell_thermo(args, d_cells.data, m_vel_comm->getNCells(), block_size, tpp); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_begin_tuner->end(); - } - } - -void mpcd::CellThermoComputeGPU::finishOuterCellProperties() - { - ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::readwrite); - ArrayHandle d_cell_energy(m_cell_energy, - access_location::device, - access_mode::readwrite); - ArrayHandle d_cells(m_vel_comm->getCells(), - access_location::device, - access_mode::read); - m_end_tuner->begin(); - gpu::end_cell_thermo(d_cell_vel.data, - d_cell_energy.data, - d_cells.data, - m_vel_comm->getNCells(), - m_sysdef->getNDimensions(), - m_flags[mpcd::detail::thermo_options::energy], - m_end_tuner->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_end_tuner->end(); - } -#endif // ENABLE_MPI - -void mpcd::CellThermoComputeGPU::calcInnerCellProperties() - { - ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::overwrite); - ArrayHandle d_cell_energy(m_cell_energy, - access_location::device, - access_mode::overwrite); - - ArrayHandle d_cell_np(m_cl->getCellSizeArray(), - access_location::device, - access_mode::read); - ArrayHandle d_cell_list(m_cl->getCellList(), - access_location::device, - access_mode::read); - - ArrayHandle d_vel(m_mpcd_pdata->getVelocities(), - access_location::device, - access_mode::read); - - /* - * Determine the inner cell indexer and offset. The inner indexer is the cube containing - * all non-communicating cells, and its offset is the number of communicating cells on the - * lo side of the box. 
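A standalone sketch of the bound computation described in this comment (the names here are illustrative, not HOOMD API): the low corner of the inner region is the number of communication cells on the low side of each axis, and the high corner is the local cell dimension minus the communication cells on the high side.

#include <array>
#include <cstdio>

struct InnerRange
    {
    std::array<unsigned int, 3> lo, hi, dim;
    };

InnerRange compute_inner_range(const std::array<unsigned int, 3>& local_dim,
                               const std::array<unsigned int, 3>& ncomm_lo,
                               const std::array<unsigned int, 3>& ncomm_hi)
    {
    InnerRange r {};
    for (int k = 0; k < 3; ++k)
        {
        r.lo[k] = ncomm_lo[k];
        r.hi[k] = local_dim[k] - ncomm_hi[k];
        r.dim[k] = r.hi[k] - r.lo[k]; // extent of the inner indexer
        }
    return r;
    }

int main()
    {
    // e.g. a 12 x 12 x 12 local grid with 2 communication cells on each face
    const InnerRange r = compute_inner_range({12, 12, 12}, {2, 2, 2}, {2, 2, 2});
    std::printf("inner cells: %u x %u x %u starting at (%u, %u, %u)\n",
                r.dim[0], r.dim[1], r.dim[2], r.lo[0], r.lo[1], r.lo[2]);
    return 0;
    }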
- */ - uint3 lo, hi; - const Index3D& ci = m_cl->getCellIndexer(); -#ifdef ENABLE_MPI - if (m_use_mpi) - { - auto num_comm_cells = m_cl->getNComm(); - lo = make_uint3(num_comm_cells[static_cast(mpcd::detail::face::west)], - num_comm_cells[static_cast(mpcd::detail::face::south)], - num_comm_cells[static_cast(mpcd::detail::face::down)]); - hi = make_uint3( - ci.getW() - num_comm_cells[static_cast(mpcd::detail::face::east)], - ci.getH() - num_comm_cells[static_cast(mpcd::detail::face::north)], - ci.getD() - num_comm_cells[static_cast(mpcd::detail::face::up)]); - } - else -#endif // ENABLE_MPI - { - lo = make_uint3(0, 0, 0); - hi = m_cl->getDim(); - } - Index3D inner_ci(hi.x - lo.x, hi.y - lo.y, hi.z - lo.z); - - if (m_cl->getEmbeddedGroup()) - { - // Embedded particle data - ArrayHandle d_embed_vel(m_pdata->getVelocities(), - access_location::device, - access_mode::read); - ArrayHandle d_embed_cell(m_cl->getEmbeddedGroup()->getIndexArray(), - access_location::device, - access_mode::read); - - mpcd::detail::thermo_args_t args(d_cell_vel.data, - d_cell_energy.data, - d_cell_np.data, - d_cell_list.data, - m_cl->getCellListIndexer(), - d_vel.data, - m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(), - m_mpcd_pdata->getMass(), - d_embed_vel.data, - d_embed_cell.data, - m_flags[mpcd::detail::thermo_options::energy]); - - m_inner_tuner->begin(); - auto param = m_inner_tuner->getParam(); - const unsigned int block_size = param[0]; - const unsigned int tpp = param[1]; - gpu::inner_cell_thermo(args, ci, inner_ci, lo, m_sysdef->getNDimensions(), block_size, tpp); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_inner_tuner->end(); - } - else - { - mpcd::detail::thermo_args_t args(d_cell_vel.data, - d_cell_energy.data, - d_cell_np.data, - d_cell_list.data, - m_cl->getCellListIndexer(), - d_vel.data, - m_mpcd_pdata->getN() + m_mpcd_pdata->getNVirtual(), - m_mpcd_pdata->getMass(), - NULL, - NULL, - m_flags[mpcd::detail::thermo_options::energy]); - - m_inner_tuner->begin(); - auto param = m_inner_tuner->getParam(); - const unsigned int block_size = param[0]; - const unsigned int tpp = param[1]; - gpu::inner_cell_thermo(args, ci, inner_ci, lo, m_sysdef->getNDimensions(), block_size, tpp); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_inner_tuner->end(); - } - } - -void mpcd::CellThermoComputeGPU::computeNetProperties() - { - // first reduce the properties on the rank - { - const Index3D& ci = m_cl->getCellIndexer(); - uint3 upper = make_uint3(ci.getW(), ci.getH(), ci.getD()); -#ifdef ENABLE_MPI - // in MPI, remove duplicate cells along direction of communication - if (m_use_mpi) - { - auto num_comm = m_cl->getNComm(); - upper.x -= num_comm[static_cast(mpcd::detail::face::east)]; - upper.y -= num_comm[static_cast(mpcd::detail::face::north)]; - upper.z -= num_comm[static_cast(mpcd::detail::face::up)]; - } -#endif // ENABLE_MPI - - // temporary cell indexer for mapping 1d kernel threads to 3d grid - const Index3D tmp_ci(upper.x, upper.y, upper.z); - m_tmp_thermo.resize(tmp_ci.getNumElements()); - - ArrayHandle d_tmp_thermo(m_tmp_thermo, - access_location::device, - access_mode::overwrite); - ArrayHandle d_cell_vel(m_cell_vel, access_location::device, access_mode::read); - ArrayHandle d_cell_energy(m_cell_energy, - access_location::device, - access_mode::read); - - m_stage_tuner->begin(); - mpcd::gpu::stage_net_cell_thermo(d_tmp_thermo.data, - d_cell_vel.data, - d_cell_energy.data, - tmp_ci, - ci, - m_flags[mpcd::detail::thermo_options::energy], - 
m_stage_tuner->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_stage_tuner->end(); - - // use cub to reduce the properties on the gpu - void* d_tmp = NULL; - size_t tmp_bytes = 0; - mpcd::gpu::reduce_net_cell_thermo(m_reduced.getDeviceFlags(), - d_tmp, - tmp_bytes, - d_tmp_thermo.data, - m_tmp_thermo.size()); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - - ScopedAllocation d_tmp_alloc(m_exec_conf->getCachedAllocator(), - (tmp_bytes > 0) ? tmp_bytes : 1); - d_tmp = (void*)d_tmp_alloc(); - - mpcd::gpu::reduce_net_cell_thermo(m_reduced.getDeviceFlags(), - d_tmp, - tmp_bytes, - d_tmp_thermo.data, - m_tmp_thermo.size()); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - } - - // now copy the net properties back to host from the flags - unsigned int n_temp_cells = 0; - { - const mpcd::detail::cell_thermo_element reduced = m_reduced.readFlags(); - - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::overwrite); - h_net_properties.data[mpcd::detail::thermo_index::momentum_x] = reduced.momentum.x; - h_net_properties.data[mpcd::detail::thermo_index::momentum_y] = reduced.momentum.y; - h_net_properties.data[mpcd::detail::thermo_index::momentum_z] = reduced.momentum.z; - - h_net_properties.data[mpcd::detail::thermo_index::energy] = reduced.energy; - h_net_properties.data[mpcd::detail::thermo_index::temperature] = reduced.temperature; - - n_temp_cells = reduced.flag; - } - -#ifdef ENABLE_MPI - if (m_use_mpi) - { - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::readwrite); - MPI_Allreduce(MPI_IN_PLACE, - h_net_properties.data, - mpcd::detail::thermo_index::num_quantities, - MPI_DOUBLE, - MPI_SUM, - m_exec_conf->getMPICommunicator()); - - MPI_Allreduce(MPI_IN_PLACE, - &n_temp_cells, - 1, - MPI_UNSIGNED, - MPI_SUM, - m_exec_conf->getMPICommunicator()); - } -#endif // ENABLE_MPI - - if (n_temp_cells > 0) - { - ArrayHandle h_net_properties(m_net_properties, - access_location::host, - access_mode::readwrite); - h_net_properties.data[mpcd::detail::thermo_index::temperature] /= (double)n_temp_cells; - } - - m_needs_net_reduce = false; - } - -namespace mpcd - { -namespace detail - { -void export_CellThermoComputeGPU(pybind11::module& m) - { - pybind11::class_>(m, "CellThermoComputeGPU") - .def(pybind11::init, std::shared_ptr>()); - } - } // namespace detail - } // namespace mpcd - } // end namespace hoomd diff --git a/hoomd/mpcd/CellThermoComputeGPU.cu b/hoomd/mpcd/CellThermoComputeGPU.cu deleted file mode 100644 index 1059096075..0000000000 --- a/hoomd/mpcd/CellThermoComputeGPU.cu +++ /dev/null @@ -1,808 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellThermoComputeGPU.cu - * \brief Explicitly instantiates reduction operators and declares kernel drivers - * for mpcd::CellThermoComputeGPU. - */ - -#include - -#include "CellThermoComputeGPU.cuh" -#include "CellThermoTypes.h" - -#include "CellCommunicator.cuh" -#include "ReductionOperators.h" - -#include "hoomd/WarpTools.cuh" - -namespace hoomd - { -namespace mpcd - { -namespace gpu - { -namespace kernel - { -//! Begins the cell thermo compute by summing cell quantities on outer cells -/*! 
- * \param d_cell_vel Velocity and mass per cell (output) - * \param d_cell_energy Energy, temperature, number of particles per cell (output) - * \param d_cells Cell indexes to compute - * \param d_cell_np Number of particles per cell - * \param d_cell_list MPCD cell list - * \param cli Indexer into the cell list - * \param d_vel MPCD particle velocities - * \param N_mpcd Number of MPCD particles - * \param mpcd_mass Mass of MPCD particle - * \param d_embed_vel Embedded particle velocity - * \param d_embed_idx Embedded particle indexes - * \param num_cells Number of cells to compute for - * - * \tparam need_energy If true, compute the cell-level energy properties - * \tparam tpp Number of threads to use per cell - * - * \b Implementation details: - * Using \a tpp threads per cell, the cell properties are accumulated into \a d_cell_vel - * and \a d_cell_energy. Shuffle-based intrinsics are used to reduce the accumulated - * properties per-cell, and the first thread for each cell writes the result into - * global memory. - */ -template -__global__ void begin_cell_thermo(double4* d_cell_vel, - double3* d_cell_energy, - const unsigned int* d_cells, - const unsigned int* d_cell_np, - const unsigned int* d_cell_list, - const Index2D cli, - const Scalar4* d_vel, - const unsigned int N_mpcd, - const Scalar mpcd_mass, - const Scalar4* d_embed_vel, - const unsigned int* d_embed_idx, - const unsigned int num_cells) - { - // tpp threads per cell - unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= tpp * num_cells) - return; - - const unsigned int cell_id = d_cells[idx / tpp]; - const unsigned int np = d_cell_np[cell_id]; - double4 momentum = make_double4(0.0, 0.0, 0.0, 0.0); - double ke(0.0); - - for (unsigned int offset = (idx % tpp); offset < np; offset += tpp) - { - // Load particle data - const unsigned int cur_p = d_cell_list[cli(offset, cell_id)]; - double3 vel_i; - double mass_i; - if (cur_p < N_mpcd) - { - Scalar4 vel_cell = d_vel[cur_p]; - vel_i = make_double3(vel_cell.x, vel_cell.y, vel_cell.z); - mass_i = mpcd_mass; - } - else - { - Scalar4 vel_m = d_embed_vel[d_embed_idx[cur_p - N_mpcd]]; - vel_i = make_double3(vel_m.x, vel_m.y, vel_m.z); - mass_i = vel_m.w; - } - - // add momentum - momentum.x += mass_i * vel_i.x; - momentum.y += mass_i * vel_i.y; - momentum.z += mass_i * vel_i.z; - momentum.w += mass_i; - - // also compute ke of the particle - if (need_energy) - ke += (double)(0.5) * mass_i - * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z); - } - - // reduce quantities down into the 0-th lane per logical warp - if (tpp > 1) - { - hoomd::detail::WarpReduce reducer; - momentum.x = reducer.Sum(momentum.x); - momentum.y = reducer.Sum(momentum.y); - momentum.z = reducer.Sum(momentum.z); - momentum.w = reducer.Sum(momentum.w); - if (need_energy) - ke = reducer.Sum(ke); - } - - // 0-th lane in each warp writes the result - if (idx % tpp == 0) - { - d_cell_vel[cell_id] = make_double4(momentum.x, momentum.y, momentum.z, momentum.w); - if (need_energy) - d_cell_energy[cell_id] = make_double3(ke, 0.0, __int_as_double(np)); - } - } - -//! Finalizes the cell thermo compute by properly averaging cell quantities -/*! - * \param d_cell_vel Cell velocity and masses - * \param d_cell_energy Cell energy and temperature - * \param d_cells Cells to compute for - * \param Ncell Number of cells - * \param n_dimensions Number of dimensions in system - * - * \tparam need_energy If true, compute the cell-level energy properties. 
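A minimal, generic CUDA sketch of the shuffle-based per-cell reduction that begin_cell_thermo (and inner_cell_thermo below) performs, written with raw __shfl_down_sync rather than the internal hoomd::detail::WarpReduce helper. TPP, the pitch-based cell-list layout, and all names are illustrative assumptions, not the deleted kernel itself.

#include <cuda_runtime.h>

// TPP cooperating threads accumulate one cell's entries, reduce within the
// TPP-wide logical warp, and lane 0 writes the per-cell result.
template<unsigned int TPP> // power of two, TPP <= 32
__global__ void cell_sum(double* d_cell_sum,
                         const double* d_values,       // cell-list ordered values
                         const unsigned int* d_cell_np,
                         const unsigned int pitch,      // assumed layout: cell * pitch + offset
                         const unsigned int num_cells)
    {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const bool active = (idx < TPP * num_cells);
    const unsigned int cell = active ? idx / TPP : 0;

    // strided accumulation: thread (idx % TPP) handles every TPP-th entry of its cell
    double sum = 0.0;
    if (active)
        {
        const unsigned int np = d_cell_np[cell];
        for (unsigned int offset = idx % TPP; offset < np; offset += TPP)
            sum += d_values[cell * pitch + offset];
        }

    // shuffle-based reduction within the logical warp; inactive lanes contribute 0
    for (unsigned int delta = TPP / 2; delta > 0; delta /= 2)
        sum += __shfl_down_sync(0xffffffffu, sum, delta, TPP);

    // first lane of each logical warp writes the result
    if (active && (idx % TPP) == 0)
        d_cell_sum[cell] = sum;
    }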
- * - * \b Implementation details: - * Using one thread per cell, the properties are averaged by mass, number of particles, - * etc. The temperature is computed from the cell kinetic energy. - */ -template -__global__ void end_cell_thermo(double4* d_cell_vel, - double3* d_cell_energy, - const unsigned int* d_cells, - const unsigned int Ncell, - const unsigned int n_dimensions) - { - // one thread per cell - unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= Ncell) - return; - - const unsigned int cell_id = d_cells[idx]; - - // average cell properties if the cell has mass - const double4 cell_vel = d_cell_vel[cell_id]; - double3 vel_cm = make_double3(cell_vel.x, cell_vel.y, cell_vel.z); - const double mass = cell_vel.w; - - if (mass > 0.) - { - // average velocity is only defined when there is some mass in the cell - vel_cm.x /= mass; - vel_cm.y /= mass; - vel_cm.z /= mass; - } - d_cell_vel[cell_id] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); - - if (need_energy) - { - const double3 cell_energy = d_cell_energy[cell_id]; - const double ke = cell_energy.x; - double temp(0.0); - const unsigned int np = __double_as_int(cell_energy.z); - // temperature is only defined for 2 or more particles - if (np > 1) - { - const double ke_cm - = 0.5 * mass * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); - temp = 2. * (ke - ke_cm) / (n_dimensions * (np - 1)); - } - d_cell_energy[cell_id] = make_double3(ke, temp, __int_as_double(np)); - } - } - -//! Computes the cell thermo for inner cells -/*! - * \param d_cell_vel Velocity and mass per cell (output) - * \param d_cell_energy Energy, temperature, number of particles per cell (output) - * \param ci Cell indexer - * \param inner_ci Cell indexer for the inner cells - * \param offset Offset of \a inner_ci from \a ci - * \param d_cell_np Number of particles per cell - * \param d_cell_list MPCD cell list - * \param cli Indexer into the cell list - * \param d_vel MPCD particle velocities - * \param N_mpcd Number of MPCD particles - * \param mpcd_mass Mass of MPCD particle - * \param d_embed_vel Embedded particle velocity - * \param d_embed_idx Embedded particle indexes - * \param n_dimensions System dimensionality - * - * \tparam need_energy If true, compute the cell-level energy properties. - * \tparam tpp Number of threads to use per cell - * - * \b Implementation details: - * Using \a tpp threads per cell, the cell properties are accumulated into \a d_cell_vel - * and \a d_cell_energy. Shuffle-based intrinsics are used to reduce the accumulated - * properties per-cell, and the first thread for each cell writes the result into - * global memory. The properties are properly normalized - * - * See mpcd::gpu::kernel::begin_cell_thermo for an almost identical implementation - * without the normalization at the end, which is used for the outer cells. 
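The cell temperature used by end_cell_thermo (and again by inner_cell_thermo below) can be read in isolation: with k_B = 1 in reduced units and d(N-1) internal degrees of freedom after removing the center-of-mass motion, T = 2(KE - KE_cm) / (d(N-1)), defined only for cells with at least two particles. A small host-side illustration with hypothetical names:

#include <cstddef>
#include <vector>

struct Particle
    {
    double m, vx, vy, vz;
    };

// Cell temperature from total and center-of-mass kinetic energy; returns 0 for
// cells with fewer than two particles, matching the np > 1 check in the kernel.
double cell_temperature(const std::vector<Particle>& cell, unsigned int dim)
    {
    const std::size_t n = cell.size();
    if (n < 2)
        return 0.0;

    double mass = 0., px = 0., py = 0., pz = 0., ke = 0.;
    for (const Particle& p : cell)
        {
        mass += p.m;
        px += p.m * p.vx;
        py += p.m * p.vy;
        pz += p.m * p.vz;
        ke += 0.5 * p.m * (p.vx * p.vx + p.vy * p.vy + p.vz * p.vz);
        }

    // KE of the center of mass: 0.5 * |p|^2 / M
    const double ke_cm = 0.5 * (px * px + py * py + pz * pz) / mass;
    return 2. * (ke - ke_cm) / (dim * (n - 1));
    }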
- */ -template -__global__ void inner_cell_thermo(double4* d_cell_vel, - double3* d_cell_energy, - const Index3D ci, - const Index3D inner_ci, - const uint3 offset, - const unsigned int* d_cell_np, - const unsigned int* d_cell_list, - const Index2D cli, - const Scalar4* d_vel, - const unsigned int N_mpcd, - const Scalar mpcd_mass, - const Scalar4* d_embed_vel, - const unsigned int* d_embed_idx, - const unsigned int n_dimensions) - { - // tpp threads per cell - unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= tpp * inner_ci.getNumElements()) - return; - - // reinterpret the thread id as a cell by first mapping the thread into the inner indexer, - // shifting by the offset of the inner indexer from the full indexer, and then compressing - // back into a 1D cell id - const uint3 inner_cell = inner_ci.getTriple(idx / tpp); - const uint3 cell - = make_uint3(inner_cell.x + offset.x, inner_cell.y + offset.y, inner_cell.z + offset.z); - const unsigned int cell_id = ci(cell.x, cell.y, cell.z); - - const unsigned int np = d_cell_np[cell_id]; - double4 momentum = make_double4(0.0, 0.0, 0.0, 0.0); - double ke(0.0); - - for (unsigned int offset = (idx % tpp); offset < np; offset += tpp) - { - // Load particle data - const unsigned int cur_p = d_cell_list[cli(offset, cell_id)]; - double3 vel_i; - double mass_i; - if (cur_p < N_mpcd) - { - Scalar4 vel_cell = d_vel[cur_p]; - vel_i = make_double3(vel_cell.x, vel_cell.y, vel_cell.z); - mass_i = mpcd_mass; - } - else - { - Scalar4 vel_m = d_embed_vel[d_embed_idx[cur_p - N_mpcd]]; - vel_i = make_double3(vel_m.x, vel_m.y, vel_m.z); - mass_i = vel_m.w; - } - - // add momentum - momentum.x += mass_i * vel_i.x; - momentum.y += mass_i * vel_i.y; - momentum.z += mass_i * vel_i.z; - momentum.w += mass_i; - - // also compute ke of the particle - if (need_energy) - ke += 0.5 * mass_i * (vel_i.x * vel_i.x + vel_i.y * vel_i.y + vel_i.z * vel_i.z); - } - - // reduce quantities down into the 0-th lane per logical warp - if (tpp > 1) - { - hoomd::detail::WarpReduce reducer; - momentum.x = reducer.Sum(momentum.x); - momentum.y = reducer.Sum(momentum.y); - momentum.z = reducer.Sum(momentum.z); - momentum.w = reducer.Sum(momentum.w); - if (need_energy) - ke = reducer.Sum(ke); - } - - // 0-th lane in each warp writes the result - if (idx % tpp == 0) - { - const double mass = momentum.w; - double3 vel_cm = make_double3(0.0, 0.0, 0.0); - if (mass > 0.) - { - vel_cm.x = momentum.x / mass; - vel_cm.y = momentum.y / mass; - vel_cm.z = momentum.z / mass; - } - d_cell_vel[cell_id] = make_double4(vel_cm.x, vel_cm.y, vel_cm.z, mass); - - if (need_energy) - { - double temp(0.0); - if (np > 1) - { - const double ke_cm - = 0.5 * mass - * (vel_cm.x * vel_cm.x + vel_cm.y * vel_cm.y + vel_cm.z * vel_cm.z); - temp = 2. * (ke - ke_cm) / (n_dimensions * (np - 1)); - } - d_cell_energy[cell_id] = make_double3(ke, temp, __int_as_double(np)); - } - } - } - -/*! - * \param d_tmp_thermo Temporary cell packed thermo element - * \param d_cell_vel Cell velocity to reduce - * \param d_cell_energy Cell energy to reduce - * \param tmp_ci Temporary cell indexer for cells undergoing reduction - * \param ci Cell indexer Regular cell list indexer - * - * \tparam need_energy If true, compute the cell-level energy properties. - * - * \b Implementation details: - * Using one thread per \a temporary cell, the cell properties are normalized - * in a way suitable for reduction of net properties, e.g. the cell velocities - * are converted to momentum. 
The temperature is set to the cell energy, and a - * flag is set to 1 or 0 to indicate whether this cell has an energy that should - * be used in averaging the total temperature. - */ -template -__global__ void stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, - const double4* d_cell_vel, - const double3* d_cell_energy, - const Index3D tmp_ci, - const Index3D ci) - { - // one thread per cell - unsigned int tmp_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (tmp_idx >= tmp_ci.getNumElements()) - return; - - // use the temporary cell indexer to map to a cell, then use the real cell indexer to - // get the read index - uint3 cell = tmp_ci.getTriple(tmp_idx); - const unsigned int idx = ci(cell.x, cell.y, cell.z); - - const double4 vel_mass = d_cell_vel[idx]; - const double3 vel = make_double3(vel_mass.x, vel_mass.y, vel_mass.z); - const double mass = vel_mass.w; - - mpcd::detail::cell_thermo_element thermo; - thermo.momentum = make_double3(mass * vel.x, mass * vel.y, mass * vel.z); - - if (need_energy) - { - const double3 cell_energy = d_cell_energy[idx]; - thermo.energy = cell_energy.x; - if (__double_as_int(cell_energy.z) > 1) - { - thermo.temperature = cell_energy.y; - thermo.flag = 1; - } - else - { - thermo.temperature = 0.0; - thermo.flag = 0; - } - } - else - { - thermo.energy = 0.; - thermo.temperature = 0.; - thermo.flag = 0; - } - - d_tmp_thermo[tmp_idx] = thermo; - } - - } // end namespace kernel - -//! Templated launcher for multiple threads-per-cell kernel for outer cells -/* - * \param args Common arguments to thermo kernels - * \param d_cells Cell indexes to compute - * \param num_cells Number of cells to compute for - * \param block_size Number of threads per block - * \param tpp Number of threads to use per-cell - * - * \tparam cur_tpp Number of threads-per-cell for this template instantiation - * - * Launchers are recursively instantiated at compile-time in order to match the - * correct number of threads at runtime. If the templated number of threads matches - * the runtime number of threads, then the kernel is launched. Otherwise, the - * next template (with threads reduced by a factor of 2) is launched. This - * recursion is broken by a specialized template for 0 threads, which does no - * work. 
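The compile-time recursion described here reduces to a small generic pattern (a sketch with a placeholder kernel and names, not the deleted launcher): walk down powers of two until the template parameter matches the runtime threads-per-cell, launch that instantiation, and let an explicit specialization for 0 terminate the recursion.

#include <cuda_runtime.h>

template<unsigned int TPP>
__global__ void work_kernel(unsigned int* d_out, const unsigned int num_cells)
    {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < TPP * num_cells)
        d_out[idx] = TPP; // stand-in for the real per-cell work
    }

template<unsigned int CUR_TPP>
inline void launch_work(unsigned int* d_out,
                        const unsigned int num_cells,
                        const unsigned int block_size,
                        const unsigned int tpp)
    {
    if (CUR_TPP == tpp)
        {
        const dim3 grid(CUR_TPP * num_cells / block_size + 1);
        work_kernel<CUR_TPP><<<grid, block_size>>>(d_out, num_cells);
        }
    else
        {
        // try the next smaller power of two at compile time
        launch_work<CUR_TPP / 2>(d_out, num_cells, block_size, tpp);
        }
    }

// explicit specialization breaks the recursion; it never launches anything
template<>
inline void launch_work<0>(unsigned int*, const unsigned int, const unsigned int, const unsigned int)
    {
    }

// usage: launch_work<32>(d_out, num_cells, 128, tpp); // tpp must be a power of two <= 32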
- */ -template -inline void launch_begin_cell_thermo(const mpcd::detail::thermo_args_t& args, - const unsigned int* d_cells, - const unsigned int num_cells, - const unsigned int block_size, - const unsigned int tpp) - { - if (cur_tpp == tpp) - { - if (args.need_energy) - { - unsigned int max_block_size_energy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, - (const void*)mpcd::gpu::kernel::begin_cell_thermo); - max_block_size_energy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_energy); - dim3 grid(cur_tpp * num_cells / run_block_size + 1); - mpcd::gpu::kernel::begin_cell_thermo - <<>>(args.cell_vel, - args.cell_energy, - d_cells, - args.cell_np, - args.cell_list, - args.cli, - args.vel, - args.N_mpcd, - args.mass, - args.embed_vel, - args.embed_idx, - num_cells); - } - else - { - unsigned int max_block_size_noenergy; - cudaFuncAttributes attr; - cudaFuncGetAttributes( - &attr, - (const void*)mpcd::gpu::kernel::begin_cell_thermo); - max_block_size_noenergy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_noenergy); - dim3 grid(cur_tpp * num_cells / run_block_size + 1); - mpcd::gpu::kernel::begin_cell_thermo - <<>>(args.cell_vel, - args.cell_energy, - d_cells, - args.cell_np, - args.cell_list, - args.cli, - args.vel, - args.N_mpcd, - args.mass, - args.embed_vel, - args.embed_idx, - num_cells); - } - } - else - { - launch_begin_cell_thermo(args, d_cells, num_cells, block_size, tpp); - } - } -//! Template specialization to break recursion -template<> -inline void launch_begin_cell_thermo<0>(const mpcd::detail::thermo_args_t& args, - const unsigned int* d_cells, - const unsigned int num_cells, - const unsigned int block_size, - const unsigned int tpp) - { - } - -/* - * \param args Common arguments to thermo kernels - * \param d_cells Cell indexes to compute - * \param num_cells Number of cells to compute for - * \param block_size Number of threads per block - * \param tpp Number of threads per cell - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::launch_begin_cell_thermo - * \sa mpcd::gpu::kernel::begin_cell_thermo - */ -cudaError_t begin_cell_thermo(const mpcd::detail::thermo_args_t& args, - const unsigned int* d_cells, - const unsigned int num_cells, - const unsigned int block_size, - const unsigned int tpp) - { - if (num_cells == 0) - return cudaSuccess; - - launch_begin_cell_thermo<32>(args, d_cells, num_cells, block_size, tpp); - return cudaSuccess; - } - -/*! 
- * \param d_cell_vel Cell velocity and masses - * \param d_cell_energy Cell energy and temperature - * \param d_cells Cells to compute for - * \param Ncell Number of cells - * \param n_dimensions Number of dimensions in system - * \param need_energy If true, compute the cell-level energy properties - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::end_cell_thermo - */ -cudaError_t end_cell_thermo(double4* d_cell_vel, - double3* d_cell_energy, - const unsigned int* d_cells, - const unsigned int Ncell, - const unsigned int n_dimensions, - const bool need_energy, - const unsigned int block_size) - { - if (Ncell == 0) - return cudaSuccess; - - if (need_energy) - { - unsigned int max_block_size_energy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::end_cell_thermo); - max_block_size_energy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_energy); - dim3 grid(Ncell / run_block_size + 1); - mpcd::gpu::kernel::end_cell_thermo - <<>>(d_cell_vel, d_cell_energy, d_cells, Ncell, n_dimensions); - } - else - { - unsigned int max_block_size_noenergy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::end_cell_thermo); - max_block_size_noenergy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_noenergy); - dim3 grid(Ncell / run_block_size + 1); - mpcd::gpu::kernel::end_cell_thermo - <<>>(d_cell_vel, d_cell_energy, d_cells, Ncell, n_dimensions); - } - - return cudaSuccess; - } - -//! Templated launcher for multiple threads-per-cell kernel for inner cells -/* - * \param args Common arguments to thermo kernels - * \param ci Cell indexer - * \param inner_ci Cell indexer for the inner cells - * \param offset Offset of \a inner_ci from \a ci - * \param n_dimensions System dimensionality - * \param block_size Number of threads per block - * \param tpp Number of threads per cell - * - * \tparam cur_tpp Number of threads-per-cell for this template instantiation - * - * Launchers are recursively instantiated at compile-time in order to match the - * correct number of threads at runtime. If the templated number of threads matches - * the runtime number of threads, then the kernel is launched. Otherwise, the - * next template (with threads reduced by a factor of 2) is launched. This - * recursion is broken by a specialized template for 0 threads, which does no - * work. 
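Each of these drivers also clamps the requested block size to what the compiled kernel can actually support. In isolation the pattern looks like the following sketch (placeholder kernel and names):

#include <algorithm>
#include <cuda_runtime.h>

__global__ void some_kernel(float* d_out, const unsigned int n)
    {
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = 0.f;
    }

// Query the kernel's attributes and never launch wider than its
// maxThreadsPerBlock, regardless of the block size the autotuner requested.
unsigned int clamp_block_size(const unsigned int requested)
    {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, (const void*)some_kernel);
    return std::min(requested, static_cast<unsigned int>(attr.maxThreadsPerBlock));
    }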
- */ -template -inline void launch_inner_cell_thermo(const mpcd::detail::thermo_args_t& args, - const Index3D& ci, - const Index3D& inner_ci, - const uint3& offset, - const unsigned int n_dimensions, - const unsigned int block_size, - const unsigned int tpp) - { - if (cur_tpp == tpp) - { - if (args.need_energy) - { - unsigned int max_block_size_energy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, - (const void*)mpcd::gpu::kernel::inner_cell_thermo); - max_block_size_energy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_energy); - dim3 grid(cur_tpp * ci.getNumElements() / run_block_size + 1); - mpcd::gpu::kernel::inner_cell_thermo - <<>>(args.cell_vel, - args.cell_energy, - ci, - inner_ci, - offset, - args.cell_np, - args.cell_list, - args.cli, - args.vel, - args.N_mpcd, - args.mass, - args.embed_vel, - args.embed_idx, - n_dimensions); - } - else - { - unsigned int max_block_size_noenergy; - cudaFuncAttributes attr; - cudaFuncGetAttributes( - &attr, - (const void*)mpcd::gpu::kernel::inner_cell_thermo); - max_block_size_noenergy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_noenergy); - dim3 grid(cur_tpp * ci.getNumElements() / run_block_size + 1); - mpcd::gpu::kernel::inner_cell_thermo - <<>>(args.cell_vel, - args.cell_energy, - ci, - inner_ci, - offset, - args.cell_np, - args.cell_list, - args.cli, - args.vel, - args.N_mpcd, - args.mass, - args.embed_vel, - args.embed_idx, - n_dimensions); - } - } - else - { - launch_inner_cell_thermo(args, - ci, - inner_ci, - offset, - n_dimensions, - block_size, - tpp); - } - } -//! Template specialization to break recursion -template<> -inline void launch_inner_cell_thermo<0>(const mpcd::detail::thermo_args_t& args, - const Index3D& ci, - const Index3D& inner_ci, - const uint3& offset, - const unsigned int n_dimensions, - const unsigned int block_size, - const unsigned int tpp) - { - } - -/*! - * \param args Common arguments for cell thermo compute - * \param ci Cell indexer - * \param inner_ci Cell indexer for the inner cells - * \param offset Offset of \a inner_ci from \a ci - * \param n_dimensions System dimensionality - * \param block_size Number of threads per block - * \param tpp Number of threads per cell - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::launch_inner_cell_thermo - * \sa mpcd::gpu::kernel::inner_cell_thermo - */ -cudaError_t inner_cell_thermo(const mpcd::detail::thermo_args_t& args, - const Index3D& ci, - const Index3D& inner_ci, - const uint3& offset, - const unsigned int n_dimensions, - const unsigned int block_size, - const unsigned int tpp) - { - if (inner_ci.getNumElements() == 0) - return cudaSuccess; - - launch_inner_cell_thermo<32>(args, ci, inner_ci, offset, n_dimensions, block_size, tpp); - - return cudaSuccess; - } - -/*! 
- * \param d_tmp_thermo Temporary cell packed thermo element - * \param d_cell_vel Cell velocity to reduce - * \param d_cell_energy Cell energy to reduce - * \param tmp_ci Temporary cell indexer for cells undergoing reduction - * \param ci Cell indexer Regular cell list indexer - * \param need_energy If true, compute the cell-level energy properties - * \param block_size Number of threads per block - * - * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::stage_net_cell_thermo - */ -cudaError_t stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, - const double4* d_cell_vel, - const double3* d_cell_energy, - const Index3D& tmp_ci, - const Index3D& ci, - bool need_energy, - const unsigned int block_size) - { - if (need_energy) - { - unsigned int max_block_size_energy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::stage_net_cell_thermo); - max_block_size_energy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_energy); - dim3 grid(tmp_ci.getNumElements() / run_block_size + 1); - mpcd::gpu::kernel::stage_net_cell_thermo - <<>>(d_tmp_thermo, d_cell_vel, d_cell_energy, tmp_ci, ci); - } - else - { - unsigned int max_block_size_noenergy; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::stage_net_cell_thermo); - max_block_size_noenergy = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size_noenergy); - dim3 grid(tmp_ci.getNumElements() / run_block_size + 1); - mpcd::gpu::kernel::stage_net_cell_thermo - <<>>(d_tmp_thermo, d_cell_vel, d_cell_energy, tmp_ci, ci); - } - return cudaSuccess; - } - -/*! - * \param d_reduced Cell thermo properties reduced across all cells (output on second call) - * \param d_tmp Temporary storage for reduction (output on first call) - * \param tmp_bytes Number of bytes allocated for temporary storage (output on first call) - * \param d_tmp_thermo Cell thermo properties to reduce - * \param Ncell The number of cells to reduce across - * - * \returns cudaSuccess on completion - * - * \b Implementation details: - * CUB DeviceReduce is used to perform the reduction. Hence, this function requires - * two calls to perform the reduction. The first call sizes the temporary storage, - * which is returned in \a d_tmp and \a tmp_bytes. The caller must then allocate - * the required bytes, and call the function a second time. This performs the - * reduction and returns the result in \a d_reduced. - */ -cudaError_t reduce_net_cell_thermo(mpcd::detail::cell_thermo_element* d_reduced, - void* d_tmp, - size_t& tmp_bytes, - const mpcd::detail::cell_thermo_element* d_tmp_thermo, - const size_t Ncell) - { - cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_tmp_thermo, d_reduced, (unsigned int)Ncell); - return cudaSuccess; - } - -//! Explicit template instantiation of pack for cell velocity -template cudaError_t __attribute__((visibility("default"))) -pack_cell_buffer(typename mpcd::detail::CellVelocityPackOp::element* d_send_buf, - const double4* d_props, - const unsigned int* d_send_idx, - const mpcd::detail::CellVelocityPackOp op, - const unsigned int num_send, - unsigned int block_size); - -//! 
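The two-call protocol documented above is the standard CUB sizing idiom. A self-contained sketch with an illustrative POD element type, relying only on a user-defined operator+ as the cell_thermo_element in CellThermoComputeGPU.cuh does:

#include <cuda_runtime.h>
#include <cub/cub.cuh>

struct accum_element
    {
    double3 momentum;
    double energy;

    __host__ __device__ accum_element operator+(const accum_element& other) const
        {
        accum_element sum;
        sum.momentum = make_double3(momentum.x + other.momentum.x,
                                    momentum.y + other.momentum.y,
                                    momentum.z + other.momentum.z);
        sum.energy = energy + other.energy;
        return sum;
        }
    };

void reduce_elements(accum_element* d_out, const accum_element* d_in, int n)
    {
    // first call: query the required temporary storage size
    void* d_tmp = nullptr;
    size_t tmp_bytes = 0;
    cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_in, d_out, n);

    // allocate, then the second call performs the reduction
    cudaMalloc(&d_tmp, tmp_bytes);
    cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_in, d_out, n);
    cudaFree(d_tmp);
    }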
Explicit template instantiation of pack for cell energy -template cudaError_t __attribute__((visibility("default"))) -pack_cell_buffer(typename mpcd::detail::CellEnergyPackOp::element* d_send_buf, - const double3* d_props, - const unsigned int* d_send_idx, - const mpcd::detail::CellEnergyPackOp op, - const unsigned int num_send, - unsigned int block_size); - -//! Explicit template instantiation of unpack for cell velocity -template cudaError_t __attribute__((visibility("default"))) -unpack_cell_buffer(double4* d_props, - const unsigned int* d_cells, - const unsigned int* d_recv, - const unsigned int* d_recv_begin, - const unsigned int* d_recv_end, - const typename mpcd::detail::CellVelocityPackOp::element* d_recv_buf, - const mpcd::detail::CellVelocityPackOp op, - const unsigned int num_cells, - const unsigned int block_size); - -//! Explicit template instantiation of unpack for cell energy -template cudaError_t __attribute__((visibility("default"))) -unpack_cell_buffer(double3* d_props, - const unsigned int* d_cells, - const unsigned int* d_recv, - const unsigned int* d_recv_begin, - const unsigned int* d_recv_end, - const typename mpcd::detail::CellEnergyPackOp::element* d_recv_buf, - const mpcd::detail::CellEnergyPackOp op, - const unsigned int num_cells, - const unsigned int block_size); - - } // end namespace gpu - } // end namespace mpcd - } // end namespace hoomd diff --git a/hoomd/mpcd/CellThermoComputeGPU.cuh b/hoomd/mpcd/CellThermoComputeGPU.cuh deleted file mode 100644 index b040e8ebbb..0000000000 --- a/hoomd/mpcd/CellThermoComputeGPU.cuh +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! - * \file mpcd/CellThermoComputeGPU.cuh - * \brief Declaration of CUDA kernels for mpcd::CellThermoComputeGPU - */ - -#ifndef MPCD_CELL_THERMO_COMPUTE_GPU_CUH_ -#define MPCD_CELL_THERMO_COMPUTE_GPU_CUH_ - -#include "hoomd/BoxDim.h" -#include "hoomd/HOOMDMath.h" -#include "hoomd/Index1D.h" - -#include - -namespace hoomd - { -namespace mpcd - { -namespace detail - { -#ifdef __HIPCC__ -#define HOSTDEVICE __host__ __device__ -#else -#define HOSTDEVICE -#endif - -//! Custom cell thermo element for reductions on the gpu -struct cell_thermo_element - { - double3 momentum; //!< Momentum of the cell - double energy; //!< Energy of the cell - double temperature; //!< Temperature of the cell (0 if < 2 particles) - unsigned int flag; //!< Flag to be used to compute filled cells - - //! Addition operator for summed reduction - HOSTDEVICE cell_thermo_element operator+(const cell_thermo_element& other) const - { - cell_thermo_element sum; - sum.momentum.x = momentum.x + other.momentum.x; - sum.momentum.y = momentum.y + other.momentum.y; - sum.momentum.z = momentum.z + other.momentum.z; - sum.energy = energy + other.energy; - sum.temperature = temperature + other.temperature; - sum.flag = flag + other.flag; - - return sum; - } - }; - -//! 
Convenience struct for common parameters passed to GPU kernels -struct thermo_args_t - { - thermo_args_t(double4* cell_vel_, - double3* cell_energy_, - const unsigned int* cell_np_, - const unsigned int* cell_list_, - const Index2D& cli_, - const Scalar4* vel_, - const unsigned int N_mpcd_, - const Scalar mass_, - const Scalar4* embed_vel_, - const unsigned int* embed_idx_, - bool need_energy_) - : cell_vel(cell_vel_), cell_energy(cell_energy_), cell_np(cell_np_), cell_list(cell_list_), - cli(cli_), vel(vel_), N_mpcd(N_mpcd_), mass(mass_), embed_vel(embed_vel_), - embed_idx(embed_idx_), need_energy(need_energy_) - { - } - - double4* cell_vel; //!< Cell velocities (output) - double3* cell_energy; //!< Cell energies (output) - - const unsigned int* cell_np; //!< Number of particles per cell - const unsigned int* cell_list; //!< MPCD cell list - const Index2D cli; //!< MPCD cell list indexer - const Scalar4* vel; //!< MPCD particle velocities - const unsigned int N_mpcd; //!< Number of MPCD particles - const Scalar mass; //!< MPCD particle mass - const Scalar4* embed_vel; //!< Embedded particle velocities - const unsigned int* embed_idx; //!< Embedded particle indexes - const bool need_energy; //!< Flag if energy calculations are required - }; -#undef HOSTDEVICE - } // namespace detail - -namespace gpu - { -//! Kernel driver to begin cell thermo compute of outer cells -cudaError_t begin_cell_thermo(const mpcd::detail::thermo_args_t& args, - const unsigned int* d_cells, - const unsigned int num_cells, - const unsigned int block_size, - const unsigned int tpp); - -//! Kernel driver to finalize cell thermo compute of outer cells -cudaError_t end_cell_thermo(double4* d_cell_vel, - double3* d_cell_energy, - const unsigned int* d_cells, - const unsigned int Ncell, - const unsigned int n_dimensions, - const bool need_energy, - const unsigned int block_size); - -//! Kernel driver to perform cell thermo compute for inner cells -cudaError_t inner_cell_thermo(const mpcd::detail::thermo_args_t& args, - const Index3D& ci, - const Index3D& inner_ci, - const uint3& offset, - const unsigned int n_dimensions, - const unsigned int block_size, - const unsigned int tpp); - -//! Kernel driver to stage cell properties for net thermo reduction -cudaError_t stage_net_cell_thermo(mpcd::detail::cell_thermo_element* d_tmp_thermo, - const double4* d_cell_vel, - const double3* d_cell_energy, - const Index3D& tmp_ci, - const Index3D& ci, - const bool need_energy, - const unsigned int block_size); - -//! Wrapper to cub device reduce for cell thermo properties -cudaError_t reduce_net_cell_thermo(mpcd::detail::cell_thermo_element* d_reduced, - void* d_tmp, - size_t& tmp_bytes, - const mpcd::detail::cell_thermo_element* d_tmp_thermo, - const size_t Ncell); - - } // end namespace gpu - } // end namespace mpcd - } // end namespace hoomd -#endif // MPCD_CELL_THERMO_COMPUTE_GPU_CUH_ diff --git a/hoomd/mpcd/CellThermoComputeGPU.h b/hoomd/mpcd/CellThermoComputeGPU.h deleted file mode 100644 index f5c0d652c1..0000000000 --- a/hoomd/mpcd/CellThermoComputeGPU.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2009-2025 The Regents of the University of Michigan. -// Part of HOOMD-blue, released under the BSD 3-Clause License. - -/*! 
- * \file mpcd/CellThermoComputeGPU.h - * \brief Declaration of mpcd::CellThermoComputeGPU - */ - -#ifndef MPCD_CELL_THERMO_COMPUTE_GPU_H_ -#define MPCD_CELL_THERMO_COMPUTE_GPU_H_ - -#ifdef __HIPCC__ -#error This header cannot be compiled by nvcc -#endif - -#include "CellThermoCompute.h" -#include "CellThermoComputeGPU.cuh" - -#include "hoomd/Autotuner.h" - -// pybind11 -#include - -namespace hoomd - { -namespace mpcd - { -//! Computes the cell (thermodynamic) properties on the GPU -class PYBIND11_EXPORT CellThermoComputeGPU : public mpcd::CellThermoCompute - { - public: - //! Constructor - CellThermoComputeGPU(std::shared_ptr sysdef, - std::shared_ptr cl); - - //! Destructor - virtual ~CellThermoComputeGPU(); - - protected: -#ifdef ENABLE_MPI - //! Begin the calculation of outer cell properties on the GPU - void beginOuterCellProperties() override; - - //! Finish the calculation of outer cell properties on the GPU - void finishOuterCellProperties() override; -#endif // ENABLE_MPI - - //! Calculate the inner cell properties on the GPU - void calcInnerCellProperties() override; - - //! Compute the net properties from the cell properties - void computeNetProperties() override; - - private: - std::shared_ptr> m_begin_tuner; //!< Tuner for cell begin kernel - std::shared_ptr> m_end_tuner; //!< Tuner for cell end kernel - std::shared_ptr> m_inner_tuner; //!< Tuner for inner cell compute kernel - std::shared_ptr> m_stage_tuner; //!< Tuner for staging net property compute - - GPUVector - m_tmp_thermo; //!< Temporary array for holding cell data - GPUFlags m_reduced; //!< Flags to hold reduced sum - }; - } // end namespace mpcd - } // end namespace hoomd - -#endif // MPCD_CELL_THERMO_COMPUTE_GPU_H_ diff --git a/hoomd/mpcd/CellThermoTypes.h b/hoomd/mpcd/CellThermoTypes.h index 7bc05e590d..a900b0c57d 100644 --- a/hoomd/mpcd/CellThermoTypes.h +++ b/hoomd/mpcd/CellThermoTypes.h @@ -3,7 +3,7 @@ /*! 
* \file mpcd/CellThermoTypes.h - * \brief Defines enums for mpcd::CellThermoCompute + * \brief Defines enums for mpcd::CellList */ #ifndef MPCD_CELL_THERMO_TYPES_H_ diff --git a/hoomd/mpcd/CollisionMethod.cc b/hoomd/mpcd/CollisionMethod.cc index b7d59b66f7..503ca9bfb5 100644 --- a/hoomd/mpcd/CollisionMethod.cc +++ b/hoomd/mpcd/CollisionMethod.cc @@ -99,12 +99,6 @@ void mpcd::CollisionMethod::collide(uint64_t timestep) } #endif // ENABLE_HIP - // set random grid shift - m_cl->drawGridShift(timestep); - - // update cell list - m_cl->compute(timestep); - // setup rigid bodies before collision happens const bool rigid_body_collision = m_embed_group && m_rigid_bodies; if (rigid_body_collision) @@ -143,6 +137,8 @@ void mpcd::CollisionMethod::collide(uint64_t timestep) } } + // update cell list + m_cl->compute(timestep); // apply collisions rule(timestep); diff --git a/hoomd/mpcd/Integrator.cc b/hoomd/mpcd/Integrator.cc index 7ed9e842f8..a463e67ec4 100644 --- a/hoomd/mpcd/Integrator.cc +++ b/hoomd/mpcd/Integrator.cc @@ -87,13 +87,14 @@ void mpcd::Integrator::update(uint64_t timestep) } } + // perform the core MPCD steps of collision and streaming + if (m_collide) + m_collide->collide(timestep); + // optionally sort for performance if (m_sorter && (*m_sorter->getTrigger())(timestep)) m_sorter->update(timestep); - // perform the core MPCD steps of collision and streaming - if (m_collide) - m_collide->collide(timestep); if (m_stream) m_stream->stream(timestep); diff --git a/hoomd/mpcd/ParticleData.h b/hoomd/mpcd/ParticleData.h index f1a0c3298b..6453098dfa 100644 --- a/hoomd/mpcd/ParticleData.h +++ b/hoomd/mpcd/ParticleData.h @@ -57,15 +57,6 @@ namespace mpcd * are based on around the velocity and cell. For details of what the cell means, * refer to the mpcd::CellList. * - * \todo Because the local cell index changes with position, a signal will be put - * in place to indicate when the cached cell index is still valid. - * - * \todo Likewise, MPCD benefits from sorting data into cell order, so a signal - * needs to be put in place when the ordering changes. - * - * \todo Likewise, a signal should be incorporated to indicate when particles are - * added or removed locally, as is the case during particle migration. - * * \ingroup data_structs */ class PYBIND11_EXPORT ParticleData : public Autotuned @@ -272,9 +263,7 @@ class PYBIND11_EXPORT ParticleData : public Autotuned } //! Signature for particle sort signal - typedef Nano::Signal< - void(uint64_t timestep, const GPUArray&, const GPUArray&)> - SortSignal; + typedef Nano::Signal SortSignal; //! Get the sort signal /*! @@ -286,23 +275,6 @@ class PYBIND11_EXPORT ParticleData : public Autotuned return m_sort_signal; } - //! Notify subscribers of a particle sort - /*! - * \param timestep Timestep that the sorting occurred - * \param order Mapping of sorted particle indexes onto old particle indexes - * \param rorder Mapping of old particle indexes onto sorted particle indexes - * - * This method notifies the subscribers of the sort occurring at \a timestep. - * Subscribers may choose to use \a order and \a rorder to reorder their - * per-particle data immediately, or delay the sort until their next call. - */ - void notifySort(uint64_t timestep, - const GPUArray& order, - const GPUArray& rorder) - { - m_sort_signal.emit(timestep, order, rorder); - } - //! Notify subscribers of a particle sort /*! 
* \param timestep Timestep that the sorting occurred @@ -312,8 +284,7 @@ class PYBIND11_EXPORT ParticleData : public Autotuned */ void notifySort(uint64_t timestep) { - GPUArray order, rorder; - m_sort_signal.emit(timestep, order, rorder); + m_sort_signal.emit(timestep); } //@} diff --git a/hoomd/mpcd/SRDCollisionMethod.cc b/hoomd/mpcd/SRDCollisionMethod.cc index c69b8ed16e..f9c92eec43 100644 --- a/hoomd/mpcd/SRDCollisionMethod.cc +++ b/hoomd/mpcd/SRDCollisionMethod.cc @@ -32,22 +32,20 @@ mpcd::SRDCollisionMethod::~SRDCollisionMethod() void mpcd::SRDCollisionMethod::startAutotuning() { mpcd::CollisionMethod::startAutotuning(); - if (m_thermo) - m_thermo->startAutotuning(); + if (m_cl) + m_cl->startAutotuning(); } bool mpcd::SRDCollisionMethod::isAutotuningComplete() { bool result = mpcd::CollisionMethod::isAutotuningComplete(); - if (m_thermo) - result = result && m_thermo->isAutotuningComplete(); + if (m_cl) + result = result && m_cl->isAutotuningComplete(); return result; } void mpcd::SRDCollisionMethod::rule(uint64_t timestep) { - m_thermo->compute(timestep); - // resize the rotation vectors and rescale factors m_rotvec.resize(m_cl->getNCells()); if (m_T) @@ -71,16 +69,20 @@ void mpcd::SRDCollisionMethod::drawRotationVectors(uint64_t timestep) // optional scale factors std::unique_ptr> h_factors; - std::unique_ptr> h_cell_energy; + std::unique_ptr> h_cell_np; + std::unique_ptr> h_cell_temp; Scalar T_set(1.0); const bool use_thermostat = (m_T) ? true : false; if (use_thermostat) { h_factors.reset( new ArrayHandle(m_factors, access_location::host, access_mode::overwrite)); - h_cell_energy.reset(new ArrayHandle(m_thermo->getCellEnergies(), - access_location::host, - access_mode::read)); + h_cell_np.reset(new ArrayHandle(m_cl->getCellSizeArray(), + access_location::host, + access_mode::read)); + h_cell_temp.reset(new ArrayHandle(m_cl->getCellTemperature(), + access_location::host, + access_mode::read)); T_set = (*m_T)(timestep); } @@ -110,8 +112,8 @@ void mpcd::SRDCollisionMethod::drawRotationVectors(uint64_t timestep) if (use_thermostat) { - const double3 cell_energy = h_cell_energy->data[idx]; - const unsigned int np = __double_as_int(cell_energy.z); + const double cell_temp = h_cell_temp->data[idx]; + const unsigned int np = h_cell_np->data[idx]; double factor = 1.0; if (np > 1) { @@ -125,7 +127,7 @@ void mpcd::SRDCollisionMethod::drawRotationVectors(uint64_t timestep) // generate the scale factor from the current temperature // (don't use the kinetic energy of this cell, since this // is total not relative to COM) - const double cur_ke = alpha * cell_energy.y; + const double cur_ke = alpha * cell_temp; factor = (cur_ke > 0.) ? 
fast::sqrt(rand_ke / cur_ke) : 1.; } h_factors->data[idx] = factor; @@ -162,7 +164,7 @@ void mpcd::SRDCollisionMethod::rotate(uint64_t timestep) } // acquire cell velocities - ArrayHandle h_cell_vel(m_thermo->getCellVelocities(), + ArrayHandle h_cell_vel(m_cl->getCellVelocities(), access_location::host, access_mode::read); @@ -264,29 +266,24 @@ void mpcd::SRDCollisionMethod::setCellList(std::shared_ptr cl) detachCallbacks(); if (m_cl) { - m_thermo = std::make_shared(m_sysdef, m_cl); attachCallbacks(); } - else - { - m_thermo = std::shared_ptr(); - } } } void mpcd::SRDCollisionMethod::attachCallbacks() { - assert(m_thermo); - m_thermo->getFlagsSignal() + assert(m_cl); + m_cl->getFlagsSignal() .connect( this); } void mpcd::SRDCollisionMethod::detachCallbacks() { - if (m_thermo) + if (m_cl) { - m_thermo->getFlagsSignal() + m_cl->getFlagsSignal() .disconnect(this); } diff --git a/hoomd/mpcd/SRDCollisionMethod.h b/hoomd/mpcd/SRDCollisionMethod.h index d4732269dd..2d7c09b84e 100644 --- a/hoomd/mpcd/SRDCollisionMethod.h +++ b/hoomd/mpcd/SRDCollisionMethod.h @@ -13,7 +13,6 @@ #error This header cannot be compiled by nvcc #endif -#include "CellThermoCompute.h" #include "CollisionMethod.h" #include "hoomd/Variant.h" @@ -81,9 +80,8 @@ class PYBIND11_EXPORT SRDCollisionMethod : public mpcd::CollisionMethod } protected: - std::shared_ptr m_thermo; //!< Cell thermo - GPUVector m_rotvec; //!< MPCD rotation vectors - Scalar m_angle; //!< MPCD rotation angle (degrees) + GPUVector m_rotvec; //!< MPCD rotation vectors + Scalar m_angle; //!< MPCD rotation angle (degrees) GPUVector m_factors; //!< Cell-level rescale factors diff --git a/hoomd/mpcd/SRDCollisionMethodGPU.cc b/hoomd/mpcd/SRDCollisionMethodGPU.cc index bb13dd00bb..b124879487 100644 --- a/hoomd/mpcd/SRDCollisionMethodGPU.cc +++ b/hoomd/mpcd/SRDCollisionMethodGPU.cc @@ -7,7 +7,6 @@ */ #include "SRDCollisionMethodGPU.h" -#include "CellThermoComputeGPU.h" #include "SRDCollisionMethodGPU.cuh" namespace hoomd @@ -37,14 +36,18 @@ void mpcd::SRDCollisionMethodGPU::drawRotationVectors(uint64_t timestep) if (m_T) { ArrayHandle d_factors(m_factors, access_location::device, access_mode::overwrite); - ArrayHandle d_cell_energy(m_thermo->getCellEnergies(), - access_location::device, - access_mode::read); + ArrayHandle d_cell_np(m_cl->getCellSizeArray(), + access_location::device, + access_mode::read); + ArrayHandle d_cell_temp(m_cl->getCellTemperature(), + access_location::device, + access_mode::read); m_tuner_rotvec->begin(); mpcd::gpu::srd_draw_vectors(d_rotvec.data, d_factors.data, - d_cell_energy.data, + d_cell_np.data, + d_cell_temp.data, m_cl->getCellIndexer(), m_cl->getOriginIndex(), m_cl->getGlobalDim(), @@ -62,6 +65,7 @@ void mpcd::SRDCollisionMethodGPU::drawRotationVectors(uint64_t timestep) { m_tuner_rotvec->begin(); mpcd::gpu::srd_draw_vectors(d_rotvec.data, + NULL, NULL, NULL, m_cl->getCellIndexer(), @@ -89,7 +93,7 @@ void mpcd::SRDCollisionMethodGPU::rotate(uint64_t timestep) unsigned int N_tot = N_mpcd; // acquire cell velocities and rotation vectors - ArrayHandle d_cell_vel(m_thermo->getCellVelocities(), + ArrayHandle d_cell_vel(m_cl->getCellVelocities(), access_location::device, access_mode::read); ArrayHandle d_rotvec(m_rotvec, access_location::device, access_mode::read); @@ -162,13 +166,8 @@ void mpcd::SRDCollisionMethodGPU::setCellList(std::shared_ptr cl detachCallbacks(); if (m_cl) { - m_thermo = std::make_shared(m_sysdef, m_cl); attachCallbacks(); } - else - { - m_thermo = std::shared_ptr(); - } } } diff --git 
a/hoomd/mpcd/SRDCollisionMethodGPU.cu b/hoomd/mpcd/SRDCollisionMethodGPU.cu index d33a7a56c5..c8185114f4 100644 --- a/hoomd/mpcd/SRDCollisionMethodGPU.cu +++ b/hoomd/mpcd/SRDCollisionMethodGPU.cu @@ -21,7 +21,8 @@ namespace kernel template __global__ void srd_draw_vectors(double3* d_rotvec, double* d_factors, - const double3* d_cell_energy, + const unsigned int* d_cell_np, + const double* d_cell_temp, const Index3D ci, const int3 origin, const uint3 global_dim, @@ -73,8 +74,8 @@ __global__ void srd_draw_vectors(double3* d_rotvec, if (use_thermostat) { - const double3 cell_energy = d_cell_energy[idx]; - const unsigned int np = __double_as_int(cell_energy.z); + const double cell_temp = d_cell_temp[idx]; + const unsigned int np = d_cell_np[idx]; double factor = 1.0; if (np > 1) { @@ -88,7 +89,7 @@ __global__ void srd_draw_vectors(double3* d_rotvec, // generate the scale factor from the current temperature // (don't use the kinetic energy of this cell, since this // is total not relative to COM) - const double cur_ke = alpha * cell_energy.y; + const double cur_ke = alpha * cell_temp; factor = (cur_ke > 0.) ? fast::sqrt(rand_ke / cur_ke) : 1.; } d_factors[idx] = factor; @@ -184,7 +185,8 @@ __global__ void srd_rotate(Scalar4* d_vel, cudaError_t srd_draw_vectors(double3* d_rotvec, double* d_factors, - const double3* d_cell_energy, + const unsigned int* d_cell_np, + const double* d_cell_temp, const Index3D& ci, const int3 origin, const uint3 global_dim, @@ -208,7 +210,8 @@ cudaError_t srd_draw_vectors(double3* d_rotvec, dim3 grid(Ncell / run_block_size + 1); mpcd::gpu::kernel::srd_draw_vectors<<>>(d_rotvec, d_factors, - d_cell_energy, + d_cell_np, + d_cell_temp, ci, origin, global_dim, @@ -232,7 +235,8 @@ cudaError_t srd_draw_vectors(double3* d_rotvec, dim3 grid(Ncell / run_block_size + 1); mpcd::gpu::kernel::srd_draw_vectors<<>>(d_rotvec, d_factors, - d_cell_energy, + d_cell_np, + d_cell_temp, ci, origin, global_dim, diff --git a/hoomd/mpcd/SRDCollisionMethodGPU.cuh b/hoomd/mpcd/SRDCollisionMethodGPU.cuh index 41d451ebca..eb22607340 100644 --- a/hoomd/mpcd/SRDCollisionMethodGPU.cuh +++ b/hoomd/mpcd/SRDCollisionMethodGPU.cuh @@ -22,7 +22,8 @@ namespace gpu { cudaError_t srd_draw_vectors(double3* d_rotvec, double* d_factors, - const double3* d_cell_energy, + const unsigned int* d_cell_np, + const double* d_cell_temp, const Index3D& ci, const int3 origin, const uint3 global_dim, diff --git a/hoomd/mpcd/Sorter.cc b/hoomd/mpcd/Sorter.cc index cc178c372a..3c26cf5a04 100644 --- a/hoomd/mpcd/Sorter.cc +++ b/hoomd/mpcd/Sorter.cc @@ -14,8 +14,7 @@ namespace hoomd * \param sysdef System definition */ mpcd::Sorter::Sorter(std::shared_ptr sysdef, std::shared_ptr trigger) - : Tuner(sysdef, trigger), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_order(m_exec_conf), - m_rorder(m_exec_conf) + : Tuner(sysdef, trigger), m_mpcd_pdata(m_sysdef->getMPCDParticleData()), m_order(m_exec_conf) { m_exec_conf->msg->notice(5) << "Constructing MPCD Sorter" << std::endl; } @@ -36,17 +35,18 @@ void mpcd::Sorter::update(uint64_t timestep) { throw std::runtime_error("Cell list has not been set"); } + // ensure that the cell list is built + m_cl->compute(timestep); // resize the sorted order vector to the current number of particles m_order.resize(m_mpcd_pdata->getN()); - m_rorder.resize(m_mpcd_pdata->getN()); // generate and apply the sorted order computeOrder(timestep); applyOrder(); // trigger the sort signal for ParticleData callbacks using the current sortings - m_mpcd_pdata->notifySort(timestep, m_order, m_rorder); + 
m_mpcd_pdata->notifySort(timestep); } /*! @@ -58,37 +58,18 @@ void mpcd::Sorter::update(uint64_t timestep) */ void mpcd::Sorter::computeOrder(uint64_t timestep) { - // compute the cell list at current timestep, guarantees owned particles are on rank - m_cl->compute(timestep); - - ArrayHandle h_cell_list(m_cl->getCellList(), - access_location::host, - access_mode::read); - ArrayHandle h_cell_np(m_cl->getCellSizeArray(), - access_location::host, - access_mode::read); - const Index2D& cli = m_cl->getCellListIndexer(); - - // loop through the cell list to generate the sorting order for MPCD particles ArrayHandle h_order(m_order, access_location::host, access_mode::overwrite); - ArrayHandle h_rorder(m_rorder, access_location::host, access_mode::overwrite); - const unsigned int N_mpcd = m_mpcd_pdata->getN(); - unsigned int cur_p = 0; - for (unsigned int idx = 0; idx < m_cl->getNCells(); ++idx) - { - const unsigned int np = h_cell_np.data[idx]; - for (unsigned int offset = 0; offset < np; ++offset) - { - const unsigned int pid = h_cell_list.data[cli(offset, idx)]; - // only count MPCD particles, and skip embedded particles - if (pid < N_mpcd) - { - h_order.data[cur_p] = pid; - h_rorder.data[pid] = cur_p; - ++cur_p; - } - } - } + ArrayHandle h_vel(m_mpcd_pdata->getVelocities(), + access_location::host, + access_mode::read); + unsigned int mpcd_N = m_mpcd_pdata->getN(); + // sort indices + std::iota(h_order.data, h_order.data + mpcd_N, 0); + + std::sort(h_order.data, + h_order.data + mpcd_N, + [&h_vel](unsigned int i, unsigned int j) + { return __scalar_as_int(h_vel.data[i].w) < __scalar_as_int(h_vel.data[j].w); }); } /*! diff --git a/hoomd/mpcd/Sorter.h b/hoomd/mpcd/Sorter.h index 3a308cca0f..427f5c61c6 100644 --- a/hoomd/mpcd/Sorter.h +++ b/hoomd/mpcd/Sorter.h @@ -64,8 +64,7 @@ class PYBIND11_EXPORT Sorter : public Tuner std::shared_ptr m_mpcd_pdata; //!< MPCD particle data std::shared_ptr m_cl; //!< MPCD cell list - GPUVector m_order; //!< Maps new sorted index onto old particle indexes - GPUVector m_rorder; //!< Maps old particle indexes onto new sorted indexes + GPUVector m_order; //!< Maps new sorted index onto old particle indexes //! Compute the sorting order at the current timestep virtual void computeOrder(uint64_t timestep); diff --git a/hoomd/mpcd/SorterGPU.cc b/hoomd/mpcd/SorterGPU.cc index 897b81919b..cc88796b32 100644 --- a/hoomd/mpcd/SorterGPU.cc +++ b/hoomd/mpcd/SorterGPU.cc @@ -16,18 +16,15 @@ namespace hoomd */ mpcd::SorterGPU::SorterGPU(std::shared_ptr sysdef, std::shared_ptr trigger) - : mpcd::Sorter(sysdef, trigger) + : mpcd::Sorter(sysdef, trigger), m_cell_id(m_exec_conf) { - m_sentinel_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_sort_sentinel")); - m_reverse_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, - m_exec_conf, - "mpcd_sort_reverse")); + m_compute_order_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, + m_exec_conf, + "mpcd_sort_sentinel")); m_apply_tuner.reset(new Autotuner<1>({AutotunerBase::makeBlockSizeRange(m_exec_conf)}, m_exec_conf, "mpcd_sort_apply")); - m_autotuners.insert(m_autotuners.end(), {m_sentinel_tuner, m_reverse_tuner, m_apply_tuner}); + m_autotuners.insert(m_autotuners.end(), {m_compute_order_tuner, m_apply_tuner}); } /*! 
@@ -39,64 +36,38 @@ mpcd::SorterGPU::SorterGPU(std::shared_ptr sysdef, */ void mpcd::SorterGPU::computeOrder(uint64_t timestep) { - // compute the cell list at current timestep, guarantees owned particles are on rank - m_cl->compute(timestep); - - // fill the empty cell list entries with a sentinel larger than number of MPCD particles + // ensure auxiliary array is correct size + const unsigned int mpcd_N = m_mpcd_pdata->getN(); + if (mpcd_N > m_cell_id.getNumElements()) { - ArrayHandle d_cell_list(m_cl->getCellList(), - access_location::device, - access_mode::readwrite); - ArrayHandle d_cell_np(m_cl->getCellSizeArray(), - access_location::device, - access_mode::read); - - m_sentinel_tuner->begin(); - mpcd::gpu::sort_set_sentinel(d_cell_list.data, - d_cell_np.data, - m_cl->getCellListIndexer(), - 0xffffffff, - m_sentinel_tuner->getParam()[0]); - if (m_exec_conf->isCUDAErrorCheckingEnabled()) - CHECK_CUDA_ERROR(); - m_sentinel_tuner->end(); + GPUArray cell_id(mpcd_N, m_exec_conf); + m_cell_id.swap(cell_id); } - // use thrust to select out the indexes of MPCD particles + // compute the cell list at current timestep, guarantees owned particles are on rank + // create an order to sort the particle indices by their cell { - ArrayHandle d_cell_list(m_cl->getCellList(), - access_location::device, - access_mode::read); ArrayHandle d_order(m_order, access_location::device, access_mode::overwrite); - const unsigned int num_select - = mpcd::gpu::sort_cell_compact(d_order.data, - d_cell_list.data, - m_cl->getCellListIndexer().getNumElements(), - m_mpcd_pdata->getN()); + ArrayHandle d_cell_id(m_cell_id, + access_location::device, + access_mode::readwrite); + ArrayHandle d_vel(m_mpcd_pdata->getVelocities(), + access_location::device, + access_mode::read); + m_compute_order_tuner->begin(); + mpcd::gpu::set_order(d_order.data, + d_cell_id.data, + d_vel.data, + mpcd_N, + m_compute_order_tuner->getParam()[0]); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); - if (num_select != m_mpcd_pdata->getN()) - { - m_exec_conf->msg->error() - << "Error compacting cell list for sorting, lost particles." << std::endl; - } - } - - // fill out the reverse ordering map - { - ArrayHandle d_order(m_order, access_location::device, access_mode::read); - ArrayHandle d_rorder(m_rorder, - access_location::device, - access_mode::overwrite); + m_compute_order_tuner->end(); - m_reverse_tuner->begin(); - mpcd::gpu::sort_gen_reverse(d_rorder.data, - d_order.data, - m_mpcd_pdata->getN(), - m_reverse_tuner->getParam()[0]); + // perform sorting + mpcd::gpu::compute_order(d_order.data, d_cell_id.data, mpcd_N); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); - m_reverse_tuner->end(); } } diff --git a/hoomd/mpcd/SorterGPU.cu b/hoomd/mpcd/SorterGPU.cu index 2367372fd4..5212fc7296 100644 --- a/hoomd/mpcd/SorterGPU.cu +++ b/hoomd/mpcd/SorterGPU.cu @@ -11,7 +11,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #include +#include #include +#include #pragma GCC diagnostic pop namespace hoomd @@ -59,65 +61,27 @@ __global__ void sort_apply(Scalar4* d_pos_alt, //! Kernel to set the empty-cell-entry sentinel /*! 
- * \param d_cell_list Cell list to fill in - * \param d_cell_np Number of particles per cell - * \param cli Two-dimensional cell-list indexer - * \param sentinel Value to fill into empty cell-list entries - * \param N_cli Number of total entries (filled and empty) in the cell list - * - * \b Implementation - * Using one thread per cell-list entry, the 1D kernel index is mapped onto - * the 2D entry in the cell list. If the current entry is not filled, the entry - * is filled with the value of \a sentinel. Typically, \a sentinel should be - * larger than the number of MPCD particles in the system. (A good value would be - * 0xffffffff). The cell list can subsequently be compacted by mpcd::gpu::sort_cell_compact. - */ -__global__ void sort_set_sentinel(unsigned int* d_cell_list, - const unsigned int* d_cell_np, - const Index2D cli, - const unsigned int sentinel, - const unsigned int N_cli) - { - // one thread per cell-list entry - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N_cli) - return; - - // convert the entry 1D index into a 2D index - const unsigned int cell = idx / cli.getW(); - const unsigned int offset = idx - (cell * cli.getW()); - - // if the offset lies outside the number of particles in the cell, fill it with sentinel - const unsigned int np = d_cell_np[cell]; - if (offset >= np) - d_cell_list[idx] = sentinel; - } - -//! Kernel to generate the reverse particle mapping -/*! - * \param d_rorder Map of old particle indexes onto new particle indexes (output) - * \param d_order Map of new particle indexes onto old particle indexes + * \param d_order Particle indexes to fill in ascending order (output) + * \param d_cell_id Auxiliary array filled with the cell index of each MPCD particle (output) + * \param d_vel Velocities of the MPCD particles, with the cell index stored in the w component * \param N Number of particles * * \b Implementation - * Using one thread per particle, the map generated by mpcd::gpu::sort_cell_compact - * is read and reversed so that new particle indexes can be looked up from old - * particle indexes. + * Using one thread per particle, d_order is filled with the indexes 0 to N-1, and the + * auxiliary array d_cell_id is filled with the cell index of each MPCD particle, in + * preparation for sorting the values in d_order by cell index. */ __global__ void -sort_gen_reverse(unsigned int* d_rorder, const unsigned int* d_order, const unsigned int N) +set_order(unsigned int* d_order, unsigned int* d_cell_id, const Scalar4* d_vel, unsigned int N) { // one thread per particle const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) return; - // inverse map the ordering - // d_order maps the new index (idx) onto the old index (pid), so we need to flip this around - const unsigned int pid = d_order[idx]; - d_rorder[pid] = idx; + d_order[idx] = idx; + d_cell_id[idx] = __scalar_as_int(d_vel[idx].w); } - } // end namespace kernel /*! @@ -168,123 +132,45 @@ cudaError_t sort_apply(Scalar4* d_pos_alt, } /*!
- * \param d_cell_list Cell list to fill in - * \param d_cell_np Number of particles per cell - * \param cli Two-dimensional cell-list indexer - * \param sentinel Value to fill into empty cell-list entries - * \param N_cli Number of total entries (filled and empty) in the cell list + * \param d_order Particle indexes to fill in ascending order (output) + * \param d_cell_id Auxiliary array filled with the cell index of each MPCD particle (output) + * \param d_vel Velocities of the MPCD particles, with the cell index stored in the w component + * \param N Number of particles * \param block_size Number of threads per block - + * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::sort_set_sentinel + * \sa mpcd::gpu::kernel::set_order */ -cudaError_t sort_set_sentinel(unsigned int* d_cell_list, - const unsigned int* d_cell_np, - const Index2D& cli, - const unsigned int sentinel, - const unsigned int block_size) +cudaError_t set_order(unsigned int* d_order, + unsigned int* d_cell_id, + const Scalar4* d_vel, + const unsigned int N, + const unsigned int block_size) { unsigned int max_block_size; cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::sort_set_sentinel); + cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::set_order); max_block_size = attr.maxThreadsPerBlock; - const unsigned int N_cli = cli.getNumElements(); - unsigned int run_block_size = min(block_size, max_block_size); - dim3 grid(N_cli / run_block_size + 1); - mpcd::gpu::kernel::sort_set_sentinel<<>>(d_cell_list, - d_cell_np, - cli, - sentinel, - N_cli); - + dim3 grid(N / run_block_size + 1); + mpcd::gpu::kernel::set_order<<>>(d_order, d_cell_id, d_vel, N); return cudaSuccess; } -//! Less-than comparison functor for cell-list compaction -struct LessThan - { - //! Constructor - /*! - * \param compare_ Value to compare less than - */ - __host__ __device__ LessThan(unsigned int compare_) : compare(compare_) { } - - //! Less than comparison functor - /*! - * \param val - * \returns True if \a val is less than \a compare - */ - __host__ __device__ bool operator()(const unsigned int& val) const - { - return (val < compare); - } - - unsigned int compare; //!< Value to compare less-than to - }; - /*! - * \param d_order Compacted MPCD particle indexes in cell-list order (output) + * \param d_order MPCD particle indexes, sorted into cell order on output - * \param d_cell_list Cell list array to compact - * \param num_items Number of items in the cell list - * \param N_mpcd Number of MPCD particles - * - * \returns Number of items selected (should be equal to N_mpcd). - * - * \b Implementation - * thrust::copy_if is used to efficiently compact the MPCD cell list to - * a stream containing only MPCD particle indexes. First, a sentinel is set to - * fill in empty entries (mpcd::gpu::sort_set_sentinel). Then, the LessThan functor - * is used to compare all entries in the cell list, and return only those with - * indexes less than \a N_mpcd. - * - * \b Note - * A previous version of this function used CUB to perform the select, but this seemed to - * fail when HOOMD bumped to CUB 1.8.0. Segfaults were encountered when executing DeviceSelect::If. - * I decided to replace it with thrust since this is not so performance critical, and I was unable - * to find the proper root cause of it.
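The set_order driver above follows the usual launch-configuration recipe in this file: clamp the requested block size to the kernel's maxThreadsPerBlock and launch N / run_block_size + 1 blocks, relying on the idx >= N guard inside the kernel to retire the excess threads. The arithmetic in isolation, as a plain C++ sketch with illustrative names (blocks_for is not a HOOMD function):

#include <algorithm>

// Launch-configuration arithmetic used by drivers such as mpcd::gpu::set_order: clamp the
// tuned block size, then round the grid up so that at least N threads are launched.
unsigned int blocks_for(unsigned int N, unsigned int block_size, unsigned int max_block_size)
    {
    const unsigned int run_block_size = std::min(block_size, max_block_size);
    // N / run_block_size + 1 always covers N threads; when N is an exact multiple it adds
    // one extra block whose threads exit immediately at the bounds check.
    return N / run_block_size + 1;
    }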
- */ -unsigned int sort_cell_compact(unsigned int* d_order, - const unsigned int* d_cell_list, - const unsigned int num_items, - const unsigned int N_mpcd) - { - unsigned int* last = thrust::copy_if(thrust::device, - d_cell_list, - d_cell_list + num_items, - d_order, - LessThan(N_mpcd)); - return (unsigned int)(last - d_order); - } - -/*! - * \param d_rorder Map of old particle indexes onto new particle indexes (output) - * \param d_order Map of new particle indexes onto old particle indexes + * \param d_cell_id Cell index of each MPCD particle, used as the sort key * \param N Number of particles - + * \returns cudaSuccess on completion - * - * \sa mpcd::gpu::kernel::sort_gen_reverse */ -cudaError_t sort_gen_reverse(unsigned int* d_rorder, - const unsigned int* d_order, - const unsigned int N, - const unsigned int block_size) +cudaError_t compute_order(unsigned int* d_order, unsigned int* d_cell_id, const unsigned int N) { - if (N == 0) - return cudaSuccess; - - unsigned int max_block_size; - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, (const void*)mpcd::gpu::kernel::sort_gen_reverse); - max_block_size = attr.maxThreadsPerBlock; - - unsigned int run_block_size = min(block_size, max_block_size); - dim3 grid(N / run_block_size + 1); - mpcd::gpu::kernel::sort_gen_reverse<<>>(d_rorder, d_order, N); - + thrust::device_ptr order(d_order); + thrust::device_ptr cell_id(d_cell_id); + thrust::sort_by_key(thrust::device, cell_id, cell_id + N, order); return cudaSuccess; } } // end namespace gpu diff --git a/hoomd/mpcd/SorterGPU.cuh b/hoomd/mpcd/SorterGPU.cuh index 14d6126412..c2c2502340 100644 --- a/hoomd/mpcd/SorterGPU.cuh +++ b/hoomd/mpcd/SorterGPU.cuh @@ -31,24 +31,22 @@ cudaError_t sort_apply(Scalar4* d_pos_alt, const unsigned int N, const unsigned int block_size); -//! Kernel driver to fill empty cell list entries with sentinel -cudaError_t sort_set_sentinel(unsigned int* d_cell_list, - const unsigned int* d_cell_np, - const Index2D& cli, - const unsigned int sentinel, - const unsigned int block_size); +//! Kernel driver to fill the particle order and cell index arrays +cudaError_t set_order(unsigned int* d_order, + unsigned int* d_cell_id, + const Scalar4* d_vel, + const unsigned int N_mpcd, + const unsigned int block_size); + +//! Driver for thrust to sort the particle order by cell index +cudaError_t +compute_order(unsigned int* d_order, unsigned int* d_cell_id, const unsigned int N_mpcd); -//! Driver for thrust to perform cell-list stream compaction -unsigned int sort_cell_compact(unsigned int* d_order, - const unsigned int* d_cell_list, - const unsigned int num_items, - const unsigned int N_mpcd); -//! Kernel driver to reverse map the particle ordering -cudaError_t sort_gen_reverse(unsigned int* d_rorder, - const unsigned int* d_order, - const unsigned int N, - const unsigned int block_size); } // end namespace gpu } // end namespace mpcd } // end namespace hoomd diff --git a/hoomd/mpcd/SorterGPU.h b/hoomd/mpcd/SorterGPU.h index 566db8aeb7..e57e5106c8 100644 --- a/hoomd/mpcd/SorterGPU.h +++ b/hoomd/mpcd/SorterGPU.h @@ -32,11 +32,10 @@ class PYBIND11_EXPORT SorterGPU : public mpcd::Sorter SorterGPU(std::shared_ptr sysdef, std::shared_ptr trigger); protected: - /// Kernel tuner for filling sentinels in cell list. - std::shared_ptr> m_sentinel_tuner; + GPUArray m_cell_id; //!< Auxiliary array of cell indexes used for sorting - /// Kernel tuner for setting reverse map. - std::shared_ptr> m_reverse_tuner; + /// Kernel tuner for filling the particle order and cell index arrays. + std::shared_ptr> m_compute_order_tuner; //!< Kernel tuner for applying sorted order.
std::shared_ptr> m_apply_tuner; diff --git a/hoomd/mpcd/module.cc b/hoomd/mpcd/module.cc index 15225ca61d..1f6efac3ed 100644 --- a/hoomd/mpcd/module.cc +++ b/hoomd/mpcd/module.cc @@ -20,10 +20,8 @@ namespace mpcd */ namespace detail { -void export_ATCollisionMethod(pybind11::module&); void export_BlockForce(pybind11::module&); void export_CellList(pybind11::module&); -void export_CellThermoCompute(pybind11::module&); void export_CollisionMethod(pybind11::module&); #ifdef ENABLE_MPI void export_Communicator(pybind11::module&); @@ -51,9 +49,7 @@ void export_SRDCollisionMethod(pybind11::module&); void export_StreamingMethod(pybind11::module&); void export_VirtualParticleFiller(pybind11::module&); #ifdef ENABLE_HIP -void export_ATCollisionMethodGPU(pybind11::module&); void export_CellListGPU(pybind11::module&); -void export_CellThermoComputeGPU(pybind11::module&); #ifdef ENABLE_MPI void export_CommunicatorGPU(pybind11::module&); #endif // ENABLE_MPI @@ -199,10 +195,8 @@ PYBIND11_MODULE(_mpcd, m) export_VirtualParticleFiller(m); export_StreamingMethod(m); - export_ATCollisionMethod(m); export_BlockForce(m); export_CellList(m); - export_CellThermoCompute(m); #ifdef ENABLE_MPI export_Communicator(m); #endif // ENABLE_MPI @@ -227,9 +221,7 @@ PYBIND11_MODULE(_mpcd, m) export_SineForce(m); export_SRDCollisionMethod(m); #ifdef ENABLE_HIP - export_ATCollisionMethodGPU(m); export_CellListGPU(m); - export_CellThermoComputeGPU(m); #ifdef ENABLE_MPI export_CommunicatorGPU(m); #endif // ENABLE_MPI diff --git a/hoomd/mpcd/pytest/test_collide.py b/hoomd/mpcd/pytest/test_collide.py index 390bf829e0..554e31f9ea 100644 --- a/hoomd/mpcd/pytest/test_collide.py +++ b/hoomd/mpcd/pytest/test_collide.py @@ -31,15 +31,27 @@ def test_cell_list(small_snap, simulation_factory): assert cl.num_cells == (20, 30, 40) +# @pytest.mark.parametrize( +# "cls, init_args", +# [ +# ( +# hoomd.mpcd.collide.AndersenThermostat, +# { +# "kT": 1.0, +# }, +# ), +# ( +# hoomd.mpcd.collide.StochasticRotationDynamics, +# { +# "angle": 90, +# }, +# ), +# ], +# ids=["AndersenThermostat", "StochasticRotationDynamics"], +# ) @pytest.mark.parametrize( "cls, init_args", [ - ( - hoomd.mpcd.collide.AndersenThermostat, - { - "kT": 1.0, - }, - ), ( hoomd.mpcd.collide.StochasticRotationDynamics, { @@ -47,7 +59,7 @@ def test_cell_list(small_snap, simulation_factory): }, ), ], - ids=["AndersenThermostat", "StochasticRotationDynamics"], + ids=["StochasticRotationDynamics"], ) class TestCollisionMethod: def test_create(self, small_snap, simulation_factory, cls, init_args): diff --git a/hoomd/mpcd/test/CMakeLists.txt b/hoomd/mpcd/test/CMakeLists.txt index 7e8ee23d24..cc4bbac2f9 100644 --- a/hoomd/mpcd/test/CMakeLists.txt +++ b/hoomd/mpcd/test/CMakeLists.txt @@ -1,8 +1,6 @@ # external_field test will not link properly due to separable compilation, disabling. 
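Back in SorterGPU.cu, the new compute_order driver wraps the raw device pointers in thrust::device_ptr and sorts the index array by cell index with thrust::sort_by_key. For readers unfamiliar with key/value sorts, a host-side analogue shows the same contract (the keys end up ascending and the values are permuted along with them); this sketch is illustrative only and is not how thrust implements the device sort:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Host-side analogue of thrust::sort_by_key(cell_id, cell_id + N, order): sort the keys
// and carry the associated values along with them.
void sort_by_key(std::vector<unsigned int>& cell_id, std::vector<unsigned int>& order)
    {
    std::vector<std::pair<unsigned int, unsigned int>> kv(cell_id.size());
    for (std::size_t i = 0; i < kv.size(); ++i)
        {
        kv[i] = std::make_pair(cell_id[i], order[i]);
        }
    std::sort(kv.begin(),
              kv.end(),
              [](const std::pair<unsigned int, unsigned int>& a,
                 const std::pair<unsigned int, unsigned int>& b) { return a.first < b.first; });
    for (std::size_t i = 0; i < kv.size(); ++i)
        {
        cell_id[i] = kv[i].first;
        order[i] = kv[i].second;
        }
    }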
set(TEST_LIST - at_collision_method cell_list - cell_thermo_compute force parallel_plate_geometry_filler planar_pore_geometry_filler @@ -19,9 +17,7 @@ if(ENABLE_MPI) SET(MPI_TEST_LIST ${MPI_TEST_LIST} ${_KEY}) ENDMACRO(ADD_TO_MPI_TESTS) - ADD_TO_MPI_TESTS(cell_communicator 8) ADD_TO_MPI_TESTS(cell_list 8) - ADD_TO_MPI_TESTS(cell_thermo_compute 8) ADD_TO_MPI_TESTS(communicator 8) ADD_TO_MPI_TESTS(parallel_plate_geometry_filler 8) ADD_TO_MPI_TESTS(planar_pore_geometry_filler 8) diff --git a/hoomd/mpcd/test/cell_list_mpi_test.cc b/hoomd/mpcd/test/cell_list_mpi_test.cc index acfc01e764..a54e349246 100644 --- a/hoomd/mpcd/test/cell_list_mpi_test.cc +++ b/hoomd/mpcd/test/cell_list_mpi_test.cc @@ -817,9 +817,6 @@ void celllist_basic_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); Index3D ci = cl->getCellIndexer(); ArrayHandle h_vel(pdata->getVelocities(), access_location::host, @@ -878,9 +875,6 @@ void celllist_basic_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); Index3D ci = cl->getCellIndexer(); ArrayHandle h_vel(pdata->getVelocities(), access_location::host, @@ -938,9 +932,6 @@ void celllist_basic_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); Index3D ci = cl->getCellIndexer(); ArrayHandle h_vel(pdata->getVelocities(), access_location::host, diff --git a/hoomd/mpcd/test/cell_list_test.cc b/hoomd/mpcd/test/cell_list_test.cc index d816b9bc18..09302aff82 100644 --- a/hoomd/mpcd/test/cell_list_test.cc +++ b/hoomd/mpcd/test/cell_list_test.cc @@ -50,17 +50,6 @@ void celllist_dimension_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(cell_indexer.getNumElements(), 6 * 8 * 10); UP_ASSERT(cl->getCellSizeArray().getNumElements() >= 6 * 8 * 10); // Each cell has one number - unsigned int Nmax = cl->getNmax(); - CHECK_EQUAL_UINT(Nmax, - 4); // Default is 4 particles per cell, ensure this happens if there's only one - - Index2D cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(cli.getNumElements(), - 6 * 8 * 10 * Nmax); // Cell list indexer has Nmax entries per cell - - // Cell list uses amortized sizing, so must only be at least this big - UP_ASSERT(cl->getCellList().getNumElements() >= 6 * 8 * 10 * Nmax); - /*******************/ // Change the cell size, and ensure everything stays up to date cl->setGlobalDim(make_uint3(3, 4, 5)); @@ -75,13 +64,6 @@ void celllist_dimension_test(std::shared_ptr exec_conf, cell_indexer = cl->getCellIndexer(); CHECK_EQUAL_UINT(cell_indexer.getNumElements(), 3 * 4 * 5); UP_ASSERT(cl->getCellSizeArray().getNumElements() >= 3 * 4 * 5); // Each cell has one number - - cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(cli.getNumElements(), - 3 * 4 * 5 * Nmax); // Cell list indexer has Nmax entries per cell - - // Cell list uses amortized sizing, so must only be at least this big - UP_ASSERT(cl->getCellList().getNumElements() >= 3 * 4 * 5 * Nmax); } //! 
Test for correct cell listing of a small system @@ -121,9 +103,6 @@ void celllist_small_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); // validate that each cell has the right number Index3D ci = cl->getCellIndexer(); @@ -136,19 +115,6 @@ void celllist_small_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 0)], 1); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 1)], 1); - // check the particle ids in each cell - Index2D cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 0))], 0); - CHECK_EQUAL_UINT(h_cell_list.data[cli(1, ci(0, 0, 0))], 8); - - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 1))], 4); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 0))], 2); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 1))], 6); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 0, 0))], 1); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 0, 1))], 5); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 1, 0))], 3); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 1, 1))], 7); - ArrayHandle h_vel(pdata_9->getVelocities(), access_location::host, access_mode::read); @@ -185,9 +151,6 @@ void celllist_small_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); // validate that each cell has the right number Index3D ci = cl->getCellIndexer(); @@ -200,29 +163,6 @@ void celllist_small_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 0, 0)], 0); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 0, 1)], 0); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 0)], 0); - - // check the particle ids in each cell - Index2D cli = cl->getCellListIndexer(); - { - std::vector pids(5, 0); - for (unsigned int i = 0; i < 5; ++i) - { - pids[i] = h_cell_list.data[cli(i, ci(0, 0, 0))]; - } - sort(pids.begin(), pids.end()); - unsigned int check_pids[] = {0, 2, 4, 6, 8}; - UP_ASSERT_EQUAL(pids, check_pids); - } - { - std::vector pids(4, 0); - for (unsigned int i = 0; i < 4; ++i) - { - pids[i] = h_cell_list.data[cli(i, ci(1, 1, 1))]; - } - sort(pids.begin(), pids.end()); - unsigned int check_pids[] = {1, 3, 5, 7}; - UP_ASSERT_EQUAL(pids, check_pids); - } } // bring all particles into one box, which triggers a resize, and check that all particles @@ -425,9 +365,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); // validate that each cell has the right number Index3D ci = cl->getCellIndexer(); @@ -440,17 +377,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 0)], 1); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 1)], 1); - // check the particle ids in each cell - Index2D cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 0))], 0); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 1))], 4); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 0))], 2); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 1))], 6); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 0, 0))], 1); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 0, 1))], 5); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 1, 0))], 3); - 
CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 1, 1))], 7); - ArrayHandle h_vel(pdata_8->getVelocities(), access_location::host, access_mode::read); @@ -477,9 +403,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); // validate that each cell has the right number Index3D ci = cl->getCellIndexer(); @@ -492,45 +415,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 0)], 2); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 1)], 2); - // check the particle ids in each cell - Index2D cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 0))], 0); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 1))], 4); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 0))], 2); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 1))], 6); - // check two particles in cell (1,0,0) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 0, 0))]; - result[1] = h_cell_list.data[cli(1, ci(1, 0, 0))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {1, 8}); - } - // check two particles in cell (1,0,1) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 0, 1))]; - result[1] = h_cell_list.data[cli(1, ci(1, 0, 1))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {5, 10}); - } - // check two particles in cell (1,1,0) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 1, 0))]; - result[1] = h_cell_list.data[cli(1, ci(1, 1, 0))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {3, 9}); - } - // check two particles in cell (1,1,1) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 1, 1))]; - result[1] = h_cell_list.data[cli(1, ci(1, 1, 1))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {7, 11}); - } - ArrayHandle h_vel(pdata_8->getVelocities(), access_location::host, access_mode::read); @@ -579,9 +463,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, ArrayHandle h_cell_np(cl->getCellSizeArray(), access_location::host, access_mode::read); - ArrayHandle h_cell_list(cl->getCellList(), - access_location::host, - access_mode::read); // validate that each cell has the right number Index3D ci = cl->getCellIndexer(); @@ -594,39 +475,6 @@ void celllist_embed_test(std::shared_ptr exec_conf, CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 0)], 3); CHECK_EQUAL_UINT(h_cell_np.data[ci(1, 1, 1)], 2); - // check the particle ids in each cell - Index2D cli = cl->getCellListIndexer(); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 0))], 0); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 0, 1))], 4); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 0))], 2); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(1, 0, 0))], 1); - CHECK_EQUAL_UINT(h_cell_list.data[cli(0, ci(0, 1, 1))], 6); - // check two particles in cell (1,0,1) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 0, 1))]; - result[1] = h_cell_list.data[cli(1, ci(1, 0, 1))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {5, 10}); - } - // check two particles in cell (1,1,0) - { - std::vector result(3); - result[0] = h_cell_list.data[cli(0, ci(1, 1, 0))]; - result[1] = h_cell_list.data[cli(1, ci(1, 1, 0))]; - result[2] = h_cell_list.data[cli(2, ci(1, 1, 0))]; - 
sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {3, 8, 9}); - } - // check two particles in cell (1,1,1) - { - std::vector result(2); - result[0] = h_cell_list.data[cli(0, ci(1, 1, 1))]; - result[1] = h_cell_list.data[cli(1, ci(1, 1, 1))]; - sort(result.begin(), result.end()); - UP_ASSERT_EQUAL(result, std::vector {7, 11}); - } - ArrayHandle h_vel(pdata_8->getVelocities(), access_location::host, access_mode::read); @@ -662,6 +510,347 @@ void celllist_embed_test(std::shared_ptr exec_conf, } } +//! Test for correct calculation of cell thermo properties for MPCD particles +template +void celllist_thermo_basic_test(std::shared_ptr exec_conf, + const Scalar3& L, + const Scalar3& tilt) + { + auto ref_box = std::make_shared(2.0); + auto box = std::make_shared(L); + box->setTiltFactors(tilt.x, tilt.y, tilt.z); + + std::shared_ptr> snap(new SnapshotSystemData()); + snap->global_box = box; + snap->particle_data.type_mapping.push_back("A"); + // place each particle in a different cell, doubling the first cell + snap->mpcd_data.resize(5); + snap->mpcd_data.type_mapping.push_back("A"); + snap->mpcd_data.position[0] = vec3(-0.5, -0.5, -0.5); + snap->mpcd_data.position[1] = vec3(-0.5, -0.5, -0.5); + snap->mpcd_data.position[2] = vec3(0.5, 0.5, 0.5); + snap->mpcd_data.position[3] = vec3(0.5, 0.5, 0.5); + snap->mpcd_data.position[4] = vec3(-0.5, 0.5, 0.5); + + snap->mpcd_data.velocity[0] = vec3(2.0, 0.0, 0.0); + snap->mpcd_data.velocity[1] = vec3(1.0, 0.0, 0.0); + snap->mpcd_data.velocity[2] = vec3(0.0, -3.0, 0.0); + snap->mpcd_data.velocity[3] = vec3(0.0, 0.0, -5.0); + snap->mpcd_data.velocity[4] = vec3(1.0, -1.0, 4.0); + std::shared_ptr sysdef(new SystemDefinition(snap, exec_conf)); + + std::shared_ptr pdata_5 = sysdef->getMPCDParticleData(); + + std::shared_ptr cl(new CL(sysdef, make_uint3(2, 2, 2), false)); + AllThermoRequest thermo_req(cl); + cl->compute(0); + { + const Index3D ci = cl->getCellIndexer(); + ArrayHandle h_cell_np(cl->getCellSizeArray(), + access_location::host, + access_mode::read); + ArrayHandle h_avg_vel(cl->getCellVelocities(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_energy(cl->getCellEnergies(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_temp(cl->getCellTemperature(), + access_location::host, + access_mode::read); + // Two particle cell (0,0,0) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].x, 1.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].w, 2.0, tol); + // energy, temperature (relative to COM), np + CHECK_CLOSE(h_cell_energy.data[ci(0, 0, 0)], 2.5, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 0, 0)], 2.0 * 0.5 * 0.5 / 3.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 0, 0)], 2); + + // Two particle cell (1,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].x, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].y, -1.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].z, -2.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].w, 2.0, tol); + // energy, temperature (relative to COM), np + CHECK_CLOSE(h_cell_energy.data[ci(1, 1, 1)], 17.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(1, 1, 1)], 2.0 * (1.5 * 1.5 + 2.5 * 2.5) / 3.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(1, 1, 1)], 2); + + // One particle cell (0,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].x, 1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].y, -1.0, 
tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].z, 4.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].w, 1.0, tol); + // Has kinetic energy, but temperature should be zero + CHECK_CLOSE(h_cell_energy.data[ci(0, 1, 1)], 9.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 1, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 1, 1)], 1); + } + + // Check the net stats of the system + CHECK_CLOSE(cl->getNetMomentum().x, 4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().y, -4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().z, -1.0, tol); + CHECK_CLOSE(cl->getNetEnergy(), 28.5, tol); + CHECK_CLOSE(cl->getTemperature(), + 0.5 * (2.0 * 0.5 * 0.5 / 3.0 + 2.0 * (1.5 * 1.5 + 2.5 * 2.5) / 3.0), + tol); + + // increase the mass and make sure that energies depend on mass, but velocities don't + pdata_5->setMass(4.0); + cl->compute(1); + { + const Index3D ci = cl->getCellIndexer(); + ArrayHandle h_cell_np(cl->getCellSizeArray(), + access_location::host, + access_mode::read); + ArrayHandle h_avg_vel(cl->getCellVelocities(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_energy(cl->getCellEnergies(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_temp(cl->getCellTemperature(), + access_location::host, + access_mode::read); + + // Two particle cell (0,0,0) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].x, 1.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].w, 8.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(0, 0, 0)], 4.0 * 2.5, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 0, 0)], 4.0 * 2.0 * 0.5 * 0.5 / 3.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 0, 0)], 2); + + // Two particle cell (1,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].x, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].y, -1.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].z, -2.5, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].w, 8.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(1, 1, 1)], 4.0 * 17.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(1, 1, 1)], 4.0 * 2.0 * (1.5 * 1.5 + 2.5 * 2.5) / 3.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(1, 1, 1)], 2); + + // One particle cell (0,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].x, 1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].y, -1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].z, 4.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].w, 4.0, tol); + // Has kinetic energy, but temperature should be zero + CHECK_CLOSE(h_cell_energy.data[ci(0, 1, 1)], 4.0 * 9.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 1, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 1, 1)], 1); + } + + // Check the net stats of the system + CHECK_CLOSE(cl->getNetMomentum().x, 4.0 * 4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().y, 4.0 * -4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().z, 4.0 * -1.0, tol); + CHECK_CLOSE(cl->getNetEnergy(), 4.0 * 28.5, tol); + CHECK_CLOSE(cl->getTemperature(), + 4.0 * 0.5 * (2.0 * 0.5 * 0.5 / 3.0 + 2.0 * (1.5 * 1.5 + 2.5 * 2.5) / 3.0), + tol); + + // switch a particle into a different cell, and make sure the DOF are reduced accordingly + pdata_5->setMass(1.0); + { + ArrayHandle h_pos(pdata_5->getPositions(), + access_location::host, + access_mode::readwrite); + h_pos.data[2] = make_scalar4(-0.5, -0.5, -0.5, 0.0); + } + cl->compute(2); + { + const 
Index3D ci = cl->getCellIndexer(); + ArrayHandle h_cell_np(cl->getCellSizeArray(), + access_location::host, + access_mode::read); + ArrayHandle h_avg_vel(cl->getCellVelocities(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_energy(cl->getCellEnergies(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_temp(cl->getCellTemperature(), + access_location::host, + access_mode::read); + + // Three particle cell (0,0,0) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].x, 1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].y, -1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].w, 3.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(0, 0, 0)], 7.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 0, 0)], + (2 * 1.0 * 1.0 + 2 * 1.0 * 1.0 + 2.0 * 2.0) / 6.0, + tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 0, 0)], 3); + + // One particle cell (1,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].x, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].z, -5.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].w, 1.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(1, 1, 1)], 12.5, tol); + CHECK_CLOSE(h_cell_temp.data[ci(1, 1, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(1, 1, 1)], 1); + + // One particle cell (0,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].x, 1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].y, -1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].z, 4.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 1, 1)].w, 1.0, tol); + // Has kinetic energy, but temperature should be zero + CHECK_CLOSE(h_cell_energy.data[ci(0, 1, 1)], 9.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 1, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 1, 1)], 1); + } + + // Check the net stats of the system, only average temperature should change now + CHECK_CLOSE(cl->getNetMomentum().x, 4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().y, -4.0, tol); + CHECK_CLOSE(cl->getNetMomentum().z, -1.0, tol); + CHECK_CLOSE(cl->getNetEnergy(), 28.5, tol); + CHECK_CLOSE(cl->getTemperature(), (2 * 1.0 * 1.0 + 2 * 1.0 * 1.0 + 2.0 * 2.0) / 6.0, tol); + } + +//! 
Test for correct calculation of cell thermo properties with embedded particles +template +void celllist_thermo_embed_test(std::shared_ptr exec_conf, + const Scalar3& L, + const Scalar3& tilt) + { + auto ref_box = std::make_shared(2.0); + auto box = std::make_shared(L); + box->setTiltFactors(tilt.x, tilt.y, tilt.z); + + std::shared_ptr> snap(new SnapshotSystemData()); + snap->global_box = box; + { + SnapshotParticleData& pdata_snap = snap->particle_data; + pdata_snap.type_mapping.push_back("A"); + pdata_snap.resize(4); + + pdata_snap.pos[0] = vec3(-0.5, -0.5, -0.5); + pdata_snap.pos[1] = vec3(-0.5, -0.5, 0.5); + pdata_snap.pos[2] = vec3(0.5, -0.5, 0.5); + pdata_snap.pos[3] = vec3(0.5, 0.5, 0.5); + + pdata_snap.mass[0] = 3.0; + pdata_snap.mass[1] = 2.0; + pdata_snap.mass[2] = 4.0; + pdata_snap.mass[3] = 5.0; + + pdata_snap.vel[0] = vec3(-2.0, 0.0, 0.0); + pdata_snap.vel[1] = vec3(1.0, 0.0, 0.0); + pdata_snap.vel[2] = vec3(0.0, -3.0, 0.0); + pdata_snap.vel[3] = vec3(0.0, 0.0, -5.0); + } + snap->mpcd_data.resize(4); + snap->mpcd_data.type_mapping.push_back("A"); + snap->mpcd_data.position[0] = vec3(-0.5, -0.5, -0.5); + snap->mpcd_data.position[1] = vec3(-0.5, -0.5, 0.5); + snap->mpcd_data.position[2] = vec3(0.5, -0.5, 0.5); + snap->mpcd_data.position[3] = vec3(0.5, 0.5, 0.5); + + snap->mpcd_data.velocity[0] = vec3(2.0, 0.0, 0.0); + snap->mpcd_data.velocity[1] = vec3(1.0, 0.0, 0.0); + snap->mpcd_data.velocity[2] = vec3(0.0, -3.0, 0.0); + snap->mpcd_data.velocity[3] = vec3(0.0, 0.0, -5.0); + std::shared_ptr sysdef(new SystemDefinition(snap, exec_conf)); + + std::shared_ptr embed_pdata = sysdef->getParticleData(); + std::shared_ptr selector(new ParticleFilterAll()); + std::shared_ptr group(new ParticleGroup(sysdef, selector)); + + auto cl = std::make_shared(sysdef, 1.0, false); + cl->setEmbeddedGroup(group); + AllThermoRequest thermo_req(cl); + cl->compute(0); + { + const Index3D ci = cl->getCellIndexer(); + ArrayHandle h_cell_np(cl->getCellSizeArray(), + access_location::host, + access_mode::read); + ArrayHandle h_avg_vel(cl->getCellVelocities(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_energy(cl->getCellEnergies(), + access_location::host, + access_mode::read); + ArrayHandle h_cell_temp(cl->getCellTemperature(), + access_location::host, + access_mode::read); + + // Cell (0,0,0) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].x, -1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 0)].w, 4.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(0, 0, 0)], 8.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 0, 0)], 4.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 0, 0)], 2); + + // Cell (0,0,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 1)].x, 1.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 1)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 1)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(0, 0, 1)].w, 3.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(0, 0, 1)], 1.5, tol); + CHECK_CLOSE(h_cell_temp.data[ci(0, 0, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(0, 0, 1)], 2); + + // Cell (1,0,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(1, 0, 1)].x, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 0, 1)].y, -3.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 0, 1)].z, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 
0, 1)].w, 5.0, tol); + // Has kinetic energy, but temperature should be zero + CHECK_CLOSE(h_cell_energy.data[ci(1, 0, 1)], 22.5, tol); + CHECK_CLOSE(h_cell_temp.data[ci(1, 0, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(1, 0, 1)], 2); + + // Cell (1,1,1) + // Average velocity, mass + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].x, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].y, 0.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].z, -5.0, tol); + CHECK_CLOSE(h_avg_vel.data[ci(1, 1, 1)].w, 6.0, tol); + // energy, temperature (relative to COM), np, flag + CHECK_CLOSE(h_cell_energy.data[ci(1, 1, 1)], 75.0, tol); + CHECK_CLOSE(h_cell_temp.data[ci(1, 1, 1)], 0.0, tol); + UP_ASSERT_EQUAL(h_cell_np.data[ci(1, 1, 1)], 2); + } + + // Check the net stats of the system + CHECK_CLOSE(cl->getNetMomentum().x, -1.0, tol); + CHECK_CLOSE(cl->getNetMomentum().y, -15.0, tol); + CHECK_CLOSE(cl->getNetMomentum().z, -30.0, tol); + CHECK_CLOSE(cl->getNetEnergy(), 107.0, tol); + CHECK_CLOSE(cl->getTemperature(), (4.0 + 0.0 + 0.0 + 0.0) / 4., tol); + } + //! dimension test case for MPCD CellList class UP_TEST(mpcd_cell_list_dimensions) { @@ -770,6 +959,22 @@ UP_TEST(mpcd_cell_list_embed_test_triclinic) make_scalar3(0.5, -0.75, 1.0)); } +//! test case for cell list velocity and energy +UP_TEST(mpcd_cell_list_thermo_basic) + { + celllist_thermo_basic_test( + std::make_shared(ExecutionConfiguration::CPU), + make_scalar3(2.0, 2.0, 2.0), + make_scalar3(0, 0, 0)); + } +UP_TEST(mpcd_cell_list_thermo_embed) + { + celllist_thermo_embed_test( + std::make_shared(ExecutionConfiguration::CPU), + make_scalar3(2.0, 2.0, 2.0), + make_scalar3(0, 0, 0)); + } + #ifdef ENABLE_HIP //! dimension test case for MPCD CellListGPU class UP_TEST(mpcd_cell_list_gpu_dimensions) @@ -878,4 +1083,20 @@ UP_TEST(mpcd_cell_list_gpu_embed_test_triclinic) make_scalar3(2.0, 2.0, 2.0), make_scalar3(0.5, -0.75, 1.0)); } + +//! 
test case for cell list velocity and energy +UP_TEST(mpcd_cell_list_thermo_basic_gpu) + { + celllist_thermo_basic_test( + std::make_shared(ExecutionConfiguration::GPU), + make_scalar3(2.0, 2.0, 2.0), + make_scalar3(0, 0, 0)); + } +UP_TEST(mpcd_cell_list_thermo_embed_gpu) + { + celllist_thermo_embed_test( + std::make_shared(ExecutionConfiguration::GPU), + make_scalar3(2.0, 2.0, 2.0), + make_scalar3(0, 0, 0)); + } #endif // ENABLE_HIP diff --git a/hoomd/mpcd/test/sorter_test.cc b/hoomd/mpcd/test/sorter_test.cc index b055f31966..abd045afdd 100644 --- a/hoomd/mpcd/test/sorter_test.cc +++ b/hoomd/mpcd/test/sorter_test.cc @@ -168,40 +168,6 @@ template void sorter_test(std::shared_ptr exec_ UP_ASSERT_EQUAL(__scalar_as_int(h_vel.data[6].w), 6); UP_ASSERT_EQUAL(__scalar_as_int(h_vel.data[7].w), 7); } - - // check that the cell list has been updated as well - { - ArrayHandle h_cl(cl->getCellList(), access_location::host, access_mode::read); - ArrayHandle h_np(cl->getCellSizeArray(), - access_location::host, - access_mode::read); - const Index3D& ci = cl->getCellIndexer(); - const Index2D& cli = cl->getCellListIndexer(); - - // all cells should have one particle, except the first cell, which has the embedded one - UP_ASSERT_EQUAL(h_np.data[ci(0, 0, 0)], 2); - UP_ASSERT_EQUAL(h_np.data[ci(1, 0, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 1, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 1, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 0, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 0, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 1, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 1, 1)], 1); - - // the particles should be in ascending order - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 0, 0))], 1); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 1, 0))], 2); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 1, 0))], 3); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 0, 1))], 4); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 0, 1))], 5); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 1, 1))], 6); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 1, 1))], 7); - // do first cell separately, since it needs to be a sorted list - std::vector cell_0 - = {h_cl.data[cli(0, ci(0, 0, 0))], h_cl.data[cli(1, ci(0, 0, 0))]}; - std::sort(cell_0.begin(), cell_0.end()); - UP_ASSERT_EQUAL(cell_0, std::vector {0, 8}); - } } //! 
Test for MPCD sorting with virtual particles @@ -370,36 +336,6 @@ template void sorter_virtual_test(std::shared_ptr h_cl(cl->getCellList(), access_location::host, access_mode::read); - ArrayHandle h_np(cl->getCellSizeArray(), - access_location::host, - access_mode::read); - const Index3D& ci = cl->getCellIndexer(); - const Index2D& cli = cl->getCellListIndexer(); - - // all cells should have one particle - UP_ASSERT_EQUAL(h_np.data[ci(0, 0, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 0, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 1, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 1, 0)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 0, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 0, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(0, 1, 1)], 1); - UP_ASSERT_EQUAL(h_np.data[ci(1, 1, 1)], 1); - - // the particles should be in ascending order, with VPs interleaved unsorted - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 0, 0))], 0); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 0, 0))], 6); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 1, 0))], 1); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 1, 0))], 7); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 0, 1))], 2); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 0, 1))], 3); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(0, 1, 1))], 4); - UP_ASSERT_EQUAL(h_cl.data[cli(0, ci(1, 1, 1))], 5); - } } //! basic test case for MPCD sorter diff --git a/hoomd/mpcd/test/srd_collision_method_test.cc b/hoomd/mpcd/test/srd_collision_method_test.cc index 943b81b27d..713c040098 100644 --- a/hoomd/mpcd/test/srd_collision_method_test.cc +++ b/hoomd/mpcd/test/srd_collision_method_test.cc @@ -56,10 +56,7 @@ void srd_collision_method_basic_test(std::shared_ptr exe auto cl = std::make_shared(sysdef, 1.0, false); std::shared_ptr collide = std::make_shared(sysdef, 0, 2, 1, 130.); collide->setCellList(cl); - - // create a thermo, and use it to check the current - auto thermo = std::make_shared(sysdef, cl); - AllThermoRequest thermo_req(thermo); + AllThermoRequest thermo_req(cl); UP_ASSERT(!collide->peekCollide(0)); collide->collide(0); @@ -77,16 +74,16 @@ void srd_collision_method_basic_test(std::shared_ptr exe { // check net properties of cells, which should match our inputs - thermo->compute(0); - const Scalar3 mom = thermo->getNetMomentum(); + cl->compute(0); + const Scalar3 mom = cl->getNetMomentum(); CHECK_CLOSE(mom.x, orig_mom.x, tol_small); CHECK_CLOSE(mom.y, orig_mom.y, tol_small); CHECK_CLOSE(mom.z, orig_mom.z, tol_small); - const Scalar energy = thermo->getNetEnergy(); + const Scalar energy = cl->getNetEnergy(); CHECK_CLOSE(energy, orig_energy, tol_small); - const Scalar temp = thermo->getTemperature(); + const Scalar temp = cl->getTemperature(); CHECK_CLOSE(temp, orig_temp, tol_small); } @@ -151,16 +148,16 @@ void srd_collision_method_basic_test(std::shared_ptr exe } // recompute net properties, and make sure they are still the same - thermo->compute(2); - const Scalar3 mom = thermo->getNetMomentum(); + cl->compute(2); + const Scalar3 mom = cl->getNetMomentum(); CHECK_CLOSE(mom.x, orig_mom.x, tol_small); CHECK_CLOSE(mom.y, orig_mom.y, tol_small); CHECK_CLOSE(mom.z, orig_mom.z, tol_small); - const Scalar energy = thermo->getNetEnergy(); + const Scalar energy = cl->getNetEnergy(); CHECK_CLOSE(energy, orig_energy, tol_small); - const Scalar temp = thermo->getTemperature(); + const Scalar temp = cl->getTemperature(); CHECK_CLOSE(temp, orig_temp, tol_small); } @@ -284,10 +281,7 @@ void srd_collision_method_embed_test(std::shared_ptr exe std::shared_ptr collide = std::make_shared(sysdef, 0, 1, -1, 130.); collide->setCellList(cl); 
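For reference, the expected values asserted in the new celllist_thermo_basic_test and celllist_thermo_embed_test above are consistent with per-cell definitions of a mass-weighted mean velocity, a total kinetic energy, and a temperature formed from the kinetic energy relative to the cell's center-of-mass motion with 3(N-1) degrees of freedom. Worked out in LaTeX for cell (0,0,0) in each test.

Basic test, two unit-mass particles with velocities (2,0,0) and (1,0,0):
\[
\mathbf{v}_{\mathrm{cm}} = \tfrac{1}{2}\big[(2,0,0)+(1,0,0)\big] = (1.5,0,0), \qquad
E = \tfrac{1}{2}(2^2) + \tfrac{1}{2}(1^2) = 2.5,
\]
\[
K_{\mathrm{rel}} = \tfrac{1}{2}(0.5)^2 + \tfrac{1}{2}(0.5)^2 = 0.25, \qquad
T = \frac{2K_{\mathrm{rel}}}{3(N-1)} = \frac{0.5}{3} \approx 0.167,
\]
matching the CHECK_CLOSE targets 1.5, mass 2.0, energy 2.5, and temperature 2.0 * 0.5 * 0.5 / 3.0.

Embed test, an MPCD particle of mass 1 with velocity (2,0,0) plus an embedded particle of mass 3 with velocity (-2,0,0):
\[
\mathbf{v}_{\mathrm{cm}} = \frac{(2,0,0) + 3\,(-2,0,0)}{4} = (-1,0,0), \qquad
E = \tfrac{1}{2}(2^2) + \tfrac{3}{2}(2^2) = 8,
\]
\[
K_{\mathrm{rel}} = \tfrac{1}{2}(3^2) + \tfrac{3}{2}(1^2) = 6, \qquad
T = \frac{2K_{\mathrm{rel}}}{3(N-1)} = 4,
\]
matching the CHECK_CLOSE targets -1.0, mass 4.0, energy 8.0, and temperature 4.0.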
- - // create a thermo, and use it to check the current - auto thermo = std::make_shared(sysdef, cl); - AllThermoRequest thermo_req(thermo); + AllThermoRequest thermo_req(cl); // embed the particle group into the mpcd system std::shared_ptr selector_one(new ParticleFilterAll()); @@ -295,10 +289,10 @@ void srd_collision_method_embed_test(std::shared_ptr exe collide->setEmbeddedGroup(group_all); // Save original momentum for comparison as well - thermo->compute(0); - const Scalar orig_energy = thermo->getNetEnergy(); - const Scalar orig_temp = thermo->getTemperature(); - const Scalar3 orig_mom = thermo->getNetMomentum(); + cl->compute(0); + const Scalar orig_energy = cl->getNetEnergy(); + const Scalar orig_temp = cl->getTemperature(); + const Scalar3 orig_mom = cl->getNetMomentum(); collide->collide(0); { // velocity should be different now, but the mass should stay the same @@ -312,10 +306,10 @@ void srd_collision_method_embed_test(std::shared_ptr exe } // compute properties after rotation - thermo->compute(1); - Scalar energy = thermo->getNetEnergy(); - Scalar temp = thermo->getTemperature(); - Scalar3 mom = thermo->getNetMomentum(); + cl->compute(1); + Scalar energy = cl->getNetEnergy(); + Scalar temp = cl->getTemperature(); + Scalar3 mom = cl->getNetMomentum(); // energy (temperature) and momentum should be conserved after a collision CHECK_CLOSE(orig_energy, energy, tol_small); @@ -337,8 +331,7 @@ void srd_collision_method_thermostat_test(std::shared_ptr(sysdef, 1.0, false); std::shared_ptr collide = std::make_shared(sysdef, 0, 1, -1, 827); collide->setCellList(cl); - auto thermo = std::make_shared(sysdef, cl); - AllThermoRequest thermo_req(thermo); + AllThermoRequest thermo_req(cl); // timestep counter and number of samples to make uint64_t timestep = 0; @@ -351,8 +344,8 @@ void srd_collision_method_thermostat_test(std::shared_ptrcompute(timestep); - mean += thermo->getTemperature(); + cl->compute(timestep); + mean += cl->getTemperature(); collide->collide(timestep++); } mean /= N; @@ -366,8 +359,8 @@ void srd_collision_method_thermostat_test(std::shared_ptrcompute(timestep); - mean += thermo->getTemperature(); + cl->compute(timestep); + mean += cl->getTemperature(); collide->collide(timestep++); } mean /= N; diff --git a/hoomd/mpcd/test/utils.h b/hoomd/mpcd/test/utils.h index 0ceda735a3..a19b8c0ec8 100644 --- a/hoomd/mpcd/test/utils.h +++ b/hoomd/mpcd/test/utils.h @@ -5,7 +5,6 @@ #define MPCD_TEST_UTILS_H_ #include "hoomd/mpcd/CellList.h" -#include "hoomd/mpcd/CellThermoCompute.h" namespace hoomd { @@ -15,15 +14,14 @@ class AllThermoRequest public: //! Constructor /*! - * \param thermo Thermo compute to supply flags to. + * \param cl CellList to supply flags to. * - * \post This object is connected to \a thermo. + * \post This object is connected to \a cl. */ - AllThermoRequest(std::shared_ptr thermo) : m_thermo(thermo) + AllThermoRequest(std::shared_ptr cl) : m_cl(cl) { - if (m_thermo) - m_thermo->getFlagsSignal().connect( - this); + if (m_cl) + m_cl->getFlagsSignal().connect(this); } //! Destructor @@ -32,8 +30,8 @@ class AllThermoRequest */ ~AllThermoRequest() { - if (m_thermo) - m_thermo->getFlagsSignal().disconnect( + if (m_cl) + m_cl->getFlagsSignal().disconnect( this); } @@ -48,7 +46,7 @@ class AllThermoRequest } private: - std::shared_ptr m_thermo; + std::shared_ptr m_cl; }; Scalar3 scale(const Scalar3& ref_pos, std::shared_ptr ref_box, std::shared_ptr box)
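Across the updated SRD tests and utils.h, the net thermo quantities that were previously queried from mpcd::CellThermoCompute are now requested from and read off the cell list itself. A condensed usage sketch based on the calls exercised above (an illustrative fragment, not a complete test; includes and the SystemDefinition setup are omitted):

// Condensed from the calls exercised in the updated tests; sysdef is assumed to exist.
void check_net_thermo(std::shared_ptr<hoomd::SystemDefinition> sysdef, uint64_t timestep)
    {
    using namespace hoomd;
    auto cl = std::make_shared<mpcd::CellList>(sysdef, 1.0, false);
    AllThermoRequest thermo_req(cl); // connect to the cell list's flags signal (utils.h above)

    cl->compute(timestep); // fills the per-cell and net thermo data
    const Scalar3 momentum = cl->getNetMomentum();
    const Scalar energy = cl->getNetEnergy();
    const Scalar temperature = cl->getTemperature();
    // ... compare momentum, energy, and temperature against references with CHECK_CLOSE ...
    }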